From 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sat, 27 Apr 2024 20:24:20 +0200
Subject: Adding upstream version 14.2.21.

Signed-off-by: Daniel Baumann
---
 src/spdk/dpdk/lib/Makefile | 115 + src/spdk/dpdk/lib/librte_acl/Makefile | 67 + src/spdk/dpdk/lib/librte_acl/acl.h | 216 + src/spdk/dpdk/lib/librte_acl/acl_bld.c | 1569 +++++++ src/spdk/dpdk/lib/librte_acl/acl_gen.c | 532 +++ src/spdk/dpdk/lib/librte_acl/acl_run.h | 236 ++ src/spdk/dpdk/lib/librte_acl/acl_run_altivec.c | 47 + src/spdk/dpdk/lib/librte_acl/acl_run_altivec.h | 329 ++ src/spdk/dpdk/lib/librte_acl/acl_run_avx2.c | 25 + src/spdk/dpdk/lib/librte_acl/acl_run_avx2.h | 255 ++ src/spdk/dpdk/lib/librte_acl/acl_run_neon.c | 18 + src/spdk/dpdk/lib/librte_acl/acl_run_neon.h | 261 ++ src/spdk/dpdk/lib/librte_acl/acl_run_scalar.c | 163 + src/spdk/dpdk/lib/librte_acl/acl_run_sse.c | 18 + src/spdk/dpdk/lib/librte_acl/acl_run_sse.h | 328 ++ src/spdk/dpdk/lib/librte_acl/acl_vect.h | 87 + src/spdk/dpdk/lib/librte_acl/meson.build | 31 + src/spdk/dpdk/lib/librte_acl/rte_acl.c | 375 ++ src/spdk/dpdk/lib/librte_acl/rte_acl.h | 358 ++ src/spdk/dpdk/lib/librte_acl/rte_acl_osdep.h | 49 + src/spdk/dpdk/lib/librte_acl/rte_acl_version.map | 19 + src/spdk/dpdk/lib/librte_acl/tb_mem.c | 79 + src/spdk/dpdk/lib/librte_acl/tb_mem.h | 47 + src/spdk/dpdk/lib/librte_bbdev/Makefile | 29 + src/spdk/dpdk/lib/librte_bbdev/meson.build | 9 + src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.c | 1133 +++++ src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.h | 700 ++++ src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_op.h | 598 +++ src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h | 198 + .../dpdk/lib/librte_bbdev/rte_bbdev_version.map | 37 + src/spdk/dpdk/lib/librte_bitratestats/Makefile | 22 + src/spdk/dpdk/lib/librte_bitratestats/meson.build | 7 + .../dpdk/lib/librte_bitratestats/rte_bitrate.c | 130 + .../dpdk/lib/librte_bitratestats/rte_bitrate.h | 65 + .../rte_bitratestats_version.map | 9 + src/spdk/dpdk/lib/librte_bpf/Makefile | 41 + src/spdk/dpdk/lib/librte_bpf/bpf.c | 61 + src/spdk/dpdk/lib/librte_bpf/bpf_def.h | 143 + src/spdk/dpdk/lib/librte_bpf/bpf_exec.c | 453 ++ src/spdk/dpdk/lib/librte_bpf/bpf_impl.h | 55 + src/spdk/dpdk/lib/librte_bpf/bpf_jit_x86.c | 1356 ++++++ src/spdk/dpdk/lib/librte_bpf/bpf_load.c | 148 + src/spdk/dpdk/lib/librte_bpf/bpf_load_elf.c | 322 ++ src/spdk/dpdk/lib/librte_bpf/bpf_pkt.c | 605 +++ src/spdk/dpdk/lib/librte_bpf/bpf_validate.c | 2248 ++++++++++ src/spdk/dpdk/lib/librte_bpf/meson.build | 25 + src/spdk/dpdk/lib/librte_bpf/rte_bpf.h | 203 + src/spdk/dpdk/lib/librte_bpf/rte_bpf_ethdev.h | 117 + src/spdk/dpdk/lib/librte_bpf/rte_bpf_version.map | 16 + src/spdk/dpdk/lib/librte_cfgfile/Makefile | 27 + src/spdk/dpdk/lib/librte_cfgfile/meson.build | 6 + src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.c | 555 +++ src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.h | 355 ++ .../lib/librte_cfgfile/rte_cfgfile_version.map | 40 + src/spdk/dpdk/lib/librte_cmdline/Makefile | 37 + src/spdk/dpdk/lib/librte_cmdline/cmdline.c | 254 ++ src/spdk/dpdk/lib/librte_cmdline/cmdline.h | 62 + src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.c | 412 ++ src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.h | 193 + src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.c | 532 +++ src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.h | 191 + .../lib/librte_cmdline/cmdline_parse_etheraddr.c | 125 + .../lib/librte_cmdline/cmdline_parse_etheraddr.h | 42 + .../dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.c | 139 + .../dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.h |
135 + .../dpdk/lib/librte_cmdline/cmdline_parse_num.c | 348 ++ .../dpdk/lib/librte_cmdline/cmdline_parse_num.h | 61 + .../lib/librte_cmdline/cmdline_parse_portlist.c | 119 + .../lib/librte_cmdline/cmdline_parse_portlist.h | 50 + .../dpdk/lib/librte_cmdline/cmdline_parse_string.c | 220 + .../dpdk/lib/librte_cmdline/cmdline_parse_string.h | 73 + src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.c | 644 +++ src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.h | 201 + src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.c | 65 + src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.h | 25 + src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.c | 132 + src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.h | 100 + src/spdk/dpdk/lib/librte_cmdline/meson.build | 27 + .../lib/librte_cmdline/rte_cmdline_version.map | 77 + src/spdk/dpdk/lib/librte_compat/Makefile | 13 + src/spdk/dpdk/lib/librte_compat/meson.build | 8 + src/spdk/dpdk/lib/librte_compat/rte_compat.h | 89 + src/spdk/dpdk/lib/librte_compressdev/Makefile | 31 + src/spdk/dpdk/lib/librte_compressdev/meson.build | 12 + src/spdk/dpdk/lib/librte_compressdev/rte_comp.c | 215 + src/spdk/dpdk/lib/librte_compressdev/rte_comp.h | 485 +++ .../dpdk/lib/librte_compressdev/rte_compressdev.c | 772 ++++ .../dpdk/lib/librte_compressdev/rte_compressdev.h | 540 +++ .../librte_compressdev/rte_compressdev_internal.h | 114 + .../lib/librte_compressdev/rte_compressdev_pmd.c | 160 + .../lib/librte_compressdev/rte_compressdev_pmd.h | 390 ++ .../librte_compressdev/rte_compressdev_version.map | 39 + src/spdk/dpdk/lib/librte_cryptodev/Makefile | 31 + src/spdk/dpdk/lib/librte_cryptodev/meson.build | 11 + src/spdk/dpdk/lib/librte_cryptodev/rte_crypto.h | 445 ++ .../dpdk/lib/librte_cryptodev/rte_crypto_asym.h | 496 +++ .../dpdk/lib/librte_cryptodev/rte_crypto_sym.h | 740 ++++ src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.c | 1618 +++++++ src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.h | 1197 ++++++ .../dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.c | 165 + .../dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.h | 509 +++ .../lib/librte_cryptodev/rte_cryptodev_version.map | 108 + src/spdk/dpdk/lib/librte_distributor/Makefile | 30 + src/spdk/dpdk/lib/librte_distributor/meson.build | 11 + .../dpdk/lib/librte_distributor/rte_distributor.c | 658 +++ .../dpdk/lib/librte_distributor/rte_distributor.h | 241 ++ .../rte_distributor_match_generic.c | 15 + .../librte_distributor/rte_distributor_match_sse.c | 86 + .../librte_distributor/rte_distributor_private.h | 174 + .../lib/librte_distributor/rte_distributor_v1705.h | 61 + .../lib/librte_distributor/rte_distributor_v20.c | 401 ++ .../lib/librte_distributor/rte_distributor_v20.h | 218 + .../librte_distributor/rte_distributor_version.map | 29 + src/spdk/dpdk/lib/librte_eal/Makefile | 12 + .../dpdk/lib/librte_eal/bsdapp/BSDmakefile.meson | 43 + src/spdk/dpdk/lib/librte_eal/bsdapp/Makefile | 8 + src/spdk/dpdk/lib/librte_eal/bsdapp/eal/Makefile | 97 + src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal.c | 941 +++++ .../dpdk/lib/librte_eal/bsdapp/eal/eal_alarm.c | 314 ++ .../lib/librte_eal/bsdapp/eal/eal_alarm_private.h | 19 + .../dpdk/lib/librte_eal/bsdapp/eal/eal_cpuflags.c | 21 + .../dpdk/lib/librte_eal/bsdapp/eal/eal_debug.c | 92 + src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_dev.c | 21 + .../lib/librte_eal/bsdapp/eal/eal_hugepage_info.c | 156 + .../lib/librte_eal/bsdapp/eal/eal_interrupts.c | 561 +++ .../dpdk/lib/librte_eal/bsdapp/eal/eal_lcore.c | 52 + .../dpdk/lib/librte_eal/bsdapp/eal/eal_memalloc.c | 54 + .../dpdk/lib/librte_eal/bsdapp/eal/eal_memory.c | 517 
+++ .../dpdk/lib/librte_eal/bsdapp/eal/eal_thread.c | 177 + .../dpdk/lib/librte_eal/bsdapp/eal/eal_timer.c | 64 + .../dpdk/lib/librte_eal/bsdapp/eal/meson.build | 20 + src/spdk/dpdk/lib/librte_eal/common/Makefile | 35 + .../lib/librte_eal/common/arch/arm/meson.build | 5 + .../lib/librte_eal/common/arch/arm/rte_cpuflags.c | 140 + .../lib/librte_eal/common/arch/arm/rte_cycles.c | 17 + .../librte_eal/common/arch/arm/rte_hypervisor.c | 11 + .../librte_eal/common/arch/ppc_64/rte_cpuflags.c | 137 + .../lib/librte_eal/common/arch/ppc_64/rte_cycles.c | 7 + .../librte_eal/common/arch/ppc_64/rte_hypervisor.c | 11 + .../lib/librte_eal/common/arch/x86/meson.build | 5 + .../lib/librte_eal/common/arch/x86/rte_cpuflags.c | 160 + .../lib/librte_eal/common/arch/x86/rte_cpuid.h | 19 + .../lib/librte_eal/common/arch/x86/rte_cycles.c | 123 + .../librte_eal/common/arch/x86/rte_hypervisor.c | 40 + .../lib/librte_eal/common/arch/x86/rte_memcpy.c | 29 + .../lib/librte_eal/common/arch/x86/rte_spinlock.c | 15 + .../dpdk/lib/librte_eal/common/eal_common_bus.c | 244 ++ .../dpdk/lib/librte_eal/common/eal_common_class.c | 64 + .../lib/librte_eal/common/eal_common_cpuflags.c | 50 + .../dpdk/lib/librte_eal/common/eal_common_dev.c | 566 +++ .../lib/librte_eal/common/eal_common_devargs.c | 414 ++ .../dpdk/lib/librte_eal/common/eal_common_errno.c | 47 + .../lib/librte_eal/common/eal_common_fbarray.c | 1243 ++++++ .../lib/librte_eal/common/eal_common_hexdump.c | 91 + .../lib/librte_eal/common/eal_common_hypervisor.c | 22 + .../dpdk/lib/librte_eal/common/eal_common_launch.c | 90 + .../dpdk/lib/librte_eal/common/eal_common_lcore.c | 134 + .../dpdk/lib/librte_eal/common/eal_common_log.c | 460 ++ .../lib/librte_eal/common/eal_common_memalloc.c | 364 ++ .../dpdk/lib/librte_eal/common/eal_common_memory.c | 584 +++ .../lib/librte_eal/common/eal_common_memzone.c | 408 ++ .../lib/librte_eal/common/eal_common_options.c | 1450 +++++++ .../dpdk/lib/librte_eal/common/eal_common_proc.c | 1181 ++++++ .../lib/librte_eal/common/eal_common_string_fns.c | 40 + .../dpdk/lib/librte_eal/common/eal_common_tailqs.c | 170 + .../dpdk/lib/librte_eal/common/eal_common_thread.c | 232 + .../dpdk/lib/librte_eal/common/eal_common_timer.c | 75 + .../dpdk/lib/librte_eal/common/eal_common_uuid.c | 193 + .../dpdk/lib/librte_eal/common/eal_filesystem.h | 115 + .../dpdk/lib/librte_eal/common/eal_hugepages.h | 40 + .../dpdk/lib/librte_eal/common/eal_internal_cfg.h | 80 + src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h | 82 + src/spdk/dpdk/lib/librte_eal/common/eal_options.h | 81 + src/spdk/dpdk/lib/librte_eal/common/eal_private.h | 307 ++ src/spdk/dpdk/lib/librte_eal/common/eal_thread.h | 71 + .../librte_eal/common/include/arch/arm/meson.build | 29 + .../common/include/arch/arm/rte_atomic.h | 14 + .../common/include/arch/arm/rte_atomic_32.h | 62 + .../common/include/arch/arm/rte_atomic_64.h | 47 + .../common/include/arch/arm/rte_byteorder.h | 81 + .../common/include/arch/arm/rte_cpuflags.h | 14 + .../common/include/arch/arm/rte_cpuflags_32.h | 54 + .../common/include/arch/arm/rte_cpuflags_64.h | 36 + .../common/include/arch/arm/rte_cycles.h | 14 + .../common/include/arch/arm/rte_cycles_32.h | 93 + .../common/include/arch/arm/rte_cycles_64.h | 76 + .../librte_eal/common/include/arch/arm/rte_io.h | 22 + .../librte_eal/common/include/arch/arm/rte_io_64.h | 171 + .../common/include/arch/arm/rte_memcpy.h | 14 + .../common/include/arch/arm/rte_memcpy_32.h | 305 ++ .../common/include/arch/arm/rte_memcpy_64.h | 372 ++ .../librte_eal/common/include/arch/arm/rte_pause.h 
| 22 + .../common/include/arch/arm/rte_pause_32.h | 23 + .../common/include/arch/arm/rte_pause_64.h | 24 + .../common/include/arch/arm/rte_prefetch.h | 14 + .../common/include/arch/arm/rte_prefetch_32.h | 40 + .../common/include/arch/arm/rte_prefetch_64.h | 39 + .../common/include/arch/arm/rte_rwlock.h | 42 + .../common/include/arch/arm/rte_spinlock.h | 64 + .../librte_eal/common/include/arch/arm/rte_vect.h | 178 + .../common/include/arch/ppc_64/rte_atomic.h | 470 +++ .../common/include/arch/ppc_64/rte_byteorder.h | 150 + .../common/include/arch/ppc_64/rte_cpuflags.h | 88 + .../common/include/arch/ppc_64/rte_cycles.h | 96 + .../librte_eal/common/include/arch/ppc_64/rte_io.h | 18 + .../common/include/arch/ppc_64/rte_memcpy.h | 226 + .../common/include/arch/ppc_64/rte_pause.h | 22 + .../common/include/arch/ppc_64/rte_prefetch.h | 68 + .../common/include/arch/ppc_64/rte_rwlock.h | 40 + .../common/include/arch/ppc_64/rte_spinlock.h | 115 + .../common/include/arch/ppc_64/rte_vect.h | 61 + .../librte_eal/common/include/arch/x86/meson.build | 21 + .../common/include/arch/x86/rte_atomic.h | 270 ++ .../common/include/arch/x86/rte_atomic_32.h | 243 ++ .../common/include/arch/x86/rte_atomic_64.h | 211 + .../common/include/arch/x86/rte_byteorder.h | 99 + .../common/include/arch/x86/rte_byteorder_32.h | 29 + .../common/include/arch/x86/rte_byteorder_64.h | 30 + .../common/include/arch/x86/rte_cpuflags.h | 125 + .../common/include/arch/x86/rte_cycles.h | 66 + .../librte_eal/common/include/arch/x86/rte_io.h | 18 + .../common/include/arch/x86/rte_memcpy.h | 876 ++++ .../librte_eal/common/include/arch/x86/rte_pause.h | 24 + .../common/include/arch/x86/rte_prefetch.h | 39 + .../librte_eal/common/include/arch/x86/rte_rtm.h | 73 + .../common/include/arch/x86/rte_rwlock.h | 53 + .../common/include/arch/x86/rte_spinlock.h | 168 + .../librte_eal/common/include/arch/x86/rte_vect.h | 95 + .../librte_eal/common/include/generic/rte_atomic.h | 1085 +++++ .../common/include/generic/rte_byteorder.h | 247 ++ .../common/include/generic/rte_cpuflags.h | 88 + .../librte_eal/common/include/generic/rte_cycles.h | 169 + .../lib/librte_eal/common/include/generic/rte_io.h | 350 ++ .../librte_eal/common/include/generic/rte_memcpy.h | 112 + .../librte_eal/common/include/generic/rte_pause.h | 23 + .../common/include/generic/rte_prefetch.h | 54 + .../librte_eal/common/include/generic/rte_rwlock.h | 180 + .../common/include/generic/rte_spinlock.h | 297 ++ .../librte_eal/common/include/generic/rte_vect.h | 186 + .../dpdk/lib/librte_eal/common/include/rte_alarm.h | 77 + .../lib/librte_eal/common/include/rte_bitmap.h | 533 +++ .../common/include/rte_branch_prediction.h | 41 + .../dpdk/lib/librte_eal/common/include/rte_bus.h | 339 ++ .../dpdk/lib/librte_eal/common/include/rte_class.h | 134 + .../lib/librte_eal/common/include/rte_common.h | 578 +++ .../dpdk/lib/librte_eal/common/include/rte_debug.h | 82 + .../dpdk/lib/librte_eal/common/include/rte_dev.h | 463 ++ .../lib/librte_eal/common/include/rte_devargs.h | 319 ++ .../dpdk/lib/librte_eal/common/include/rte_eal.h | 505 +++ .../librte_eal/common/include/rte_eal_interrupts.h | 222 + .../librte_eal/common/include/rte_eal_memconfig.h | 95 + .../dpdk/lib/librte_eal/common/include/rte_errno.h | 66 + .../lib/librte_eal/common/include/rte_fbarray.h | 470 +++ .../lib/librte_eal/common/include/rte_hexdump.h | 60 + .../lib/librte_eal/common/include/rte_hypervisor.h | 33 + .../lib/librte_eal/common/include/rte_interrupts.h | 92 + .../lib/librte_eal/common/include/rte_keepalive.h | 170 + 
.../lib/librte_eal/common/include/rte_launch.h | 148 + .../dpdk/lib/librte_eal/common/include/rte_lcore.h | 324 ++ .../dpdk/lib/librte_eal/common/include/rte_log.h | 354 ++ .../lib/librte_eal/common/include/rte_malloc.h | 330 ++ .../librte_eal/common/include/rte_malloc_heap.h | 32 + .../lib/librte_eal/common/include/rte_memory.h | 506 +++ .../lib/librte_eal/common/include/rte_memzone.h | 320 ++ .../common/include/rte_pci_dev_feature_defs.h | 16 + .../common/include/rte_pci_dev_features.h | 15 + .../lib/librte_eal/common/include/rte_per_lcore.h | 50 + .../lib/librte_eal/common/include/rte_random.h | 62 + .../lib/librte_eal/common/include/rte_reciprocal.h | 90 + .../lib/librte_eal/common/include/rte_service.h | 427 ++ .../common/include/rte_service_component.h | 129 + .../lib/librte_eal/common/include/rte_string_fns.h | 83 + .../dpdk/lib/librte_eal/common/include/rte_tailq.h | 140 + .../dpdk/lib/librte_eal/common/include/rte_test.h | 46 + .../dpdk/lib/librte_eal/common/include/rte_time.h | 101 + .../dpdk/lib/librte_eal/common/include/rte_uuid.h | 129 + .../lib/librte_eal/common/include/rte_version.h | 102 + .../dpdk/lib/librte_eal/common/include/rte_vfio.h | 367 ++ src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c | 643 +++ src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h | 188 + src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c | 1001 +++++ src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h | 56 + src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c | 743 ++++ src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h | 86 + src/spdk/dpdk/lib/librte_eal/common/meson.build | 100 + .../dpdk/lib/librte_eal/common/rte_keepalive.c | 162 + src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c | 237 ++ .../dpdk/lib/librte_eal/common/rte_reciprocal.c | 164 + src/spdk/dpdk/lib/librte_eal/common/rte_service.c | 892 ++++ src/spdk/dpdk/lib/librte_eal/linuxapp/Makefile | 11 + src/spdk/dpdk/lib/librte_eal/linuxapp/eal/Makefile | 115 + src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal.c | 1172 ++++++ .../dpdk/lib/librte_eal/linuxapp/eal/eal_alarm.c | 241 ++ .../lib/librte_eal/linuxapp/eal/eal_cpuflags.c | 84 + .../dpdk/lib/librte_eal/linuxapp/eal/eal_debug.c | 92 + .../dpdk/lib/librte_eal/linuxapp/eal/eal_dev.c | 224 + .../librte_eal/linuxapp/eal/eal_hugepage_info.c | 525 +++ .../lib/librte_eal/linuxapp/eal/eal_interrupts.c | 1230 ++++++ .../dpdk/lib/librte_eal/linuxapp/eal/eal_lcore.c | 81 + .../dpdk/lib/librte_eal/linuxapp/eal/eal_log.c | 62 + .../lib/librte_eal/linuxapp/eal/eal_memalloc.c | 1363 ++++++ .../dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c | 2348 +++++++++++ .../dpdk/lib/librte_eal/linuxapp/eal/eal_thread.c | 188 + .../dpdk/lib/librte_eal/linuxapp/eal/eal_timer.c | 265 ++ .../dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c | 1916 +++++++++ .../dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.h | 144 + .../lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 92 + .../linuxapp/eal/include/exec-env/rte_kni_common.h | 133 + .../dpdk/lib/librte_eal/linuxapp/eal/meson.build | 29 + src/spdk/dpdk/lib/librte_eal/meson.build | 32 + src/spdk/dpdk/lib/librte_eal/rte_eal_version.map | 338 ++ src/spdk/dpdk/lib/librte_efd/Makefile | 23 + src/spdk/dpdk/lib/librte_efd/meson.build | 6 + src/spdk/dpdk/lib/librte_efd/rte_efd.c | 1335 ++++++ src/spdk/dpdk/lib/librte_efd/rte_efd.h | 279 ++ src/spdk/dpdk/lib/librte_efd/rte_efd_arm64.h | 48 + src/spdk/dpdk/lib/librte_efd/rte_efd_version.map | 13 + src/spdk/dpdk/lib/librte_efd/rte_efd_x86.h | 57 + src/spdk/dpdk/lib/librte_ethdev/Makefile | 44 + src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.c | 135 + 
src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.h | 27 + src/spdk/dpdk/lib/librte_ethdev/meson.build | 27 + src/spdk/dpdk/lib/librte_ethdev/rte_dev_info.h | 49 + src/spdk/dpdk/lib/librte_ethdev/rte_eth_ctrl.h | 828 ++++ src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.c | 4425 ++++++++++++++++++++ src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.h | 4298 +++++++++++++++++++ src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_core.h | 625 +++ .../dpdk/lib/librte_ethdev/rte_ethdev_driver.h | 357 ++ src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_pci.h | 210 + src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_vdev.h | 84 + .../dpdk/lib/librte_ethdev/rte_ethdev_version.map | 255 ++ src/spdk/dpdk/lib/librte_ethdev/rte_flow.c | 635 +++ src/spdk/dpdk/lib/librte_ethdev/rte_flow.h | 2208 ++++++++++ src/spdk/dpdk/lib/librte_ethdev/rte_flow_driver.h | 184 + src/spdk/dpdk/lib/librte_ethdev/rte_mtr.c | 201 + src/spdk/dpdk/lib/librte_ethdev/rte_mtr.h | 730 ++++ src/spdk/dpdk/lib/librte_ethdev/rte_mtr_driver.h | 192 + src/spdk/dpdk/lib/librte_ethdev/rte_tm.c | 409 ++ src/spdk/dpdk/lib/librte_ethdev/rte_tm.h | 1965 +++++++++ src/spdk/dpdk/lib/librte_ethdev/rte_tm_driver.h | 337 ++ src/spdk/dpdk/lib/librte_eventdev/Makefile | 46 + src/spdk/dpdk/lib/librte_eventdev/meson.build | 27 + .../lib/librte_eventdev/rte_event_crypto_adapter.c | 1128 +++++ .../lib/librte_eventdev/rte_event_crypto_adapter.h | 575 +++ .../lib/librte_eventdev/rte_event_eth_rx_adapter.c | 2430 +++++++++++ .../lib/librte_eventdev/rte_event_eth_rx_adapter.h | 513 +++ src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.c | 184 + src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.h | 278 ++ .../lib/librte_eventdev/rte_event_timer_adapter.c | 1299 ++++++ .../lib/librte_eventdev/rte_event_timer_adapter.h | 766 ++++ .../librte_eventdev/rte_event_timer_adapter_pmd.h | 114 + src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.c | 1357 ++++++ src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.h | 1920 +++++++++ .../dpdk/lib/librte_eventdev/rte_eventdev_pmd.h | 901 ++++ .../lib/librte_eventdev/rte_eventdev_pmd_pci.h | 137 + .../lib/librte_eventdev/rte_eventdev_pmd_vdev.h | 108 + .../lib/librte_eventdev/rte_eventdev_version.map | 113 + src/spdk/dpdk/lib/librte_flow_classify/Makefile | 26 + src/spdk/dpdk/lib/librte_flow_classify/meson.build | 7 + .../lib/librte_flow_classify/rte_flow_classify.c | 682 +++ .../lib/librte_flow_classify/rte_flow_classify.h | 279 ++ .../librte_flow_classify/rte_flow_classify_parse.c | 535 +++ .../librte_flow_classify/rte_flow_classify_parse.h | 59 + .../rte_flow_classify_version.map | 13 + src/spdk/dpdk/lib/librte_gro/Makefile | 25 + src/spdk/dpdk/lib/librte_gro/gro_tcp4.c | 370 ++ src/spdk/dpdk/lib/librte_gro/gro_tcp4.h | 301 ++ src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.c | 494 +++ src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.h | 156 + src/spdk/dpdk/lib/librte_gro/meson.build | 6 + src/spdk/dpdk/lib/librte_gro/rte_gro.c | 321 ++ src/spdk/dpdk/lib/librte_gro/rte_gro.h | 189 + src/spdk/dpdk/lib/librte_gro/rte_gro_version.map | 12 + src/spdk/dpdk/lib/librte_gso/Makefile | 27 + src/spdk/dpdk/lib/librte_gso/gso_common.c | 124 + src/spdk/dpdk/lib/librte_gso/gso_common.h | 145 + src/spdk/dpdk/lib/librte_gso/gso_tcp4.c | 73 + src/spdk/dpdk/lib/librte_gso/gso_tcp4.h | 45 + src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.c | 97 + src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.h | 46 + src/spdk/dpdk/lib/librte_gso/gso_udp4.c | 81 + src/spdk/dpdk/lib/librte_gso/gso_udp4.h | 42 + src/spdk/dpdk/lib/librte_gso/meson.build | 7 + src/spdk/dpdk/lib/librte_gso/rte_gso.c | 95 + 
src/spdk/dpdk/lib/librte_gso/rte_gso.h | 123 + src/spdk/dpdk/lib/librte_gso/rte_gso_version.map | 7 + src/spdk/dpdk/lib/librte_hash/Makefile | 33 + src/spdk/dpdk/lib/librte_hash/meson.build | 16 + src/spdk/dpdk/lib/librte_hash/rte_cmp_arm64.h | 85 + src/spdk/dpdk/lib/librte_hash/rte_cmp_x86.h | 74 + src/spdk/dpdk/lib/librte_hash/rte_crc_arm64.h | 183 + src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.c | 1344 ++++++ src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.h | 198 + src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.c | 210 + src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.h | 360 ++ src/spdk/dpdk/lib/librte_hash/rte_hash.h | 470 +++ src/spdk/dpdk/lib/librte_hash/rte_hash_crc.h | 601 +++ src/spdk/dpdk/lib/librte_hash/rte_hash_version.map | 55 + src/spdk/dpdk/lib/librte_hash/rte_jhash.h | 414 ++ src/spdk/dpdk/lib/librte_hash/rte_thash.h | 257 ++ src/spdk/dpdk/lib/librte_ip_frag/Makefile | 29 + src/spdk/dpdk/lib/librte_ip_frag/ip_frag_common.h | 152 + .../dpdk/lib/librte_ip_frag/ip_frag_internal.c | 387 ++ src/spdk/dpdk/lib/librte_ip_frag/meson.build | 11 + src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag.h | 332 ++ .../dpdk/lib/librte_ip_frag/rte_ip_frag_common.c | 123 + .../lib/librte_ip_frag/rte_ip_frag_version.map | 20 + .../lib/librte_ip_frag/rte_ipv4_fragmentation.c | 185 + .../dpdk/lib/librte_ip_frag/rte_ipv4_reassembly.c | 163 + .../lib/librte_ip_frag/rte_ipv6_fragmentation.c | 178 + .../dpdk/lib/librte_ip_frag/rte_ipv6_reassembly.c | 204 + src/spdk/dpdk/lib/librte_jobstats/Makefile | 23 + src/spdk/dpdk/lib/librte_jobstats/meson.build | 5 + src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.c | 264 ++ src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.h | 307 ++ .../lib/librte_jobstats/rte_jobstats_version.map | 26 + src/spdk/dpdk/lib/librte_kni/Makefile | 22 + src/spdk/dpdk/lib/librte_kni/meson.build | 10 + src/spdk/dpdk/lib/librte_kni/rte_kni.c | 801 ++++ src/spdk/dpdk/lib/librte_kni/rte_kni.h | 240 ++ src/spdk/dpdk/lib/librte_kni/rte_kni_fifo.h | 73 + src/spdk/dpdk/lib/librte_kni/rte_kni_version.map | 17 + src/spdk/dpdk/lib/librte_kvargs/Makefile | 23 + src/spdk/dpdk/lib/librte_kvargs/meson.build | 11 + src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.c | 205 + src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.h | 186 + .../dpdk/lib/librte_kvargs/rte_kvargs_version.map | 18 + src/spdk/dpdk/lib/librte_latencystats/Makefile | 24 + src/spdk/dpdk/lib/librte_latencystats/meson.build | 6 + .../lib/librte_latencystats/rte_latencystats.c | 336 ++ .../lib/librte_latencystats/rte_latencystats.h | 128 + .../rte_latencystats_version.map | 11 + src/spdk/dpdk/lib/librte_lpm/Makefile | 31 + src/spdk/dpdk/lib/librte_lpm/meson.build | 9 + src/spdk/dpdk/lib/librte_lpm/rte_lpm.c | 1887 +++++++++ src/spdk/dpdk/lib/librte_lpm/rte_lpm.h | 470 +++ src/spdk/dpdk/lib/librte_lpm/rte_lpm6.c | 961 +++++ src/spdk/dpdk/lib/librte_lpm/rte_lpm6.h | 226 + src/spdk/dpdk/lib/librte_lpm/rte_lpm_altivec.h | 154 + src/spdk/dpdk/lib/librte_lpm/rte_lpm_neon.h | 154 + src/spdk/dpdk/lib/librte_lpm/rte_lpm_sse.h | 122 + src/spdk/dpdk/lib/librte_lpm/rte_lpm_version.map | 46 + src/spdk/dpdk/lib/librte_mbuf/Makefile | 22 + src/spdk/dpdk/lib/librte_mbuf/meson.build | 7 + src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.c | 463 ++ src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.h | 2248 ++++++++++ src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.c | 97 + src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.h | 95 + src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.c | 206 + src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.h | 758 ++++ src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_version.map | 47 + 
src/spdk/dpdk/lib/librte_member/Makefile | 24 + src/spdk/dpdk/lib/librte_member/meson.build | 6 + src/spdk/dpdk/lib/librte_member/rte_member.c | 305 ++ src/spdk/dpdk/lib/librte_member/rte_member.h | 490 +++ src/spdk/dpdk/lib/librte_member/rte_member_ht.c | 557 +++ src/spdk/dpdk/lib/librte_member/rte_member_ht.h | 65 + src/spdk/dpdk/lib/librte_member/rte_member_vbf.c | 321 ++ src/spdk/dpdk/lib/librte_member/rte_member_vbf.h | 53 + .../dpdk/lib/librte_member/rte_member_version.map | 16 + src/spdk/dpdk/lib/librte_member/rte_member_x86.h | 78 + src/spdk/dpdk/lib/librte_mempool/Makefile | 27 + src/spdk/dpdk/lib/librte_mempool/meson.build | 21 + src/spdk/dpdk/lib/librte_mempool/rte_mempool.c | 1290 ++++++ src/spdk/dpdk/lib/librte_mempool/rte_mempool.h | 1700 ++++++++ src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops.c | 179 + .../lib/librte_mempool/rte_mempool_ops_default.c | 70 + .../lib/librte_mempool/rte_mempool_version.map | 60 + src/spdk/dpdk/lib/librte_meter/Makefile | 29 + src/spdk/dpdk/lib/librte_meter/meson.build | 6 + src/spdk/dpdk/lib/librte_meter/rte_meter.c | 112 + src/spdk/dpdk/lib/librte_meter/rte_meter.h | 441 ++ .../dpdk/lib/librte_meter/rte_meter_version.map | 19 + src/spdk/dpdk/lib/librte_metrics/Makefile | 22 + src/spdk/dpdk/lib/librte_metrics/meson.build | 5 + src/spdk/dpdk/lib/librte_metrics/rte_metrics.c | 280 ++ src/spdk/dpdk/lib/librte_metrics/rte_metrics.h | 222 + .../lib/librte_metrics/rte_metrics_version.map | 13 + src/spdk/dpdk/lib/librte_net/Makefile | 25 + src/spdk/dpdk/lib/librte_net/meson.build | 19 + src/spdk/dpdk/lib/librte_net/net_crc_neon.h | 269 ++ src/spdk/dpdk/lib/librte_net/net_crc_sse.h | 334 ++ src/spdk/dpdk/lib/librte_net/rte_arp.c | 50 + src/spdk/dpdk/lib/librte_net/rte_arp.h | 74 + src/spdk/dpdk/lib/librte_net/rte_esp.h | 32 + src/spdk/dpdk/lib/librte_net/rte_ether.h | 418 ++ src/spdk/dpdk/lib/librte_net/rte_gre.h | 43 + src/spdk/dpdk/lib/librte_net/rte_icmp.h | 42 + src/spdk/dpdk/lib/librte_net/rte_ip.h | 428 ++ src/spdk/dpdk/lib/librte_net/rte_net.c | 496 +++ src/spdk/dpdk/lib/librte_net/rte_net.h | 203 + src/spdk/dpdk/lib/librte_net/rte_net_crc.c | 196 + src/spdk/dpdk/lib/librte_net/rte_net_crc.h | 71 + src/spdk/dpdk/lib/librte_net/rte_net_version.map | 21 + src/spdk/dpdk/lib/librte_net/rte_sctp.h | 37 + src/spdk/dpdk/lib/librte_net/rte_tcp.h | 42 + src/spdk/dpdk/lib/librte_net/rte_udp.h | 37 + src/spdk/dpdk/lib/librte_pci/Makefile | 21 + src/spdk/dpdk/lib/librte_pci/meson.build | 5 + src/spdk/dpdk/lib/librte_pci/rte_pci.c | 184 + src/spdk/dpdk/lib/librte_pci/rte_pci.h | 234 ++ src/spdk/dpdk/lib/librte_pci/rte_pci_version.map | 14 + src/spdk/dpdk/lib/librte_pdump/Makefile | 25 + src/spdk/dpdk/lib/librte_pdump/meson.build | 8 + src/spdk/dpdk/lib/librte_pdump/rte_pdump.c | 625 +++ src/spdk/dpdk/lib/librte_pdump/rte_pdump.h | 192 + .../dpdk/lib/librte_pdump/rte_pdump_version.map | 13 + src/spdk/dpdk/lib/librte_pipeline/Makefile | 31 + src/spdk/dpdk/lib/librte_pipeline/meson.build | 8 + src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.c | 1619 +++++++ src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.h | 848 ++++ .../lib/librte_pipeline/rte_pipeline_version.map | 75 + .../dpdk/lib/librte_pipeline/rte_port_in_action.c | 531 +++ .../dpdk/lib/librte_pipeline/rte_port_in_action.h | 301 ++ .../dpdk/lib/librte_pipeline/rte_table_action.c | 2386 +++++++++++ .../dpdk/lib/librte_pipeline/rte_table_action.h | 905 ++++ src/spdk/dpdk/lib/librte_port/Makefile | 57 + src/spdk/dpdk/lib/librte_port/meson.build | 28 + src/spdk/dpdk/lib/librte_port/rte_port.h | 234 ++ 
src/spdk/dpdk/lib/librte_port/rte_port_ethdev.c | 524 +++ src/spdk/dpdk/lib/librte_port/rte_port_ethdev.h | 76 + src/spdk/dpdk/lib/librte_port/rte_port_fd.c | 518 +++ src/spdk/dpdk/lib/librte_port/rte_port_fd.h | 76 + src/spdk/dpdk/lib/librte_port/rte_port_frag.c | 277 ++ src/spdk/dpdk/lib/librte_port/rte_port_frag.h | 72 + src/spdk/dpdk/lib/librte_port/rte_port_kni.c | 545 +++ src/spdk/dpdk/lib/librte_port/rte_port_kni.h | 95 + src/spdk/dpdk/lib/librte_port/rte_port_ras.c | 330 ++ src/spdk/dpdk/lib/librte_port/rte_port_ras.h | 61 + src/spdk/dpdk/lib/librte_port/rte_port_ring.c | 787 ++++ src/spdk/dpdk/lib/librte_port/rte_port_ring.h | 94 + src/spdk/dpdk/lib/librte_port/rte_port_sched.c | 294 ++ src/spdk/dpdk/lib/librte_port/rte_port_sched.h | 53 + .../dpdk/lib/librte_port/rte_port_source_sink.c | 619 +++ .../dpdk/lib/librte_port/rte_port_source_sink.h | 58 + src/spdk/dpdk/lib/librte_port/rte_port_version.map | 53 + src/spdk/dpdk/lib/librte_power/Makefile | 23 + src/spdk/dpdk/lib/librte_power/channel_commands.h | 83 + src/spdk/dpdk/lib/librte_power/guest_channel.c | 141 + src/spdk/dpdk/lib/librte_power/guest_channel.h | 75 + src/spdk/dpdk/lib/librte_power/meson.build | 9 + .../dpdk/lib/librte_power/power_acpi_cpufreq.c | 646 +++ .../dpdk/lib/librte_power/power_acpi_cpufreq.h | 219 + src/spdk/dpdk/lib/librte_power/power_common.h | 10 + src/spdk/dpdk/lib/librte_power/power_kvm_vm.c | 134 + src/spdk/dpdk/lib/librte_power/power_kvm_vm.h | 200 + src/spdk/dpdk/lib/librte_power/rte_power.c | 126 + src/spdk/dpdk/lib/librte_power/rte_power.h | 287 ++ .../dpdk/lib/librte_power/rte_power_version.map | 35 + src/spdk/dpdk/lib/librte_rawdev/Makefile | 27 + src/spdk/dpdk/lib/librte_rawdev/meson.build | 5 + src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.c | 552 +++ src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.h | 610 +++ src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_pmd.h | 627 +++ .../dpdk/lib/librte_rawdev/rte_rawdev_version.map | 35 + src/spdk/dpdk/lib/librte_reorder/Makefile | 23 + src/spdk/dpdk/lib/librte_reorder/meson.build | 6 + src/spdk/dpdk/lib/librte_reorder/rte_reorder.c | 381 ++ src/spdk/dpdk/lib/librte_reorder/rte_reorder.h | 153 + .../lib/librte_reorder/rte_reorder_version.map | 13 + src/spdk/dpdk/lib/librte_ring/Makefile | 24 + src/spdk/dpdk/lib/librte_ring/meson.build | 7 + src/spdk/dpdk/lib/librte_ring/rte_ring.c | 282 ++ src/spdk/dpdk/lib/librte_ring/rte_ring.h | 945 +++++ src/spdk/dpdk/lib/librte_ring/rte_ring_c11_mem.h | 163 + src/spdk/dpdk/lib/librte_ring/rte_ring_generic.h | 170 + src/spdk/dpdk/lib/librte_ring/rte_ring_version.map | 19 + src/spdk/dpdk/lib/librte_sched/Makefile | 33 + src/spdk/dpdk/lib/librte_sched/meson.build | 7 + src/spdk/dpdk/lib/librte_sched/rte_approx.c | 167 + src/spdk/dpdk/lib/librte_sched/rte_approx.h | 46 + src/spdk/dpdk/lib/librte_sched/rte_red.c | 129 + src/spdk/dpdk/lib/librte_sched/rte_red.h | 411 ++ src/spdk/dpdk/lib/librte_sched/rte_sched.c | 2242 ++++++++++ src/spdk/dpdk/lib/librte_sched/rte_sched.h | 447 ++ src/spdk/dpdk/lib/librte_sched/rte_sched_common.h | 101 + .../dpdk/lib/librte_sched/rte_sched_version.map | 37 + src/spdk/dpdk/lib/librte_security/Makefile | 28 + src/spdk/dpdk/lib/librte_security/meson.build | 7 + src/spdk/dpdk/lib/librte_security/rte_security.c | 139 + src/spdk/dpdk/lib/librte_security/rte_security.h | 543 +++ .../dpdk/lib/librte_security/rte_security_driver.h | 160 + .../lib/librte_security/rte_security_version.map | 16 + src/spdk/dpdk/lib/librte_table/Makefile | 59 + src/spdk/dpdk/lib/librte_table/meson.build | 29 + 
src/spdk/dpdk/lib/librte_table/rte_lru.h | 93 + src/spdk/dpdk/lib/librte_table/rte_lru_arm64.h | 60 + src/spdk/dpdk/lib/librte_table/rte_lru_x86.h | 103 + src/spdk/dpdk/lib/librte_table/rte_table.h | 272 ++ src/spdk/dpdk/lib/librte_table/rte_table_acl.c | 798 ++++ src/spdk/dpdk/lib/librte_table/rte_table_acl.h | 66 + src/spdk/dpdk/lib/librte_table/rte_table_array.c | 207 + src/spdk/dpdk/lib/librte_table/rte_table_array.h | 47 + src/spdk/dpdk/lib/librte_table/rte_table_hash.h | 106 + .../dpdk/lib/librte_table/rte_table_hash_cuckoo.c | 323 ++ .../dpdk/lib/librte_table/rte_table_hash_cuckoo.h | 57 + .../dpdk/lib/librte_table/rte_table_hash_ext.c | 1011 +++++ .../dpdk/lib/librte_table/rte_table_hash_key16.c | 1170 ++++++ .../dpdk/lib/librte_table/rte_table_hash_key32.c | 1203 ++++++ .../dpdk/lib/librte_table/rte_table_hash_key8.c | 1138 +++++ .../dpdk/lib/librte_table/rte_table_hash_lru.c | 959 +++++ src/spdk/dpdk/lib/librte_table/rte_table_lpm.c | 366 ++ src/spdk/dpdk/lib/librte_table/rte_table_lpm.h | 95 + .../dpdk/lib/librte_table/rte_table_lpm_ipv6.c | 368 ++ .../dpdk/lib/librte_table/rte_table_lpm_ipv6.h | 93 + src/spdk/dpdk/lib/librte_table/rte_table_stub.c | 92 + src/spdk/dpdk/lib/librte_table/rte_table_stub.h | 33 + .../dpdk/lib/librte_table/rte_table_version.map | 20 + src/spdk/dpdk/lib/librte_timer/Makefile | 22 + src/spdk/dpdk/lib/librte_timer/meson.build | 5 + src/spdk/dpdk/lib/librte_timer/rte_timer.c | 617 +++ src/spdk/dpdk/lib/librte_timer/rte_timer.h | 309 ++ .../dpdk/lib/librte_timer/rte_timer_version.map | 15 + src/spdk/dpdk/lib/librte_vhost/Makefile | 37 + src/spdk/dpdk/lib/librte_vhost/fd_man.c | 370 ++ src/spdk/dpdk/lib/librte_vhost/fd_man.h | 57 + src/spdk/dpdk/lib/librte_vhost/iotlb.c | 364 ++ src/spdk/dpdk/lib/librte_vhost/iotlb.h | 79 + src/spdk/dpdk/lib/librte_vhost/meson.build | 16 + src/spdk/dpdk/lib/librte_vhost/rte_vdpa.h | 87 + src/spdk/dpdk/lib/librte_vhost/rte_vhost.h | 653 +++ src/spdk/dpdk/lib/librte_vhost/rte_vhost_crypto.h | 109 + .../dpdk/lib/librte_vhost/rte_vhost_version.map | 85 + src/spdk/dpdk/lib/librte_vhost/socket.c | 1063 +++++ src/spdk/dpdk/lib/librte_vhost/vdpa.c | 115 + src/spdk/dpdk/lib/librte_vhost/vhost.c | 825 ++++ src/spdk/dpdk/lib/librte_vhost/vhost.h | 745 ++++ src/spdk/dpdk/lib/librte_vhost/vhost_crypto.c | 1372 ++++++ src/spdk/dpdk/lib/librte_vhost/vhost_user.c | 1958 +++++++++ src/spdk/dpdk/lib/librte_vhost/vhost_user.h | 152 + src/spdk/dpdk/lib/librte_vhost/virtio_crypto.h | 422 ++ src/spdk/dpdk/lib/librte_vhost/virtio_net.c | 1657 ++++++++ src/spdk/dpdk/lib/meson.build | 134 + 633 files changed, 179371 insertions(+) create mode 100644 src/spdk/dpdk/lib/Makefile create mode 100644 src/spdk/dpdk/lib/librte_acl/Makefile create mode 100644 src/spdk/dpdk/lib/librte_acl/acl.h create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_bld.c create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_gen.c create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run.h create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_altivec.c create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_altivec.h create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_avx2.c create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_avx2.h create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_neon.c create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_neon.h create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_scalar.c create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_sse.c create mode 100644 src/spdk/dpdk/lib/librte_acl/acl_run_sse.h create mode 100644 
src/spdk/dpdk/lib/librte_acl/acl_vect.h create mode 100644 src/spdk/dpdk/lib/librte_acl/meson.build create mode 100644 src/spdk/dpdk/lib/librte_acl/rte_acl.c create mode 100644 src/spdk/dpdk/lib/librte_acl/rte_acl.h create mode 100644 src/spdk/dpdk/lib/librte_acl/rte_acl_osdep.h create mode 100644 src/spdk/dpdk/lib/librte_acl/rte_acl_version.map create mode 100644 src/spdk/dpdk/lib/librte_acl/tb_mem.c create mode 100644 src/spdk/dpdk/lib/librte_acl/tb_mem.h create mode 100644 src/spdk/dpdk/lib/librte_bbdev/Makefile create mode 100644 src/spdk/dpdk/lib/librte_bbdev/meson.build create mode 100644 src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.c create mode 100644 src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.h create mode 100644 src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_op.h create mode 100644 src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h create mode 100644 src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_version.map create mode 100644 src/spdk/dpdk/lib/librte_bitratestats/Makefile create mode 100644 src/spdk/dpdk/lib/librte_bitratestats/meson.build create mode 100644 src/spdk/dpdk/lib/librte_bitratestats/rte_bitrate.c create mode 100644 src/spdk/dpdk/lib/librte_bitratestats/rte_bitrate.h create mode 100644 src/spdk/dpdk/lib/librte_bitratestats/rte_bitratestats_version.map create mode 100644 src/spdk/dpdk/lib/librte_bpf/Makefile create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf.c create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf_def.h create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf_exec.c create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf_impl.h create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf_jit_x86.c create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf_load.c create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf_load_elf.c create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf_pkt.c create mode 100644 src/spdk/dpdk/lib/librte_bpf/bpf_validate.c create mode 100644 src/spdk/dpdk/lib/librte_bpf/meson.build create mode 100644 src/spdk/dpdk/lib/librte_bpf/rte_bpf.h create mode 100644 src/spdk/dpdk/lib/librte_bpf/rte_bpf_ethdev.h create mode 100644 src/spdk/dpdk/lib/librte_bpf/rte_bpf_version.map create mode 100644 src/spdk/dpdk/lib/librte_cfgfile/Makefile create mode 100644 src/spdk/dpdk/lib/librte_cfgfile/meson.build create mode 100644 src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.c create mode 100644 src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.h create mode 100644 src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile_version.map create mode 100644 src/spdk/dpdk/lib/librte_cmdline/Makefile create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_etheraddr.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_etheraddr.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_num.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_num.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_portlist.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_portlist.h create mode 100644 
src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_string.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_string.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.c create mode 100644 src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.h create mode 100644 src/spdk/dpdk/lib/librte_cmdline/meson.build create mode 100644 src/spdk/dpdk/lib/librte_cmdline/rte_cmdline_version.map create mode 100644 src/spdk/dpdk/lib/librte_compat/Makefile create mode 100644 src/spdk/dpdk/lib/librte_compat/meson.build create mode 100644 src/spdk/dpdk/lib/librte_compat/rte_compat.h create mode 100644 src/spdk/dpdk/lib/librte_compressdev/Makefile create mode 100644 src/spdk/dpdk/lib/librte_compressdev/meson.build create mode 100644 src/spdk/dpdk/lib/librte_compressdev/rte_comp.c create mode 100644 src/spdk/dpdk/lib/librte_compressdev/rte_comp.h create mode 100644 src/spdk/dpdk/lib/librte_compressdev/rte_compressdev.c create mode 100644 src/spdk/dpdk/lib/librte_compressdev/rte_compressdev.h create mode 100644 src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_internal.h create mode 100644 src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_pmd.c create mode 100644 src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_pmd.h create mode 100644 src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_version.map create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/Makefile create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/meson.build create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/rte_crypto.h create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/rte_crypto_asym.h create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/rte_crypto_sym.h create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.c create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.h create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.c create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.h create mode 100644 src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_version.map create mode 100644 src/spdk/dpdk/lib/librte_distributor/Makefile create mode 100644 src/spdk/dpdk/lib/librte_distributor/meson.build create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor.c create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor.h create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor_match_generic.c create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor_match_sse.c create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor_private.h create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor_v1705.h create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor_v20.c create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor_v20.h create mode 100644 src/spdk/dpdk/lib/librte_distributor/rte_distributor_version.map create mode 100644 src/spdk/dpdk/lib/librte_eal/Makefile create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/BSDmakefile.meson create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/Makefile create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/Makefile create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal.c create mode 100644 
src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_alarm.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_alarm_private.h create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_cpuflags.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_debug.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_dev.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_interrupts.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_lcore.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_memalloc.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_memory.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_thread.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_timer.c create mode 100644 src/spdk/dpdk/lib/librte_eal/bsdapp/eal/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eal/common/Makefile create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/arm/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_cpuflags.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_cycles.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_hypervisor.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_cycles.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/x86/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cpuflags.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cpuid.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cycles.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_hypervisor.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_memcpy.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_spinlock.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c create mode 100644 
src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_options.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_private.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/eal_thread.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic_32.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_byteorder.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags_32.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_io.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_io_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause_32.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_rwlock.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_spinlock.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_vect.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_byteorder.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_cpuflags.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_cycles.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_io.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h create mode 100644 
src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_prefetch.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_rwlock.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_spinlock.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_vect.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic_32.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder_32.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder_64.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_cycles.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_io.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_memcpy.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_pause.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_prefetch.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_rtm.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_rwlock.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_spinlock.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_vect.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_atomic.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_byteorder.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_cpuflags.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_cycles.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_io.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_memcpy.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_pause.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_prefetch.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_rwlock.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_spinlock.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_vect.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_alarm.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_bitmap.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_branch_prediction.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_bus.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_class.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_common.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_debug.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_dev.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_devargs.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_eal.h create mode 100644 
src/spdk/dpdk/lib/librte_eal/common/include/rte_eal_interrupts.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_eal_memconfig.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_errno.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_fbarray.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_hexdump.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_hypervisor.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_interrupts.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_keepalive.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_launch.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_lcore.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_log.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_malloc.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_malloc_heap.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_memory.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_memzone.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_pci_dev_feature_defs.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_pci_dev_features.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_per_lcore.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_random.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_reciprocal.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_service.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_service_component.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_string_fns.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_tailq.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_test.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_time.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_uuid.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_version.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/include/rte_vfio.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h create mode 100644 src/spdk/dpdk/lib/librte_eal/common/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c create mode 100644 src/spdk/dpdk/lib/librte_eal/common/rte_service.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/Makefile create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/Makefile create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_alarm.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_cpuflags.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_debug.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_dev.c create mode 100644 
src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_interrupts.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_lcore.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_log.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_memalloc.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_thread.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_timer.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.h create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h create mode 100644 src/spdk/dpdk/lib/librte_eal/linuxapp/eal/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eal/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eal/rte_eal_version.map create mode 100644 src/spdk/dpdk/lib/librte_efd/Makefile create mode 100644 src/spdk/dpdk/lib/librte_efd/meson.build create mode 100644 src/spdk/dpdk/lib/librte_efd/rte_efd.c create mode 100644 src/spdk/dpdk/lib/librte_efd/rte_efd.h create mode 100644 src/spdk/dpdk/lib/librte_efd/rte_efd_arm64.h create mode 100644 src/spdk/dpdk/lib/librte_efd/rte_efd_version.map create mode 100644 src/spdk/dpdk/lib/librte_efd/rte_efd_x86.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/Makefile create mode 100644 src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.c create mode 100644 src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/meson.build create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_dev_info.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_eth_ctrl.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.c create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_core.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_driver.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_pci.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_vdev.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_version.map create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_flow.c create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_flow.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_flow_driver.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_mtr.c create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_mtr.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_mtr_driver.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_tm.c create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_tm.h create mode 100644 src/spdk/dpdk/lib/librte_ethdev/rte_tm_driver.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/Makefile create mode 100644 src/spdk/dpdk/lib/librte_eventdev/meson.build create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_crypto_adapter.c create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_crypto_adapter.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_eth_rx_adapter.c create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_eth_rx_adapter.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.c create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.h 
create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter.c create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter_pmd.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.c create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd_pci.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd_vdev.h create mode 100644 src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_version.map create mode 100644 src/spdk/dpdk/lib/librte_flow_classify/Makefile create mode 100644 src/spdk/dpdk/lib/librte_flow_classify/meson.build create mode 100644 src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify.c create mode 100644 src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify.h create mode 100644 src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_parse.c create mode 100644 src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_parse.h create mode 100644 src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_version.map create mode 100644 src/spdk/dpdk/lib/librte_gro/Makefile create mode 100644 src/spdk/dpdk/lib/librte_gro/gro_tcp4.c create mode 100644 src/spdk/dpdk/lib/librte_gro/gro_tcp4.h create mode 100644 src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.c create mode 100644 src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.h create mode 100644 src/spdk/dpdk/lib/librte_gro/meson.build create mode 100644 src/spdk/dpdk/lib/librte_gro/rte_gro.c create mode 100644 src/spdk/dpdk/lib/librte_gro/rte_gro.h create mode 100644 src/spdk/dpdk/lib/librte_gro/rte_gro_version.map create mode 100644 src/spdk/dpdk/lib/librte_gso/Makefile create mode 100644 src/spdk/dpdk/lib/librte_gso/gso_common.c create mode 100644 src/spdk/dpdk/lib/librte_gso/gso_common.h create mode 100644 src/spdk/dpdk/lib/librte_gso/gso_tcp4.c create mode 100644 src/spdk/dpdk/lib/librte_gso/gso_tcp4.h create mode 100644 src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.c create mode 100644 src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.h create mode 100644 src/spdk/dpdk/lib/librte_gso/gso_udp4.c create mode 100644 src/spdk/dpdk/lib/librte_gso/gso_udp4.h create mode 100644 src/spdk/dpdk/lib/librte_gso/meson.build create mode 100644 src/spdk/dpdk/lib/librte_gso/rte_gso.c create mode 100644 src/spdk/dpdk/lib/librte_gso/rte_gso.h create mode 100644 src/spdk/dpdk/lib/librte_gso/rte_gso_version.map create mode 100644 src/spdk/dpdk/lib/librte_hash/Makefile create mode 100644 src/spdk/dpdk/lib/librte_hash/meson.build create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_cmp_arm64.h create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_cmp_x86.h create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_crc_arm64.h create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.c create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.h create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.c create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.h create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_hash.h create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_hash_crc.h create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_hash_version.map create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_jhash.h create mode 100644 src/spdk/dpdk/lib/librte_hash/rte_thash.h create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/Makefile create mode 100644 
src/spdk/dpdk/lib/librte_ip_frag/ip_frag_common.h create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/ip_frag_internal.c create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/meson.build create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag.h create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag_common.c create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag_version.map create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/rte_ipv4_fragmentation.c create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/rte_ipv4_reassembly.c create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/rte_ipv6_fragmentation.c create mode 100644 src/spdk/dpdk/lib/librte_ip_frag/rte_ipv6_reassembly.c create mode 100644 src/spdk/dpdk/lib/librte_jobstats/Makefile create mode 100644 src/spdk/dpdk/lib/librte_jobstats/meson.build create mode 100644 src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.c create mode 100644 src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.h create mode 100644 src/spdk/dpdk/lib/librte_jobstats/rte_jobstats_version.map create mode 100644 src/spdk/dpdk/lib/librte_kni/Makefile create mode 100644 src/spdk/dpdk/lib/librte_kni/meson.build create mode 100644 src/spdk/dpdk/lib/librte_kni/rte_kni.c create mode 100644 src/spdk/dpdk/lib/librte_kni/rte_kni.h create mode 100644 src/spdk/dpdk/lib/librte_kni/rte_kni_fifo.h create mode 100644 src/spdk/dpdk/lib/librte_kni/rte_kni_version.map create mode 100644 src/spdk/dpdk/lib/librte_kvargs/Makefile create mode 100644 src/spdk/dpdk/lib/librte_kvargs/meson.build create mode 100644 src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.c create mode 100644 src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.h create mode 100644 src/spdk/dpdk/lib/librte_kvargs/rte_kvargs_version.map create mode 100644 src/spdk/dpdk/lib/librte_latencystats/Makefile create mode 100644 src/spdk/dpdk/lib/librte_latencystats/meson.build create mode 100644 src/spdk/dpdk/lib/librte_latencystats/rte_latencystats.c create mode 100644 src/spdk/dpdk/lib/librte_latencystats/rte_latencystats.h create mode 100644 src/spdk/dpdk/lib/librte_latencystats/rte_latencystats_version.map create mode 100644 src/spdk/dpdk/lib/librte_lpm/Makefile create mode 100644 src/spdk/dpdk/lib/librte_lpm/meson.build create mode 100644 src/spdk/dpdk/lib/librte_lpm/rte_lpm.c create mode 100644 src/spdk/dpdk/lib/librte_lpm/rte_lpm.h create mode 100644 src/spdk/dpdk/lib/librte_lpm/rte_lpm6.c create mode 100644 src/spdk/dpdk/lib/librte_lpm/rte_lpm6.h create mode 100644 src/spdk/dpdk/lib/librte_lpm/rte_lpm_altivec.h create mode 100644 src/spdk/dpdk/lib/librte_lpm/rte_lpm_neon.h create mode 100644 src/spdk/dpdk/lib/librte_lpm/rte_lpm_sse.h create mode 100644 src/spdk/dpdk/lib/librte_lpm/rte_lpm_version.map create mode 100644 src/spdk/dpdk/lib/librte_mbuf/Makefile create mode 100644 src/spdk/dpdk/lib/librte_mbuf/meson.build create mode 100644 src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.c create mode 100644 src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.h create mode 100644 src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.c create mode 100644 src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.h create mode 100644 src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.c create mode 100644 src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.h create mode 100644 src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_version.map create mode 100644 src/spdk/dpdk/lib/librte_member/Makefile create mode 100644 src/spdk/dpdk/lib/librte_member/meson.build create mode 100644 src/spdk/dpdk/lib/librte_member/rte_member.c create mode 100644 src/spdk/dpdk/lib/librte_member/rte_member.h create 
mode 100644 src/spdk/dpdk/lib/librte_member/rte_member_ht.c create mode 100644 src/spdk/dpdk/lib/librte_member/rte_member_ht.h create mode 100644 src/spdk/dpdk/lib/librte_member/rte_member_vbf.c create mode 100644 src/spdk/dpdk/lib/librte_member/rte_member_vbf.h create mode 100644 src/spdk/dpdk/lib/librte_member/rte_member_version.map create mode 100644 src/spdk/dpdk/lib/librte_member/rte_member_x86.h create mode 100644 src/spdk/dpdk/lib/librte_mempool/Makefile create mode 100644 src/spdk/dpdk/lib/librte_mempool/meson.build create mode 100644 src/spdk/dpdk/lib/librte_mempool/rte_mempool.c create mode 100644 src/spdk/dpdk/lib/librte_mempool/rte_mempool.h create mode 100644 src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops.c create mode 100644 src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops_default.c create mode 100644 src/spdk/dpdk/lib/librte_mempool/rte_mempool_version.map create mode 100644 src/spdk/dpdk/lib/librte_meter/Makefile create mode 100644 src/spdk/dpdk/lib/librte_meter/meson.build create mode 100644 src/spdk/dpdk/lib/librte_meter/rte_meter.c create mode 100644 src/spdk/dpdk/lib/librte_meter/rte_meter.h create mode 100644 src/spdk/dpdk/lib/librte_meter/rte_meter_version.map create mode 100644 src/spdk/dpdk/lib/librte_metrics/Makefile create mode 100644 src/spdk/dpdk/lib/librte_metrics/meson.build create mode 100644 src/spdk/dpdk/lib/librte_metrics/rte_metrics.c create mode 100644 src/spdk/dpdk/lib/librte_metrics/rte_metrics.h create mode 100644 src/spdk/dpdk/lib/librte_metrics/rte_metrics_version.map create mode 100644 src/spdk/dpdk/lib/librte_net/Makefile create mode 100644 src/spdk/dpdk/lib/librte_net/meson.build create mode 100644 src/spdk/dpdk/lib/librte_net/net_crc_neon.h create mode 100644 src/spdk/dpdk/lib/librte_net/net_crc_sse.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_arp.c create mode 100644 src/spdk/dpdk/lib/librte_net/rte_arp.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_esp.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_ether.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_gre.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_icmp.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_ip.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_net.c create mode 100644 src/spdk/dpdk/lib/librte_net/rte_net.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_net_crc.c create mode 100644 src/spdk/dpdk/lib/librte_net/rte_net_crc.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_net_version.map create mode 100644 src/spdk/dpdk/lib/librte_net/rte_sctp.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_tcp.h create mode 100644 src/spdk/dpdk/lib/librte_net/rte_udp.h create mode 100644 src/spdk/dpdk/lib/librte_pci/Makefile create mode 100644 src/spdk/dpdk/lib/librte_pci/meson.build create mode 100644 src/spdk/dpdk/lib/librte_pci/rte_pci.c create mode 100644 src/spdk/dpdk/lib/librte_pci/rte_pci.h create mode 100644 src/spdk/dpdk/lib/librte_pci/rte_pci_version.map create mode 100644 src/spdk/dpdk/lib/librte_pdump/Makefile create mode 100644 src/spdk/dpdk/lib/librte_pdump/meson.build create mode 100644 src/spdk/dpdk/lib/librte_pdump/rte_pdump.c create mode 100644 src/spdk/dpdk/lib/librte_pdump/rte_pdump.h create mode 100644 src/spdk/dpdk/lib/librte_pdump/rte_pdump_version.map create mode 100644 src/spdk/dpdk/lib/librte_pipeline/Makefile create mode 100644 src/spdk/dpdk/lib/librte_pipeline/meson.build create mode 100644 src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.c create mode 100644 src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.h 
create mode 100644 src/spdk/dpdk/lib/librte_pipeline/rte_pipeline_version.map create mode 100644 src/spdk/dpdk/lib/librte_pipeline/rte_port_in_action.c create mode 100644 src/spdk/dpdk/lib/librte_pipeline/rte_port_in_action.h create mode 100644 src/spdk/dpdk/lib/librte_pipeline/rte_table_action.c create mode 100644 src/spdk/dpdk/lib/librte_pipeline/rte_table_action.h create mode 100644 src/spdk/dpdk/lib/librte_port/Makefile create mode 100644 src/spdk/dpdk/lib/librte_port/meson.build create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_ethdev.c create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_ethdev.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_fd.c create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_fd.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_frag.c create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_frag.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_kni.c create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_kni.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_ras.c create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_ras.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_ring.c create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_ring.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_sched.c create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_sched.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_source_sink.c create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_source_sink.h create mode 100644 src/spdk/dpdk/lib/librte_port/rte_port_version.map create mode 100644 src/spdk/dpdk/lib/librte_power/Makefile create mode 100644 src/spdk/dpdk/lib/librte_power/channel_commands.h create mode 100644 src/spdk/dpdk/lib/librte_power/guest_channel.c create mode 100644 src/spdk/dpdk/lib/librte_power/guest_channel.h create mode 100644 src/spdk/dpdk/lib/librte_power/meson.build create mode 100644 src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.c create mode 100644 src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.h create mode 100644 src/spdk/dpdk/lib/librte_power/power_common.h create mode 100644 src/spdk/dpdk/lib/librte_power/power_kvm_vm.c create mode 100644 src/spdk/dpdk/lib/librte_power/power_kvm_vm.h create mode 100644 src/spdk/dpdk/lib/librte_power/rte_power.c create mode 100644 src/spdk/dpdk/lib/librte_power/rte_power.h create mode 100644 src/spdk/dpdk/lib/librte_power/rte_power_version.map create mode 100644 src/spdk/dpdk/lib/librte_rawdev/Makefile create mode 100644 src/spdk/dpdk/lib/librte_rawdev/meson.build create mode 100644 src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.c create mode 100644 src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.h create mode 100644 src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_pmd.h create mode 100644 src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_version.map create mode 100644 src/spdk/dpdk/lib/librte_reorder/Makefile create mode 100644 src/spdk/dpdk/lib/librte_reorder/meson.build create mode 100644 src/spdk/dpdk/lib/librte_reorder/rte_reorder.c create mode 100644 src/spdk/dpdk/lib/librte_reorder/rte_reorder.h create mode 100644 src/spdk/dpdk/lib/librte_reorder/rte_reorder_version.map create mode 100644 src/spdk/dpdk/lib/librte_ring/Makefile create mode 100644 src/spdk/dpdk/lib/librte_ring/meson.build create mode 100644 src/spdk/dpdk/lib/librte_ring/rte_ring.c create mode 100644 src/spdk/dpdk/lib/librte_ring/rte_ring.h create mode 100644 
src/spdk/dpdk/lib/librte_ring/rte_ring_c11_mem.h create mode 100644 src/spdk/dpdk/lib/librte_ring/rte_ring_generic.h create mode 100644 src/spdk/dpdk/lib/librte_ring/rte_ring_version.map create mode 100644 src/spdk/dpdk/lib/librte_sched/Makefile create mode 100644 src/spdk/dpdk/lib/librte_sched/meson.build create mode 100644 src/spdk/dpdk/lib/librte_sched/rte_approx.c create mode 100644 src/spdk/dpdk/lib/librte_sched/rte_approx.h create mode 100644 src/spdk/dpdk/lib/librte_sched/rte_red.c create mode 100644 src/spdk/dpdk/lib/librte_sched/rte_red.h create mode 100644 src/spdk/dpdk/lib/librte_sched/rte_sched.c create mode 100644 src/spdk/dpdk/lib/librte_sched/rte_sched.h create mode 100644 src/spdk/dpdk/lib/librte_sched/rte_sched_common.h create mode 100644 src/spdk/dpdk/lib/librte_sched/rte_sched_version.map create mode 100644 src/spdk/dpdk/lib/librte_security/Makefile create mode 100644 src/spdk/dpdk/lib/librte_security/meson.build create mode 100644 src/spdk/dpdk/lib/librte_security/rte_security.c create mode 100644 src/spdk/dpdk/lib/librte_security/rte_security.h create mode 100644 src/spdk/dpdk/lib/librte_security/rte_security_driver.h create mode 100644 src/spdk/dpdk/lib/librte_security/rte_security_version.map create mode 100644 src/spdk/dpdk/lib/librte_table/Makefile create mode 100644 src/spdk/dpdk/lib/librte_table/meson.build create mode 100644 src/spdk/dpdk/lib/librte_table/rte_lru.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_lru_arm64.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_lru_x86.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_acl.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_acl.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_array.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_array.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_hash.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_hash_cuckoo.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_hash_cuckoo.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_hash_ext.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_hash_key16.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_hash_key32.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_hash_key8.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_hash_lru.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_lpm.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_lpm.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_lpm_ipv6.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_lpm_ipv6.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_stub.c create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_stub.h create mode 100644 src/spdk/dpdk/lib/librte_table/rte_table_version.map create mode 100644 src/spdk/dpdk/lib/librte_timer/Makefile create mode 100644 src/spdk/dpdk/lib/librte_timer/meson.build create mode 100644 src/spdk/dpdk/lib/librte_timer/rte_timer.c create mode 100644 src/spdk/dpdk/lib/librte_timer/rte_timer.h create mode 100644 src/spdk/dpdk/lib/librte_timer/rte_timer_version.map create mode 100644 src/spdk/dpdk/lib/librte_vhost/Makefile create mode 100644 src/spdk/dpdk/lib/librte_vhost/fd_man.c create mode 100644 src/spdk/dpdk/lib/librte_vhost/fd_man.h create mode 100644 src/spdk/dpdk/lib/librte_vhost/iotlb.c create mode 100644 src/spdk/dpdk/lib/librte_vhost/iotlb.h create mode 
100644 src/spdk/dpdk/lib/librte_vhost/meson.build create mode 100644 src/spdk/dpdk/lib/librte_vhost/rte_vdpa.h create mode 100644 src/spdk/dpdk/lib/librte_vhost/rte_vhost.h create mode 100644 src/spdk/dpdk/lib/librte_vhost/rte_vhost_crypto.h create mode 100644 src/spdk/dpdk/lib/librte_vhost/rte_vhost_version.map create mode 100644 src/spdk/dpdk/lib/librte_vhost/socket.c create mode 100644 src/spdk/dpdk/lib/librte_vhost/vdpa.c create mode 100644 src/spdk/dpdk/lib/librte_vhost/vhost.c create mode 100644 src/spdk/dpdk/lib/librte_vhost/vhost.h create mode 100644 src/spdk/dpdk/lib/librte_vhost/vhost_crypto.c create mode 100644 src/spdk/dpdk/lib/librte_vhost/vhost_user.c create mode 100644 src/spdk/dpdk/lib/librte_vhost/vhost_user.h create mode 100644 src/spdk/dpdk/lib/librte_vhost/virtio_crypto.h create mode 100644 src/spdk/dpdk/lib/librte_vhost/virtio_net.c create mode 100644 src/spdk/dpdk/lib/meson.build (limited to 'src/spdk/dpdk/lib') diff --git a/src/spdk/dpdk/lib/Makefile b/src/spdk/dpdk/lib/Makefile new file mode 100644 index 00000000..afa604e2 --- /dev/null +++ b/src/spdk/dpdk/lib/Makefile @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +DIRS-y += librte_compat +DIRS-$(CONFIG_RTE_LIBRTE_KVARGS) += librte_kvargs +DEPDIRS-librte_kvargs := librte_compat +DIRS-$(CONFIG_RTE_LIBRTE_EAL) += librte_eal +DEPDIRS-librte_eal := librte_kvargs +DIRS-$(CONFIG_RTE_LIBRTE_PCI) += librte_pci +DEPDIRS-librte_pci := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_RING) += librte_ring +DEPDIRS-librte_ring := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += librte_mempool +DEPDIRS-librte_mempool := librte_eal librte_ring +DIRS-$(CONFIG_RTE_LIBRTE_MBUF) += librte_mbuf +DEPDIRS-librte_mbuf := librte_eal librte_mempool +DIRS-$(CONFIG_RTE_LIBRTE_TIMER) += librte_timer +DEPDIRS-librte_timer := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_CFGFILE) += librte_cfgfile +DIRS-$(CONFIG_RTE_LIBRTE_CMDLINE) += librte_cmdline +DEPDIRS-librte_cmdline := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_ETHER) += librte_ethdev +DEPDIRS-librte_ethdev := librte_net librte_eal librte_mempool librte_ring +DEPDIRS-librte_ethdev += librte_mbuf +DEPDIRS-librte_ethdev += librte_kvargs +DIRS-$(CONFIG_RTE_LIBRTE_BBDEV) += librte_bbdev +DEPDIRS-librte_bbdev := librte_eal librte_mempool librte_mbuf +DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += librte_cryptodev +DEPDIRS-librte_cryptodev := librte_eal librte_mempool librte_ring librte_mbuf +DEPDIRS-librte_cryptodev += librte_kvargs +DIRS-$(CONFIG_RTE_LIBRTE_SECURITY) += librte_security +DEPDIRS-librte_security := librte_eal librte_mempool librte_ring librte_mbuf +DEPDIRS-librte_security += librte_ethdev +DEPDIRS-librte_security += librte_cryptodev +DIRS-$(CONFIG_RTE_LIBRTE_COMPRESSDEV) += librte_compressdev +DEPDIRS-librte_compressdev := librte_eal librte_mempool librte_ring librte_mbuf +DEPDIRS-librte_compressdev += librte_kvargs +DIRS-$(CONFIG_RTE_LIBRTE_EVENTDEV) += librte_eventdev +DEPDIRS-librte_eventdev := librte_eal librte_ring librte_ethdev librte_hash \ + librte_mempool librte_timer librte_cryptodev +DIRS-$(CONFIG_RTE_LIBRTE_RAWDEV) += librte_rawdev +DEPDIRS-librte_rawdev := librte_eal librte_ethdev +DIRS-$(CONFIG_RTE_LIBRTE_VHOST) += librte_vhost +DEPDIRS-librte_vhost := librte_eal librte_mempool librte_mbuf librte_ethdev \ + librte_net +DIRS-$(CONFIG_RTE_LIBRTE_HASH) += librte_hash +DEPDIRS-librte_hash := librte_eal librte_ring +DIRS-$(CONFIG_RTE_LIBRTE_EFD) += librte_efd +DEPDIRS-librte_efd := librte_eal librte_ring 
librte_hash +DIRS-$(CONFIG_RTE_LIBRTE_LPM) += librte_lpm +DEPDIRS-librte_lpm := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_ACL) += librte_acl +DEPDIRS-librte_acl := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_MEMBER) += librte_member +DEPDIRS-librte_member := librte_eal librte_hash +DIRS-$(CONFIG_RTE_LIBRTE_NET) += librte_net +DEPDIRS-librte_net := librte_mbuf librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += librte_ip_frag +DEPDIRS-librte_ip_frag := librte_eal librte_mempool librte_mbuf librte_ethdev +DEPDIRS-librte_ip_frag += librte_hash +DIRS-$(CONFIG_RTE_LIBRTE_GRO) += librte_gro +DEPDIRS-librte_gro := librte_eal librte_mbuf librte_ethdev librte_net +DIRS-$(CONFIG_RTE_LIBRTE_JOBSTATS) += librte_jobstats +DEPDIRS-librte_jobstats := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_METRICS) += librte_metrics +DEPDIRS-librte_metrics := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_BITRATE) += librte_bitratestats +DEPDIRS-librte_bitratestats := librte_eal librte_metrics librte_ethdev +DIRS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS) += librte_latencystats +DEPDIRS-librte_latencystats := librte_eal librte_metrics librte_ethdev librte_mbuf +DIRS-$(CONFIG_RTE_LIBRTE_POWER) += librte_power +DEPDIRS-librte_power := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_METER) += librte_meter +DEPDIRS-librte_meter := librte_eal +DIRS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += librte_flow_classify +DEPDIRS-librte_flow_classify := librte_net librte_table librte_acl +DIRS-$(CONFIG_RTE_LIBRTE_SCHED) += librte_sched +DEPDIRS-librte_sched := librte_eal librte_mempool librte_mbuf librte_net +DEPDIRS-librte_sched += librte_timer +DIRS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += librte_distributor +DEPDIRS-librte_distributor := librte_eal librte_mbuf librte_ethdev +DIRS-$(CONFIG_RTE_LIBRTE_PORT) += librte_port +DEPDIRS-librte_port := librte_eal librte_mempool librte_mbuf librte_ethdev +DEPDIRS-librte_port += librte_ip_frag librte_sched +ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) +DEPDIRS-librte_port += librte_kni +endif +DIRS-$(CONFIG_RTE_LIBRTE_TABLE) += librte_table +DEPDIRS-librte_table := librte_eal librte_mempool librte_mbuf +DEPDIRS-librte_table += librte_port librte_lpm librte_hash +ifeq ($(CONFIG_RTE_LIBRTE_ACL),y) +DEPDIRS-librte_table += librte_acl +endif +DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += librte_pipeline +DEPDIRS-librte_pipeline := librte_eal librte_mempool librte_mbuf +DEPDIRS-librte_pipeline += librte_table librte_port +DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder +DEPDIRS-librte_reorder := librte_eal librte_mempool librte_mbuf +DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += librte_pdump +DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ethdev +DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso +DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ethdev librte_net +DEPDIRS-librte_gso += librte_mempool +DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf +DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ethdev + +ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) +DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni +endif +DEPDIRS-librte_kni := librte_eal librte_mempool librte_mbuf librte_ethdev +DEPDIRS-librte_kni += librte_pci + +include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/src/spdk/dpdk/lib/librte_acl/Makefile b/src/spdk/dpdk/lib/librte_acl/Makefile new file mode 100644 index 00000000..ea5edf00 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/Makefile @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_acl.a + +CFLAGS += -O3 +CFLAGS += 
$(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_acl_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += tb_mem.c + +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c + +ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),) +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c +CFLAGS_acl_run_neon.o += -flax-vector-conversions +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_acl_run_neon.o += -Wno-maybe-uninitialized +endif +else ifeq ($(CONFIG_RTE_ARCH_PPC_64),y) +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_altivec.c +else +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c +endif + +# +# If the compiler supports AVX2 instructions, +# then add support for AVX2 classify method. +# + +#check if flag for AVX2 is already on, if not set it up manually +ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2) + CC_AVX2_SUPPORT=1 +else + CC_AVX2_SUPPORT=\ + $(shell $(CC) -march=core-avx2 -dM -E - &1 | \ + grep -q AVX2 && echo 1) + ifeq ($(CC_AVX2_SUPPORT), 1) + ifeq ($(CONFIG_RTE_TOOLCHAIN_ICC),y) + CFLAGS_acl_run_avx2.o += -march=core-avx2 + else + CFLAGS_acl_run_avx2.o += -mavx2 + endif + endif +endif + +ifeq ($(CC_AVX2_SUPPORT), 1) + SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_avx2.c + CFLAGS_rte_acl.o += -DCC_AVX2_SUPPORT +endif + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include := rte_acl_osdep.h +SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include += rte_acl.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_acl/acl.h b/src/spdk/dpdk/lib/librte_acl/acl.h new file mode 100644 index 00000000..39d45a0c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _ACL_H_ +#define _ACL_H_ + +#ifdef __cplusplus +extern"C" { +#endif /* __cplusplus */ + +#define RTE_ACL_QUAD_MAX 5 +#define RTE_ACL_QUAD_SIZE 4 +#define RTE_ACL_QUAD_SINGLE UINT64_C(0x7f7f7f7f00000000) + +#define RTE_ACL_SINGLE_TRIE_SIZE 2000 + +#define RTE_ACL_DFA_MAX UINT8_MAX +#define RTE_ACL_DFA_SIZE (UINT8_MAX + 1) + +#define RTE_ACL_DFA_GR64_SIZE 64 +#define RTE_ACL_DFA_GR64_NUM (RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE) +#define RTE_ACL_DFA_GR64_BIT \ + (CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM) + +typedef int bits_t; + +#define RTE_ACL_BIT_SET_SIZE ((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT)) + +struct rte_acl_bitset { + bits_t bits[RTE_ACL_BIT_SET_SIZE]; +}; + +#define RTE_ACL_NODE_DFA (0 << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_SINGLE (1U << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_QRANGE (3U << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_MATCH (4U << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_TYPE (7U << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_UNDEFINED UINT32_MAX + +/* + * ACL RT structure is a set of multibit tries (with stride == 8) + * represented by an array of transitions. The next position is calculated + * based on the current position and the input byte. + * Each transition is 64 bit value with the following format: + * | node_type_specific : 32 | node_type : 3 | node_addr : 29 | + * For all node types except RTE_ACL_NODE_MATCH, node_addr is an index + * to the start of the node in the transtions array. 
+ * A few different node types are used: + * RTE_ACL_NODE_MATCH: + * node_addr value is an index into an array that contains the return value + * and its priority for each category. + * Upper 32 bits of the transition value are not used for that node type. + * RTE_ACL_NODE_QRANGE: + * that node consists of up to 5 transitions. + * Upper 32 bits are interpreted as 4 signed character values which + * are ordered from smallest (INT8_MIN) to largest (INT8_MAX). + * These values define 5 ranges: + * INT8_MIN <= range[0] <= ((int8_t *)&transition)[4] + * ((int8_t *)&transition)[4] < range[1] <= ((int8_t *)&transition)[5] + * ((int8_t *)&transition)[5] < range[2] <= ((int8_t *)&transition)[6] + * ((int8_t *)&transition)[6] < range[3] <= ((int8_t *)&transition)[7] + * ((int8_t *)&transition)[7] < range[4] <= INT8_MAX + * So for an input byte value within range[i], the i-th transition within that node + * will be used. + * RTE_ACL_NODE_SINGLE: + * always transitions to the same node regardless of the input value. + * RTE_ACL_NODE_DFA: + * that node consists of up to 256 transitions. + * In an attempt to conserve space, all transitions are divided into 4 consecutive + * groups of 64 transitions each: + * group64[i] contains transitions[i * 64, .. i * 64 + 63]. + * Upper 32 bits are interpreted as 4 unsigned character values, one per group, + * which contain the index of the start of the given group within the node. + * So to calculate the transition index within the node for a given input byte value: + * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64]. + */ + +/* + * Structure of a node is a set of ptrs and each ptr has a bit map + * of values associated with this transition. + */ +struct rte_acl_ptr_set { + struct rte_acl_bitset values; /* input values associated with ptr */ + struct rte_acl_node *ptr; /* transition to next node */ +}; + +struct rte_acl_classifier_results { + int results[RTE_ACL_MAX_CATEGORIES]; +}; + +struct rte_acl_match_results { + uint32_t results[RTE_ACL_MAX_CATEGORIES]; + int32_t priority[RTE_ACL_MAX_CATEGORIES]; +}; + +struct rte_acl_node { + uint64_t node_index; /* index for this node */ + uint32_t level; /* level 0-n in the trie */ + uint32_t ref_count; /* ref count for this node */ + struct rte_acl_bitset values; + /* set of all values that map to another node + * (union of bits in each transition).
+ */ + uint32_t num_ptrs; /* number of ptr_set in use */ + uint32_t max_ptrs; /* number of allocated ptr_set */ + uint32_t min_add; /* number of ptr_set per allocation */ + struct rte_acl_ptr_set *ptrs; /* transitions array for this node */ + int32_t match_flag; + int32_t match_index; /* index to match data */ + uint32_t node_type; + int32_t fanout; + /* number of ranges (transitions w/ consecutive bits) */ + int32_t id; + struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */ + union { + char transitions[RTE_ACL_QUAD_SIZE]; + /* boundaries for ranged node */ + uint8_t dfa_gr64[RTE_ACL_DFA_GR64_NUM]; + }; + struct rte_acl_node *next; + /* free list link or pointer to duplicate node during merge */ + struct rte_acl_node *prev; + /* points to node from which this node was duplicated */ +}; + +/* + * Types of tries used to generate runtime structure(s) + */ +enum { + RTE_ACL_FULL_TRIE = 0, + RTE_ACL_NOSRC_TRIE = 1, + RTE_ACL_NODST_TRIE = 2, + RTE_ACL_NOPORTS_TRIE = 4, + RTE_ACL_NOVLAN_TRIE = 8, + RTE_ACL_UNUSED_TRIE = 0x80000000 +}; + + +/** MAX number of tries per one ACL context.*/ +#define RTE_ACL_MAX_TRIES 8 + +/** Max number of characters in PM name.*/ +#define RTE_ACL_NAMESIZE 32 + + +struct rte_acl_trie { + uint32_t type; + uint32_t count; + uint32_t root_index; + const uint32_t *data_index; + uint32_t num_data_indexes; +}; + +struct rte_acl_bld_trie { + struct rte_acl_node *trie; +}; + +struct rte_acl_ctx { + char name[RTE_ACL_NAMESIZE]; + /** Name of the ACL context. */ + int32_t socket_id; + /** Socket ID to allocate memory from. */ + enum rte_acl_classify_alg alg; + void *rules; + uint32_t max_rules; + uint32_t rule_sz; + uint32_t num_rules; + uint32_t num_categories; + uint32_t num_tries; + uint32_t match_index; + uint64_t no_match; + uint64_t idle; + uint64_t *trans_table; + uint32_t *data_indexes; + struct rte_acl_trie trie[RTE_ACL_MAX_TRIES]; + void *mem; + size_t mem_sz; + struct rte_acl_config config; /* copy of build config. */ +}; + +int rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie, + struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries, + uint32_t num_categories, uint32_t data_index_sz, size_t max_size); + +typedef int (*rte_acl_classify_t) +(const struct rte_acl_ctx *, const uint8_t **, uint32_t *, uint32_t, uint32_t); + +/* + * Different implementations of ACL classify. 
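/*
 * Illustrative sketch (not part of the upstream patch): how a lookup walks
 * the transition array described at the top of this header. All 'ex_' names
 * are hypothetical; the field widths follow the layout comment above
 * (| node_type_specific : 32 | node_type : 3 | node_addr : 29 |) and the
 * byte-indexed access mirrors the formulas given in that comment.
 */
#include <stdint.h>

#define EX_ADDR_BITS	29
#define EX_ADDR_MASK	((UINT64_C(1) << EX_ADDR_BITS) - 1)

/* Node base index and node type, taken from the low 32 bits. */
static inline uint32_t ex_node_addr(uint64_t transition)
{
	return (uint32_t)(transition & EX_ADDR_MASK);
}

static inline uint32_t ex_node_type(uint64_t transition)
{
	return (uint32_t)(transition >> EX_ADDR_BITS) & 0x7;
}

/* DFA node: offset of the transition selected by the input byte. */
static inline uint32_t ex_dfa_offset(uint64_t transition, uint8_t input)
{
	const uint8_t *grp = (const uint8_t *)&transition;

	/* bytes 4..7 hold the start index of each 64-value group */
	return input - grp[4 + input / 64];
}

/* QRANGE node: which of the 5 ranges the input byte falls into. */
static inline uint32_t ex_quad_offset(uint64_t transition, uint8_t input)
{
	const int8_t *q = (const int8_t *)&transition;
	uint32_t i;

	/* bytes 4..7 hold the (signed, ascending) range boundaries */
	for (i = 0; i != 4 && (int8_t)input > q[4 + i]; i++)
		;
	return i;
}

/* Per the comment above, the next transition for a DFA node would then be
 * trans_table[ex_node_addr(t) + ex_dfa_offset(t, input)]. */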
+ */ +int +rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +int +rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +int +rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +int +rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +int +rte_acl_classify_altivec(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* _ACL_H_ */ diff --git a/src/spdk/dpdk/lib/librte_acl/acl_bld.c b/src/spdk/dpdk/lib/librte_acl/acl_bld.c new file mode 100644 index 00000000..b82191f4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_bld.c @@ -0,0 +1,1569 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include "tb_mem.h" +#include "acl.h" + +#define ACL_POOL_ALIGN 8 +#define ACL_POOL_ALLOC_MIN 0x800000 + +/* number of pointers per alloc */ +#define ACL_PTR_ALLOC 32 + +/* macros for dividing rule sets heuristics */ +#define NODE_MAX 0x4000 +#define NODE_MIN 0x800 + +/* TALLY are statistics per field */ +enum { + TALLY_0 = 0, /* number of rules that are 0% or more wild. */ + TALLY_25, /* number of rules that are 25% or more wild. */ + TALLY_50, + TALLY_75, + TALLY_100, + TALLY_DEACTIVATED, /* deactivated fields (100% wild in all rules). */ + TALLY_DEPTH, + /* number of rules that are 100% wild for this field and higher. */ + TALLY_NUM +}; + +static const uint32_t wild_limits[TALLY_DEACTIVATED] = {0, 25, 50, 75, 100}; + +enum { + ACL_INTERSECT_NONE = 0, + ACL_INTERSECT_A = 1, /* set A is a superset of A and B intersect */ + ACL_INTERSECT_B = 2, /* set B is a superset of A and B intersect */ + ACL_INTERSECT = 4, /* sets A and B intersect */ +}; + +enum { + ACL_PRIORITY_EQUAL = 0, + ACL_PRIORITY_NODE_A = 1, + ACL_PRIORITY_NODE_B = 2, + ACL_PRIORITY_MIXED = 3 +}; + + +struct acl_mem_block { + uint32_t block_size; + void *mem_ptr; +}; + +#define MEM_BLOCK_NUM 16 + +/* Single ACL rule, build representation.*/ +struct rte_acl_build_rule { + struct rte_acl_build_rule *next; + struct rte_acl_config *config; + /**< configuration for each field in the rule. 
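/*
 * Illustrative sketch (not part of the upstream patch): invoking one of the
 * classify entry points declared in acl.h above. A successfully built
 * context 'ctx' and search keys laid out according to its rte_acl_config
 * are assumed; 'ex_' names are hypothetical and error handling is elided.
 */
#include <stdint.h>

#define EX_BURST	4
#define EX_CATEGORIES	1

static void
ex_classify_burst(const struct rte_acl_ctx *ctx, const uint8_t *keys[EX_BURST])
{
	uint32_t results[EX_BURST * EX_CATEGORIES];

	/* one result per (key, category); a zero result means no rule matched,
	 * otherwise it is the userdata stored with the matching rule */
	if (rte_acl_classify_scalar(ctx, keys, results, EX_BURST,
			EX_CATEGORIES) == 0) {
		/* consume results[] ... */
	}
}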
*/ + const struct rte_acl_rule *f; + uint32_t *wildness; +}; + +/* Context for build phase */ +struct acl_build_context { + const struct rte_acl_ctx *acx; + struct rte_acl_build_rule *build_rules; + struct rte_acl_config cfg; + int32_t node_max; + int32_t cur_node_max; + uint32_t node; + uint32_t num_nodes; + uint32_t category_mask; + uint32_t num_rules; + uint32_t node_id; + uint32_t src_mask; + uint32_t num_build_rules; + uint32_t num_tries; + struct tb_mem_pool pool; + struct rte_acl_trie tries[RTE_ACL_MAX_TRIES]; + struct rte_acl_bld_trie bld_tries[RTE_ACL_MAX_TRIES]; + uint32_t data_indexes[RTE_ACL_MAX_TRIES][RTE_ACL_MAX_FIELDS]; + + /* memory free lists for nodes and blocks used for node ptrs */ + struct acl_mem_block blocks[MEM_BLOCK_NUM]; + struct rte_acl_node *node_free_list; +}; + +static int acl_merge_trie(struct acl_build_context *context, + struct rte_acl_node *node_a, struct rte_acl_node *node_b, + uint32_t level, struct rte_acl_node **node_c); + +static void +acl_deref_ptr(struct acl_build_context *context, + struct rte_acl_node *node, int index); + +static void * +acl_build_alloc(struct acl_build_context *context, size_t n, size_t s) +{ + uint32_t m; + void *p; + size_t alloc_size = n * s; + + /* + * look for memory in free lists + */ + for (m = 0; m < RTE_DIM(context->blocks); m++) { + if (context->blocks[m].block_size == + alloc_size && context->blocks[m].mem_ptr != NULL) { + p = context->blocks[m].mem_ptr; + context->blocks[m].mem_ptr = *((void **)p); + memset(p, 0, alloc_size); + return p; + } + } + + /* + * return allocation from memory pool + */ + p = tb_alloc(&context->pool, alloc_size); + return p; +} + +/* + * Free memory blocks (kept in context for reuse). + */ +static void +acl_build_free(struct acl_build_context *context, size_t s, void *p) +{ + uint32_t n; + + for (n = 0; n < RTE_DIM(context->blocks); n++) { + if (context->blocks[n].block_size == s) { + *((void **)p) = context->blocks[n].mem_ptr; + context->blocks[n].mem_ptr = p; + return; + } + } + for (n = 0; n < RTE_DIM(context->blocks); n++) { + if (context->blocks[n].block_size == 0) { + context->blocks[n].block_size = s; + *((void **)p) = NULL; + context->blocks[n].mem_ptr = p; + return; + } + } +} + +/* + * Allocate and initialize a new node. 
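/*
 * Illustrative sketch (not part of the upstream patch): the size-bucketed
 * free-list scheme used by acl_build_alloc()/acl_build_free() above, in a
 * simplified standalone form. A freed block stores the list's next pointer
 * in its own first word, so blocks must be at least sizeof(void *) bytes.
 * 'ex_' names are hypothetical; calloc() stands in for the tb_mem pool, and
 * the single-pass bucket search below is a simplification of the original's
 * two passes (first a matching bucket, then an empty one).
 */
#include <stdlib.h>
#include <string.h>

#define EX_BLOCK_NUM 16

static struct { size_t size; void *head; } ex_blocks[EX_BLOCK_NUM];

static void *
ex_build_alloc(size_t sz)
{
	unsigned int i;
	void *p;

	for (i = 0; i < EX_BLOCK_NUM; i++) {
		if (ex_blocks[i].size == sz && ex_blocks[i].head != NULL) {
			p = ex_blocks[i].head;
			ex_blocks[i].head = *(void **)p; /* unlink */
			memset(p, 0, sz);
			return p;
		}
	}
	return calloc(1, sz);
}

static void
ex_build_free(size_t sz, void *p)
{
	unsigned int i;

	/* park the block on a bucket of matching (or unused) size; if no
	 * bucket is available the block is simply not recorded, as in the
	 * original code */
	for (i = 0; i < EX_BLOCK_NUM; i++) {
		if (ex_blocks[i].size == sz || ex_blocks[i].size == 0) {
			ex_blocks[i].size = sz;
			*(void **)p = ex_blocks[i].head; /* link */
			ex_blocks[i].head = p;
			return;
		}
	}
}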
+ */ +static struct rte_acl_node * +acl_alloc_node(struct acl_build_context *context, int level) +{ + struct rte_acl_node *node; + + if (context->node_free_list != NULL) { + node = context->node_free_list; + context->node_free_list = node->next; + memset(node, 0, sizeof(struct rte_acl_node)); + } else { + node = acl_build_alloc(context, sizeof(struct rte_acl_node), 1); + } + + if (node != NULL) { + node->num_ptrs = 0; + node->level = level; + node->node_type = RTE_ACL_NODE_UNDEFINED; + node->node_index = RTE_ACL_NODE_UNDEFINED; + context->num_nodes++; + node->id = context->node_id++; + } + return node; +} + +/* + * Dereference all nodes to which this node points + */ +static void +acl_free_node(struct acl_build_context *context, + struct rte_acl_node *node) +{ + uint32_t n; + + if (node->prev != NULL) + node->prev->next = NULL; + for (n = 0; n < node->num_ptrs; n++) + acl_deref_ptr(context, node, n); + + /* free mrt if this is a match node */ + if (node->mrt != NULL) { + acl_build_free(context, sizeof(struct rte_acl_match_results), + node->mrt); + node->mrt = NULL; + } + + /* free transitions to other nodes */ + if (node->ptrs != NULL) { + acl_build_free(context, + node->max_ptrs * sizeof(struct rte_acl_ptr_set), + node->ptrs); + node->ptrs = NULL; + } + + /* put it on the free list */ + context->num_nodes--; + node->next = context->node_free_list; + context->node_free_list = node; +} + + +/* + * Include src bitset in dst bitset + */ +static void +acl_include(struct rte_acl_bitset *dst, struct rte_acl_bitset *src, bits_t mask) +{ + uint32_t n; + + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) + dst->bits[n] = (dst->bits[n] & mask) | src->bits[n]; +} + +/* + * Set dst to bits of src1 that are not in src2 + */ +static int +acl_exclude(struct rte_acl_bitset *dst, + struct rte_acl_bitset *src1, + struct rte_acl_bitset *src2) +{ + uint32_t n; + bits_t all_bits = 0; + + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) { + dst->bits[n] = src1->bits[n] & ~src2->bits[n]; + all_bits |= dst->bits[n]; + } + return all_bits != 0; +} + +/* + * Add a pointer (ptr) to a node. 
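/*
 * Illustrative sketch (not part of the upstream patch): the per-byte-value
 * bitsets manipulated by acl_include()/acl_exclude() above keep one bit for
 * each of the 256 possible input byte values, spread over an array of
 * machine words (see RTE_ACL_BIT_SET_SIZE in acl.h). Minimal standalone
 * set/test helpers with hypothetical 'ex_' names; an unsigned word type is
 * used here so the shifts stay well defined.
 */
#include <limits.h>
#include <stdint.h>

typedef uint32_t ex_bits_t;

#define EX_WORD_BITS	(sizeof(ex_bits_t) * CHAR_BIT)
#define EX_SET_WORDS	((UINT8_MAX + 1) / EX_WORD_BITS)

struct ex_bitset {
	ex_bits_t bits[EX_SET_WORDS];
};

static inline void
ex_bitset_add(struct ex_bitset *bs, uint8_t v)
{
	bs->bits[v / EX_WORD_BITS] |= (ex_bits_t)1 << (v % EX_WORD_BITS);
}

static inline int
ex_bitset_test(const struct ex_bitset *bs, uint8_t v)
{
	return (bs->bits[v / EX_WORD_BITS] >> (v % EX_WORD_BITS)) & 1;
}

/* e.g. acl_add_ptr_range(ctx, root, node, 10, 20) below effectively performs
 * ex_bitset_add() for every value in [10, 20] before adding the pointer. */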
+ */ +static int +acl_add_ptr(struct acl_build_context *context, + struct rte_acl_node *node, + struct rte_acl_node *ptr, + struct rte_acl_bitset *bits) +{ + uint32_t n, num_ptrs; + struct rte_acl_ptr_set *ptrs = NULL; + + /* + * If there's already a pointer to the same node, just add to the bitset + */ + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) { + if (node->ptrs[n].ptr == ptr) { + acl_include(&node->ptrs[n].values, bits, -1); + acl_include(&node->values, bits, -1); + return 0; + } + } + } + + /* if there's no room for another pointer, make room */ + if (node->num_ptrs >= node->max_ptrs) { + /* add room for more pointers */ + num_ptrs = node->max_ptrs + ACL_PTR_ALLOC; + ptrs = acl_build_alloc(context, num_ptrs, sizeof(*ptrs)); + + /* copy current points to new memory allocation */ + if (node->ptrs != NULL) { + memcpy(ptrs, node->ptrs, + node->num_ptrs * sizeof(*ptrs)); + acl_build_free(context, node->max_ptrs * sizeof(*ptrs), + node->ptrs); + } + node->ptrs = ptrs; + node->max_ptrs = num_ptrs; + } + + /* Find available ptr and add a new pointer to this node */ + for (n = node->min_add; n < node->max_ptrs; n++) { + if (node->ptrs[n].ptr == NULL) { + node->ptrs[n].ptr = ptr; + acl_include(&node->ptrs[n].values, bits, 0); + acl_include(&node->values, bits, -1); + if (ptr != NULL) + ptr->ref_count++; + if (node->num_ptrs <= n) + node->num_ptrs = n + 1; + return 0; + } + } + + return 0; +} + +/* + * Add a pointer for a range of values + */ +static int +acl_add_ptr_range(struct acl_build_context *context, + struct rte_acl_node *root, + struct rte_acl_node *node, + uint8_t low, + uint8_t high) +{ + uint32_t n; + struct rte_acl_bitset bitset; + + /* clear the bitset values */ + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) + bitset.bits[n] = 0; + + /* for each bit in range, add bit to set */ + for (n = 0; n < UINT8_MAX + 1; n++) + if (n >= low && n <= high) + bitset.bits[n / (sizeof(bits_t) * 8)] |= + 1 << (n % (sizeof(bits_t) * 8)); + + return acl_add_ptr(context, root, node, &bitset); +} + +/* + * Generate a bitset from a byte value and mask. + */ +static int +acl_gen_mask(struct rte_acl_bitset *bitset, uint32_t value, uint32_t mask) +{ + int range = 0; + uint32_t n; + + /* clear the bitset values */ + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) + bitset->bits[n] = 0; + + /* for each bit in value/mask, add bit to set */ + for (n = 0; n < UINT8_MAX + 1; n++) { + if ((n & mask) == value) { + range++; + bitset->bits[n / (sizeof(bits_t) * 8)] |= + 1 << (n % (sizeof(bits_t) * 8)); + } + } + return range; +} + +/* + * Determine how A and B intersect. + * Determine if A and/or B are supersets of the intersection. + */ +static int +acl_intersect_type(const struct rte_acl_bitset *a_bits, + const struct rte_acl_bitset *b_bits, + struct rte_acl_bitset *intersect) +{ + uint32_t n; + bits_t intersect_bits = 0; + bits_t a_superset = 0; + bits_t b_superset = 0; + + /* + * calculate and store intersection and check if A and/or B have + * bits outside the intersection (superset) + */ + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) { + intersect->bits[n] = a_bits->bits[n] & b_bits->bits[n]; + a_superset |= a_bits->bits[n] ^ intersect->bits[n]; + b_superset |= b_bits->bits[n] ^ intersect->bits[n]; + intersect_bits |= intersect->bits[n]; + } + + n = (intersect_bits == 0 ? ACL_INTERSECT_NONE : ACL_INTERSECT) | + (b_superset == 0 ? 0 : ACL_INTERSECT_B) | + (a_superset == 0 ? 
0 : ACL_INTERSECT_A); + + return n; +} + +/* + * Duplicate a node + */ +static struct rte_acl_node * +acl_dup_node(struct acl_build_context *context, struct rte_acl_node *node) +{ + uint32_t n; + struct rte_acl_node *next; + + next = acl_alloc_node(context, node->level); + + /* allocate the pointers */ + if (node->num_ptrs > 0) { + next->ptrs = acl_build_alloc(context, + node->max_ptrs, + sizeof(struct rte_acl_ptr_set)); + next->max_ptrs = node->max_ptrs; + } + + /* copy over the pointers */ + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) { + next->ptrs[n].ptr = node->ptrs[n].ptr; + next->ptrs[n].ptr->ref_count++; + acl_include(&next->ptrs[n].values, + &node->ptrs[n].values, -1); + } + } + + next->num_ptrs = node->num_ptrs; + + /* copy over node's match results */ + if (node->match_flag == 0) + next->match_flag = 0; + else { + next->match_flag = -1; + next->mrt = acl_build_alloc(context, 1, sizeof(*next->mrt)); + memcpy(next->mrt, node->mrt, sizeof(*next->mrt)); + } + + /* copy over node's bitset */ + acl_include(&next->values, &node->values, -1); + + node->next = next; + next->prev = node; + + return next; +} + +/* + * Dereference a pointer from a node + */ +static void +acl_deref_ptr(struct acl_build_context *context, + struct rte_acl_node *node, int index) +{ + struct rte_acl_node *ref_node; + + /* De-reference the node at the specified pointer */ + if (node != NULL && node->ptrs[index].ptr != NULL) { + ref_node = node->ptrs[index].ptr; + ref_node->ref_count--; + if (ref_node->ref_count == 0) + acl_free_node(context, ref_node); + } +} + +/* + * acl_exclude rte_acl_bitset from src and copy remaining pointer to dst + */ +static int +acl_copy_ptr(struct acl_build_context *context, + struct rte_acl_node *dst, + struct rte_acl_node *src, + int index, + struct rte_acl_bitset *b_bits) +{ + int rc; + struct rte_acl_bitset bits; + + if (b_bits != NULL) + if (!acl_exclude(&bits, &src->ptrs[index].values, b_bits)) + return 0; + + rc = acl_add_ptr(context, dst, src->ptrs[index].ptr, &bits); + if (rc < 0) + return rc; + return 1; +} + +/* + * Fill in gaps in ptrs list with the ptr at the end of the list + */ +static void +acl_compact_node_ptrs(struct rte_acl_node *node_a) +{ + uint32_t n; + int min_add = node_a->min_add; + + while (node_a->num_ptrs > 0 && + node_a->ptrs[node_a->num_ptrs - 1].ptr == NULL) + node_a->num_ptrs--; + + for (n = min_add; n + 1 < node_a->num_ptrs; n++) { + + /* if this entry is empty */ + if (node_a->ptrs[n].ptr == NULL) { + + /* move the last pointer to this entry */ + acl_include(&node_a->ptrs[n].values, + &node_a->ptrs[node_a->num_ptrs - 1].values, + 0); + node_a->ptrs[n].ptr = + node_a->ptrs[node_a->num_ptrs - 1].ptr; + + /* + * mark the end as empty and adjust the number + * of used pointer enum_tries + */ + node_a->ptrs[node_a->num_ptrs - 1].ptr = NULL; + while (node_a->num_ptrs > 0 && + node_a->ptrs[node_a->num_ptrs - 1].ptr == NULL) + node_a->num_ptrs--; + } + } +} + +static int +acl_resolve_leaf(struct acl_build_context *context, + struct rte_acl_node *node_a, + struct rte_acl_node *node_b, + struct rte_acl_node **node_c) +{ + uint32_t n; + int combined_priority = ACL_PRIORITY_EQUAL; + + for (n = 0; n < context->cfg.num_categories; n++) { + if (node_a->mrt->priority[n] != node_b->mrt->priority[n]) { + combined_priority |= (node_a->mrt->priority[n] > + node_b->mrt->priority[n]) ? + ACL_PRIORITY_NODE_A : ACL_PRIORITY_NODE_B; + } + } + + /* + * if node a is higher or equal priority for all categories, + * then return node_a. 
+ */ + if (combined_priority == ACL_PRIORITY_NODE_A || + combined_priority == ACL_PRIORITY_EQUAL) { + *node_c = node_a; + return 0; + } + + /* + * if node b is higher or equal priority for all categories, + * then return node_b. + */ + if (combined_priority == ACL_PRIORITY_NODE_B) { + *node_c = node_b; + return 0; + } + + /* + * mixed priorities - create a new node with the highest priority + * for each category. + */ + + /* force new duplication. */ + node_a->next = NULL; + + *node_c = acl_dup_node(context, node_a); + for (n = 0; n < context->cfg.num_categories; n++) { + if ((*node_c)->mrt->priority[n] < node_b->mrt->priority[n]) { + (*node_c)->mrt->priority[n] = node_b->mrt->priority[n]; + (*node_c)->mrt->results[n] = node_b->mrt->results[n]; + } + } + return 0; +} + +/* + * Merge nodes A and B together, + * returns a node that is the path for the intersection + * + * If match node (leaf on trie) + * For each category + * return node = highest priority result + * + * Create C as a duplicate of A to point to child intersections + * If any pointers in C intersect with any in B + * For each intersection + * merge children + * remove intersection from C pointer + * add a pointer from C to child intersection node + * Compact the pointers in A and B + * Copy any B pointers that are outside of the intersection to C + * If C has no references to the B trie + * free C and return A + * Else If C has no references to the A trie + * free C and return B + * Else + * return C + */ +static int +acl_merge_trie(struct acl_build_context *context, + struct rte_acl_node *node_a, struct rte_acl_node *node_b, + uint32_t level, struct rte_acl_node **return_c) +{ + uint32_t n, m, ptrs_c, ptrs_b; + uint32_t min_add_c, min_add_b; + int node_intersect_type; + struct rte_acl_bitset node_intersect; + struct rte_acl_node *node_c; + struct rte_acl_node *node_a_next; + int node_b_refs; + int node_a_refs; + + node_c = node_a; + node_a_next = node_a->next; + min_add_c = 0; + min_add_b = 0; + node_a_refs = node_a->num_ptrs; + node_b_refs = 0; + node_intersect_type = 0; + + /* Resolve leaf nodes (matches) */ + if (node_a->match_flag != 0) { + acl_resolve_leaf(context, node_a, node_b, return_c); + return 0; + } + + /* + * Create node C as a copy of node A, and do: C = merge(A,B); + * If node A can be used instead (A==C), then later we'll + * destroy C and return A. + */ + if (level > 0) + node_c = acl_dup_node(context, node_a); + + /* + * If the two node transitions intersect then merge the transitions. 
+ * Check intersection for entire node (all pointers) + */ + node_intersect_type = acl_intersect_type(&node_c->values, + &node_b->values, + &node_intersect); + + if (node_intersect_type & ACL_INTERSECT) { + + min_add_b = node_b->min_add; + node_b->min_add = node_b->num_ptrs; + ptrs_b = node_b->num_ptrs; + + min_add_c = node_c->min_add; + node_c->min_add = node_c->num_ptrs; + ptrs_c = node_c->num_ptrs; + + for (n = 0; n < ptrs_c; n++) { + if (node_c->ptrs[n].ptr == NULL) { + node_a_refs--; + continue; + } + node_c->ptrs[n].ptr->next = NULL; + for (m = 0; m < ptrs_b; m++) { + + struct rte_acl_bitset child_intersect; + int child_intersect_type; + struct rte_acl_node *child_node_c = NULL; + + if (node_b->ptrs[m].ptr == NULL || + node_c->ptrs[n].ptr == + node_b->ptrs[m].ptr) + continue; + + child_intersect_type = acl_intersect_type( + &node_c->ptrs[n].values, + &node_b->ptrs[m].values, + &child_intersect); + + if ((child_intersect_type & ACL_INTERSECT) != + 0) { + if (acl_merge_trie(context, + node_c->ptrs[n].ptr, + node_b->ptrs[m].ptr, + level + 1, + &child_node_c)) + return 1; + + if (child_node_c != NULL && + child_node_c != + node_c->ptrs[n].ptr) { + + node_b_refs++; + + /* + * Added link from C to + * child_C for all transitions + * in the intersection. + */ + acl_add_ptr(context, node_c, + child_node_c, + &child_intersect); + + /* + * inc refs if pointer is not + * to node b. + */ + node_a_refs += (child_node_c != + node_b->ptrs[m].ptr); + + /* + * Remove intersection from C + * pointer. + */ + if (!acl_exclude( + &node_c->ptrs[n].values, + &node_c->ptrs[n].values, + &child_intersect)) { + acl_deref_ptr(context, + node_c, n); + node_c->ptrs[n].ptr = + NULL; + node_a_refs--; + } + } + } + } + } + + /* Compact pointers */ + node_c->min_add = min_add_c; + acl_compact_node_ptrs(node_c); + node_b->min_add = min_add_b; + acl_compact_node_ptrs(node_b); + } + + /* + * Copy pointers outside of the intersection from B to C + */ + if ((node_intersect_type & ACL_INTERSECT_B) != 0) { + node_b_refs++; + for (m = 0; m < node_b->num_ptrs; m++) + if (node_b->ptrs[m].ptr != NULL) + acl_copy_ptr(context, node_c, + node_b, m, &node_intersect); + } + + /* + * Free node C if top of trie is contained in A or B + * if node C is a duplicate of node A && + * node C was not an existing duplicate + */ + if (node_c != node_a && node_c != node_a_next) { + + /* + * if the intersection has no references to the + * B side, then it is contained in A + */ + if (node_b_refs == 0) { + acl_free_node(context, node_c); + node_c = node_a; + } else { + /* + * if the intersection has no references to the + * A side, then it is contained in B. + */ + if (node_a_refs == 0) { + acl_free_node(context, node_c); + node_c = node_b; + } + } + } + + if (return_c != NULL) + *return_c = node_c; + + if (level == 0) + acl_free_node(context, node_b); + + return 0; +} + +/* + * Reset current runtime fields before next build: + * - free allocated RT memory. + * - reset all RT related fields to zero. 
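/*
 * Illustrative sketch (not part of the upstream patch): the partial-reset
 * idiom used by acl_build_reset() below -- zero every member from a chosen
 * field to the end of the structure with one memset(), leaving the leading
 * members (name, socket id, rule storage, ...) untouched. 'ex_' names are
 * hypothetical.
 */
#include <stddef.h>
#include <string.h>

struct ex_ctx {
	char name[32];			/* preserved across rebuilds */
	int socket_id;			/* preserved */
	unsigned int num_categories;	/* first field to be cleared */
	unsigned int num_tries;
	void *trans_table;
};

static void
ex_reset_runtime(struct ex_ctx *ctx)
{
	memset(&ctx->num_categories, 0,
		sizeof(*ctx) - offsetof(struct ex_ctx, num_categories));
}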
+ */ +static void +acl_build_reset(struct rte_acl_ctx *ctx) +{ + rte_free(ctx->mem); + memset(&ctx->num_categories, 0, + sizeof(*ctx) - offsetof(struct rte_acl_ctx, num_categories)); +} + +static void +acl_gen_range(struct acl_build_context *context, + const uint8_t *hi, const uint8_t *lo, int size, int level, + struct rte_acl_node *root, struct rte_acl_node *end) +{ + struct rte_acl_node *node, *prev; + uint32_t n; + + prev = root; + for (n = size - 1; n > 0; n--) { + node = acl_alloc_node(context, level++); + acl_add_ptr_range(context, prev, node, lo[n], hi[n]); + prev = node; + } + acl_add_ptr_range(context, prev, end, lo[0], hi[0]); +} + +static struct rte_acl_node * +acl_gen_range_trie(struct acl_build_context *context, + const void *min, const void *max, + int size, int level, struct rte_acl_node **pend) +{ + int32_t n; + struct rte_acl_node *root; + const uint8_t *lo = min; + const uint8_t *hi = max; + + *pend = acl_alloc_node(context, level+size); + root = acl_alloc_node(context, level++); + + if (lo[size - 1] == hi[size - 1]) { + acl_gen_range(context, hi, lo, size, level, root, *pend); + } else { + uint8_t limit_lo[64]; + uint8_t limit_hi[64]; + uint8_t hi_ff = UINT8_MAX; + uint8_t lo_00 = 0; + + memset(limit_lo, 0, RTE_DIM(limit_lo)); + memset(limit_hi, UINT8_MAX, RTE_DIM(limit_hi)); + + for (n = size - 2; n >= 0; n--) { + hi_ff = (uint8_t)(hi_ff & hi[n]); + lo_00 = (uint8_t)(lo_00 | lo[n]); + } + + if (hi_ff != UINT8_MAX) { + limit_lo[size - 1] = hi[size - 1]; + acl_gen_range(context, hi, limit_lo, size, level, + root, *pend); + } + + if (lo_00 != 0) { + limit_hi[size - 1] = lo[size - 1]; + acl_gen_range(context, limit_hi, lo, size, level, + root, *pend); + } + + if (hi[size - 1] - lo[size - 1] > 1 || + lo_00 == 0 || + hi_ff == UINT8_MAX) { + limit_lo[size-1] = (uint8_t)(lo[size-1] + (lo_00 != 0)); + limit_hi[size-1] = (uint8_t)(hi[size-1] - + (hi_ff != UINT8_MAX)); + acl_gen_range(context, limit_hi, limit_lo, size, + level, root, *pend); + } + } + return root; +} + +static struct rte_acl_node * +acl_gen_mask_trie(struct acl_build_context *context, + const void *value, const void *mask, + int size, int level, struct rte_acl_node **pend) +{ + int32_t n; + struct rte_acl_node *root; + struct rte_acl_node *node, *prev; + struct rte_acl_bitset bits; + const uint8_t *val = value; + const uint8_t *msk = mask; + + root = acl_alloc_node(context, level++); + prev = root; + + for (n = size - 1; n >= 0; n--) { + node = acl_alloc_node(context, level++); + acl_gen_mask(&bits, val[n] & msk[n], msk[n]); + acl_add_ptr(context, prev, node, &bits); + prev = node; + } + + *pend = prev; + return root; +} + +static struct rte_acl_node * +build_trie(struct acl_build_context *context, struct rte_acl_build_rule *head, + struct rte_acl_build_rule **last, uint32_t *count) +{ + uint32_t n, m; + int field_index, node_count; + struct rte_acl_node *trie; + struct rte_acl_build_rule *prev, *rule; + struct rte_acl_node *end, *merge, *root, *end_prev; + const struct rte_acl_field *fld; + + prev = head; + rule = head; + *last = prev; + + trie = acl_alloc_node(context, 0); + + while (rule != NULL) { + + root = acl_alloc_node(context, 0); + + root->ref_count = 1; + end = root; + + for (n = 0; n < rule->config->num_fields; n++) { + + field_index = rule->config->defs[n].field_index; + fld = rule->f->field + field_index; + end_prev = end; + + /* build a mini-trie for this field */ + switch (rule->config->defs[n].type) { + + case RTE_ACL_FIELD_TYPE_BITMASK: + merge = acl_gen_mask_trie(context, + &fld->value, + 
&fld->mask_range, + rule->config->defs[n].size, + end->level + 1, + &end); + break; + + case RTE_ACL_FIELD_TYPE_MASK: + { + /* + * set msb for the size of the field and + * all higher bits. + */ + uint64_t mask; + mask = RTE_ACL_MASKLEN_TO_BITMASK( + fld->mask_range.u32, + rule->config->defs[n].size); + + /* gen a mini-trie for this field */ + merge = acl_gen_mask_trie(context, + &fld->value, + (char *)&mask, + rule->config->defs[n].size, + end->level + 1, + &end); + } + break; + + case RTE_ACL_FIELD_TYPE_RANGE: + merge = acl_gen_range_trie(context, + &rule->f->field[field_index].value, + &rule->f->field[field_index].mask_range, + rule->config->defs[n].size, + end->level + 1, + &end); + break; + + default: + RTE_LOG(ERR, ACL, + "Error in rule[%u] type - %hhu\n", + rule->f->data.userdata, + rule->config->defs[n].type); + return NULL; + } + + /* merge this field on to the end of the rule */ + if (acl_merge_trie(context, end_prev, merge, 0, + NULL) != 0) { + return NULL; + } + } + + end->match_flag = ++context->num_build_rules; + + /* + * Setup the results for this rule. + * The result and priority of each category. + */ + if (end->mrt == NULL) + end->mrt = acl_build_alloc(context, 1, + sizeof(*end->mrt)); + + for (m = context->cfg.num_categories; 0 != m--; ) { + if (rule->f->data.category_mask & (1 << m)) { + end->mrt->results[m] = rule->f->data.userdata; + end->mrt->priority[m] = rule->f->data.priority; + } else { + end->mrt->results[m] = 0; + end->mrt->priority[m] = 0; + } + } + + node_count = context->num_nodes; + (*count)++; + + /* merge this rule into the trie */ + if (acl_merge_trie(context, trie, root, 0, NULL)) + return NULL; + + node_count = context->num_nodes - node_count; + if (node_count > context->cur_node_max) { + *last = prev; + return trie; + } + + prev = rule; + rule = rule->next; + } + + *last = NULL; + return trie; +} + +static void +acl_calc_wildness(struct rte_acl_build_rule *head, + const struct rte_acl_config *config) +{ + uint32_t n; + struct rte_acl_build_rule *rule; + + for (rule = head; rule != NULL; rule = rule->next) { + + for (n = 0; n < config->num_fields; n++) { + + double wild = 0; + uint32_t bit_len = CHAR_BIT * config->defs[n].size; + uint64_t msk_val = RTE_LEN2MASK(bit_len, + typeof(msk_val)); + double size = bit_len; + int field_index = config->defs[n].field_index; + const struct rte_acl_field *fld = rule->f->field + + field_index; + + switch (rule->config->defs[n].type) { + case RTE_ACL_FIELD_TYPE_BITMASK: + wild = (size - __builtin_popcountll( + fld->mask_range.u64 & msk_val)) / + size; + break; + + case RTE_ACL_FIELD_TYPE_MASK: + wild = (size - fld->mask_range.u32) / size; + break; + + case RTE_ACL_FIELD_TYPE_RANGE: + wild = (fld->mask_range.u64 & msk_val) - + (fld->value.u64 & msk_val); + wild = wild / msk_val; + break; + } + + rule->wildness[field_index] = (uint32_t)(wild * 100); + } + } +} + +static void +acl_rule_stats(struct rte_acl_build_rule *head, struct rte_acl_config *config) +{ + struct rte_acl_build_rule *rule; + uint32_t n, m, fields_deactivated = 0; + uint32_t start = 0, deactivate = 0; + int tally[RTE_ACL_MAX_LEVELS][TALLY_NUM]; + + memset(tally, 0, sizeof(tally)); + + for (rule = head; rule != NULL; rule = rule->next) { + + for (n = 0; n < config->num_fields; n++) { + uint32_t field_index = config->defs[n].field_index; + + tally[n][TALLY_0]++; + for (m = 1; m < RTE_DIM(wild_limits); m++) { + if (rule->wildness[field_index] >= + wild_limits[m]) + tally[n][m]++; + } + } + + for (n = config->num_fields - 1; n > 0; n--) { + uint32_t 
field_index = config->defs[n].field_index; + + if (rule->wildness[field_index] == 100) + tally[n][TALLY_DEPTH]++; + else + break; + } + } + + /* + * Look for any field that is always wild and drop it from the config + * Only deactivate if all fields for a given input loop are deactivated. + */ + for (n = 1; n < config->num_fields; n++) { + if (config->defs[n].input_index != + config->defs[n - 1].input_index) { + for (m = start; m < n; m++) + tally[m][TALLY_DEACTIVATED] = deactivate; + fields_deactivated += deactivate; + start = n; + deactivate = 1; + } + + /* if the field is not always completely wild */ + if (tally[n][TALLY_100] != tally[n][TALLY_0]) + deactivate = 0; + } + + for (m = start; m < n; m++) + tally[m][TALLY_DEACTIVATED] = deactivate; + + fields_deactivated += deactivate; + + /* remove deactivated fields */ + if (fields_deactivated) { + uint32_t k, l = 0; + + for (k = 0; k < config->num_fields; k++) { + if (tally[k][TALLY_DEACTIVATED] == 0) { + memmove(&tally[l][0], &tally[k][0], + TALLY_NUM * sizeof(tally[0][0])); + memmove(&config->defs[l++], + &config->defs[k], + sizeof(struct rte_acl_field_def)); + } + } + config->num_fields = l; + } +} + +static int +rule_cmp_wildness(struct rte_acl_build_rule *r1, struct rte_acl_build_rule *r2) +{ + uint32_t n; + + for (n = 1; n < r1->config->num_fields; n++) { + int field_index = r1->config->defs[n].field_index; + + if (r1->wildness[field_index] != r2->wildness[field_index]) + return r1->wildness[field_index] - + r2->wildness[field_index]; + } + return 0; +} + +/* + * Split the rte_acl_build_rule list into two lists. + */ +static void +rule_list_split(struct rte_acl_build_rule *source, + struct rte_acl_build_rule **list_a, + struct rte_acl_build_rule **list_b) +{ + struct rte_acl_build_rule *fast; + struct rte_acl_build_rule *slow; + + if (source == NULL || source->next == NULL) { + /* length < 2 cases */ + *list_a = source; + *list_b = NULL; + } else { + slow = source; + fast = source->next; + /* Advance 'fast' two nodes, and advance 'slow' one node */ + while (fast != NULL) { + fast = fast->next; + if (fast != NULL) { + slow = slow->next; + fast = fast->next; + } + } + /* 'slow' is before the midpoint in the list, so split it in two + at that point. */ + *list_a = source; + *list_b = slow->next; + slow->next = NULL; + } +} + +/* + * Merge two sorted lists. + */ +static struct rte_acl_build_rule * +rule_list_sorted_merge(struct rte_acl_build_rule *a, + struct rte_acl_build_rule *b) +{ + struct rte_acl_build_rule *result = NULL; + struct rte_acl_build_rule **last_next = &result; + + while (1) { + if (a == NULL) { + *last_next = b; + break; + } else if (b == NULL) { + *last_next = a; + break; + } + if (rule_cmp_wildness(a, b) >= 0) { + *last_next = a; + last_next = &a->next; + a = a->next; + } else { + *last_next = b; + last_next = &b->next; + b = b->next; + } + } + return result; +} + +/* + * Sort list of rules based on the rules wildness. + * Use recursive mergesort algorithm. 
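+ * The list is split into two halves, each half is sorted recursively and the
+ * sorted halves are then merged by rule_list_sorted_merge().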
+ */ +static struct rte_acl_build_rule * +sort_rules(struct rte_acl_build_rule *head) +{ + struct rte_acl_build_rule *a; + struct rte_acl_build_rule *b; + + /* Base case -- length 0 or 1 */ + if (head == NULL || head->next == NULL) + return head; + + /* Split head into 'a' and 'b' sublists */ + rule_list_split(head, &a, &b); + + /* Recursively sort the sublists */ + a = sort_rules(a); + b = sort_rules(b); + + /* answer = merge the two sorted lists together */ + return rule_list_sorted_merge(a, b); +} + +static uint32_t +acl_build_index(const struct rte_acl_config *config, uint32_t *data_index) +{ + uint32_t n, m; + int32_t last_header; + + m = 0; + last_header = -1; + + for (n = 0; n < config->num_fields; n++) { + if (last_header != config->defs[n].input_index) { + last_header = config->defs[n].input_index; + data_index[m++] = config->defs[n].offset; + } + } + + return m; +} + +static struct rte_acl_build_rule * +build_one_trie(struct acl_build_context *context, + struct rte_acl_build_rule *rule_sets[RTE_ACL_MAX_TRIES], + uint32_t n, int32_t node_max) +{ + struct rte_acl_build_rule *last; + struct rte_acl_config *config; + + config = rule_sets[n]->config; + + acl_rule_stats(rule_sets[n], config); + rule_sets[n] = sort_rules(rule_sets[n]); + + context->tries[n].type = RTE_ACL_FULL_TRIE; + context->tries[n].count = 0; + + context->tries[n].num_data_indexes = acl_build_index(config, + context->data_indexes[n]); + context->tries[n].data_index = context->data_indexes[n]; + + context->cur_node_max = node_max; + + context->bld_tries[n].trie = build_trie(context, rule_sets[n], + &last, &context->tries[n].count); + + return last; +} + +static int +acl_build_tries(struct acl_build_context *context, + struct rte_acl_build_rule *head) +{ + uint32_t n, num_tries; + struct rte_acl_config *config; + struct rte_acl_build_rule *last; + struct rte_acl_build_rule *rule_sets[RTE_ACL_MAX_TRIES]; + + config = head->config; + rule_sets[0] = head; + + /* initialize tries */ + for (n = 0; n < RTE_DIM(context->tries); n++) { + context->tries[n].type = RTE_ACL_UNUSED_TRIE; + context->bld_tries[n].trie = NULL; + context->tries[n].count = 0; + } + + context->tries[0].type = RTE_ACL_FULL_TRIE; + + /* calc wildness of each field of each rule */ + acl_calc_wildness(head, config); + + for (n = 0;; n = num_tries) { + + num_tries = n + 1; + + last = build_one_trie(context, rule_sets, n, context->node_max); + if (context->bld_tries[n].trie == NULL) { + RTE_LOG(ERR, ACL, "Build of %u-th trie failed\n", n); + return -ENOMEM; + } + + /* Build of the last trie completed. */ + if (last == NULL) + break; + + if (num_tries == RTE_DIM(context->tries)) { + RTE_LOG(ERR, ACL, + "Exceeded max number of tries: %u\n", + num_tries); + return -ENOMEM; + } + + /* Trie is getting too big, split remaining rule set. */ + rule_sets[num_tries] = last->next; + last->next = NULL; + acl_free_node(context, context->bld_tries[n].trie); + + /* Create a new copy of config for remaining rules. */ + config = acl_build_alloc(context, 1, sizeof(*config)); + memcpy(config, rule_sets[n]->config, sizeof(*config)); + + /* Make remaining rules use new config. */ + for (head = rule_sets[num_tries]; head != NULL; + head = head->next) + head->config = config; + + /* + * Rebuild the trie for the reduced rule-set. + * Don't try to split it any further. 
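+ * (a node_max of INT32_MAX means the node-count limit can never trigger
+ * another split).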
+ */ + last = build_one_trie(context, rule_sets, n, INT32_MAX); + if (context->bld_tries[n].trie == NULL || last != NULL) { + RTE_LOG(ERR, ACL, "Build of %u-th trie failed\n", n); + return -ENOMEM; + } + + } + + context->num_tries = num_tries; + return 0; +} + +static void +acl_build_log(const struct acl_build_context *ctx) +{ + uint32_t n; + + RTE_LOG(DEBUG, ACL, "Build phase for ACL \"%s\":\n" + "node limit for tree split: %u\n" + "nodes created: %u\n" + "memory consumed: %zu\n", + ctx->acx->name, + ctx->node_max, + ctx->num_nodes, + ctx->pool.alloc); + + for (n = 0; n < RTE_DIM(ctx->tries); n++) { + if (ctx->tries[n].count != 0) + RTE_LOG(DEBUG, ACL, + "trie %u: number of rules: %u, indexes: %u\n", + n, ctx->tries[n].count, + ctx->tries[n].num_data_indexes); + } +} + +static int +acl_build_rules(struct acl_build_context *bcx) +{ + struct rte_acl_build_rule *br, *head; + const struct rte_acl_rule *rule; + uint32_t *wp; + uint32_t fn, i, n, num; + size_t ofs, sz; + + fn = bcx->cfg.num_fields; + n = bcx->acx->num_rules; + ofs = n * sizeof(*br); + sz = ofs + n * fn * sizeof(*wp); + + br = tb_alloc(&bcx->pool, sz); + + wp = (uint32_t *)((uintptr_t)br + ofs); + num = 0; + head = NULL; + + for (i = 0; i != n; i++) { + rule = (const struct rte_acl_rule *) + ((uintptr_t)bcx->acx->rules + bcx->acx->rule_sz * i); + if ((rule->data.category_mask & bcx->category_mask) != 0) { + br[num].next = head; + br[num].config = &bcx->cfg; + br[num].f = rule; + br[num].wildness = wp; + wp += fn; + head = br + num; + num++; + } + } + + bcx->num_rules = num; + bcx->build_rules = head; + + return 0; +} + +/* + * Copy data_indexes for each trie into RT location. + */ +static void +acl_set_data_indexes(struct rte_acl_ctx *ctx) +{ + uint32_t i, n, ofs; + + ofs = 0; + for (i = 0; i != ctx->num_tries; i++) { + n = ctx->trie[i].num_data_indexes; + memcpy(ctx->data_indexes + ofs, ctx->trie[i].data_index, + n * sizeof(ctx->data_indexes[0])); + ctx->trie[i].data_index = ctx->data_indexes + ofs; + ofs += RTE_ACL_MAX_FIELDS; + } +} + +/* + * Internal routine, performs 'build' phase of trie generation: + * - setups build context. + * - analizes given set of rules. + * - builds internal tree(s). + */ +static int +acl_bld(struct acl_build_context *bcx, struct rte_acl_ctx *ctx, + const struct rte_acl_config *cfg, uint32_t node_max) +{ + int32_t rc; + + /* setup build context. */ + memset(bcx, 0, sizeof(*bcx)); + bcx->acx = ctx; + bcx->pool.alignment = ACL_POOL_ALIGN; + bcx->pool.min_alloc = ACL_POOL_ALLOC_MIN; + bcx->cfg = *cfg; + bcx->category_mask = RTE_LEN2MASK(bcx->cfg.num_categories, + typeof(bcx->category_mask)); + bcx->node_max = node_max; + + rc = sigsetjmp(bcx->pool.fail, 0); + + /* build phase runs out of memory. */ + if (rc != 0) { + RTE_LOG(ERR, ACL, + "ACL context: %s, %s() failed with error code: %d\n", + bcx->acx->name, __func__, rc); + return rc; + } + + /* Create a build rules copy. */ + rc = acl_build_rules(bcx); + if (rc != 0) + return rc; + + /* No rules to build for that context+config */ + if (bcx->build_rules == NULL) { + rc = -EINVAL; + } else { + /* build internal trie representation. */ + rc = acl_build_tries(bcx, bcx->build_rules); + } + return rc; +} + +/* + * Check that parameters for acl_build() are valid. 
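+ * Rejects NULL arguments, out-of-range category and field counts, and
+ * field definitions with an unknown type or unsupported size.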
+ */ +static int +acl_check_bld_param(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg) +{ + static const size_t field_sizes[] = { + sizeof(uint8_t), sizeof(uint16_t), + sizeof(uint32_t), sizeof(uint64_t), + }; + + uint32_t i, j; + + if (ctx == NULL || cfg == NULL || cfg->num_categories == 0 || + cfg->num_categories > RTE_ACL_MAX_CATEGORIES || + cfg->num_fields == 0 || + cfg->num_fields > RTE_ACL_MAX_FIELDS) + return -EINVAL; + + for (i = 0; i != cfg->num_fields; i++) { + if (cfg->defs[i].type > RTE_ACL_FIELD_TYPE_BITMASK) { + RTE_LOG(ERR, ACL, + "ACL context: %s, invalid type: %hhu for %u-th field\n", + ctx->name, cfg->defs[i].type, i); + return -EINVAL; + } + for (j = 0; + j != RTE_DIM(field_sizes) && + cfg->defs[i].size != field_sizes[j]; + j++) + ; + + if (j == RTE_DIM(field_sizes)) { + RTE_LOG(ERR, ACL, + "ACL context: %s, invalid size: %hhu for %u-th field\n", + ctx->name, cfg->defs[i].size, i); + return -EINVAL; + } + } + + return 0; +} + +int +rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg) +{ + int32_t rc; + uint32_t n; + size_t max_size; + struct acl_build_context bcx; + + rc = acl_check_bld_param(ctx, cfg); + if (rc != 0) + return rc; + + acl_build_reset(ctx); + + if (cfg->max_size == 0) { + n = NODE_MIN; + max_size = SIZE_MAX; + } else { + n = NODE_MAX; + max_size = cfg->max_size; + } + + for (rc = -ERANGE; n >= NODE_MIN && rc == -ERANGE; n /= 2) { + + /* perform build phase. */ + rc = acl_bld(&bcx, ctx, cfg, n); + + if (rc == 0) { + /* allocate and fill run-time structures. */ + rc = rte_acl_gen(ctx, bcx.tries, bcx.bld_tries, + bcx.num_tries, bcx.cfg.num_categories, + RTE_ACL_MAX_FIELDS * RTE_DIM(bcx.tries) * + sizeof(ctx->data_indexes[0]), max_size); + if (rc == 0) { + /* set data indexes. */ + acl_set_data_indexes(ctx); + + /* copy in build config. */ + ctx->config = *cfg; + } + } + + acl_build_log(&bcx); + + /* cleanup after build. 
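+ * The temporary build pool is freed after every attempt, whether or not
+ * the build and generation steps succeeded.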
*/ + tb_free_pool(&bcx.pool); + } + + return rc; +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_gen.c b/src/spdk/dpdk/lib/librte_acl/acl_gen.c new file mode 100644 index 00000000..bed66be0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_gen.c @@ -0,0 +1,532 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include "acl.h" + +#define QRANGE_MIN ((uint8_t)INT8_MIN) + +#define RTE_ACL_VERIFY(exp) do { \ + if (!(exp)) \ + rte_panic("line %d\tassert \"" #exp "\" failed\n", __LINE__); \ +} while (0) + +struct acl_node_counters { + int32_t match; + int32_t match_used; + int32_t single; + int32_t quad; + int32_t quad_vectors; + int32_t dfa; + int32_t dfa_gr64; +}; + +struct rte_acl_indices { + int32_t dfa_index; + int32_t quad_index; + int32_t single_index; + int32_t match_index; + int32_t match_start; +}; + +static void +acl_gen_log_stats(const struct rte_acl_ctx *ctx, + const struct acl_node_counters *counts, + const struct rte_acl_indices *indices, + size_t max_size) +{ + RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n" + "runtime memory footprint on socket %d:\n" + "single nodes/bytes used: %d/%zu\n" + "quad nodes/vectors/bytes used: %d/%d/%zu\n" + "DFA nodes/group64/bytes used: %d/%d/%zu\n" + "match nodes/bytes used: %d/%zu\n" + "total: %zu bytes\n" + "max limit: %zu bytes\n", + ctx->name, ctx->socket_id, + counts->single, counts->single * sizeof(uint64_t), + counts->quad, counts->quad_vectors, + (indices->quad_index - indices->dfa_index) * sizeof(uint64_t), + counts->dfa, counts->dfa_gr64, + indices->dfa_index * sizeof(uint64_t), + counts->match, + counts->match * sizeof(struct rte_acl_match_results), + ctx->mem_sz, + max_size); +} + +static uint64_t +acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index) +{ + uint64_t idx; + uint32_t i; + + idx = 0; + for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) { + RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM); + RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout); + idx |= (i - node->dfa_gr64[i]) << + (6 + RTE_ACL_DFA_GR64_BIT * i); + } + + return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type; +} + +static void +acl_dfa_fill_gr64(const struct rte_acl_node *node, + const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE]) +{ + uint32_t i; + + for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) { + memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE, + src + i * RTE_ACL_DFA_GR64_SIZE, + RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0])); + } +} + +static uint32_t +acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE], + uint8_t gr64[RTE_ACL_DFA_GR64_NUM]) +{ + uint32_t i, j, k; + + k = 0; + for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) { + gr64[i] = i; + for (j = 0; j != i; j++) { + if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE, + array_ptr + j * RTE_ACL_DFA_GR64_SIZE, + RTE_ACL_DFA_GR64_SIZE * + sizeof(array_ptr[0])) == 0) + break; + } + gr64[i] = (j != i) ? 
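+ /* a duplicate group reuses that group's id, a unique group gets a new one */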
gr64[j] : k++; + } + + return k; +} + +static uint32_t +acl_node_fill_dfa(const struct rte_acl_node *node, + uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved) +{ + uint32_t n, x; + uint32_t ranges, last_bit; + struct rte_acl_node *child; + struct rte_acl_bitset *bits; + + ranges = 0; + last_bit = 0; + + for (n = 0; n < RTE_ACL_DFA_SIZE; n++) + dfa[n] = no_match; + + for (x = 0; x < node->num_ptrs; x++) { + + child = node->ptrs[x].ptr; + if (child == NULL) + continue; + + bits = &node->ptrs[x].values; + for (n = 0; n < RTE_ACL_DFA_SIZE; n++) { + + if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] & + (1 << (n % (sizeof(bits_t) * CHAR_BIT)))) { + + dfa[n] = resolved ? child->node_index : x; + ranges += (last_bit == 0); + last_bit = 1; + } else { + last_bit = 0; + } + } + } + + return ranges; +} + +/* +* Counts the number of groups of sequential bits that are +* either 0 or 1, as specified by the zero_one parameter. This is used to +* calculate the number of ranges in a node to see if it fits in a quad range +* node. +*/ +static int +acl_count_sequential_groups(struct rte_acl_bitset *bits, int zero_one) +{ + int n, ranges, last_bit; + + ranges = 0; + last_bit = zero_one ^ 1; + + for (n = QRANGE_MIN; n < UINT8_MAX + 1; n++) { + if (bits->bits[n / (sizeof(bits_t) * 8)] & + (1 << (n % (sizeof(bits_t) * 8)))) { + if (zero_one == 1 && last_bit != 1) + ranges++; + last_bit = 1; + } else { + if (zero_one == 0 && last_bit != 0) + ranges++; + last_bit = 0; + } + } + for (n = 0; n < QRANGE_MIN; n++) { + if (bits->bits[n / (sizeof(bits_t) * 8)] & + (1 << (n % (sizeof(bits_t) * 8)))) { + if (zero_one == 1 && last_bit != 1) + ranges++; + last_bit = 1; + } else { + if (zero_one == 0 && last_bit != 0) + ranges++; + last_bit = 0; + } + } + + return ranges; +} + +/* + * Count number of ranges spanned by the node's pointers + */ +static int +acl_count_fanout(struct rte_acl_node *node) +{ + uint32_t n; + int ranges; + + if (node->fanout != 0) + return node->fanout; + + ranges = acl_count_sequential_groups(&node->values, 0); + + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) + ranges += acl_count_sequential_groups( + &node->ptrs[n].values, 1); + } + + node->fanout = ranges; + return node->fanout; +} + +/* + * Determine the type of nodes and count each type + */ +static void +acl_count_trie_types(struct acl_node_counters *counts, + struct rte_acl_node *node, uint64_t no_match, int force_dfa) +{ + uint32_t n; + int num_ptrs; + uint64_t dfa[RTE_ACL_DFA_SIZE]; + + /* skip if this node has been counted */ + if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED) + return; + + if (node->match_flag != 0 || node->num_ptrs == 0) { + counts->match++; + node->node_type = RTE_ACL_NODE_MATCH; + return; + } + + num_ptrs = acl_count_fanout(node); + + /* Force type to dfa */ + if (force_dfa) + num_ptrs = RTE_ACL_DFA_SIZE; + + /* determine node type based on number of ranges */ + if (num_ptrs == 1) { + counts->single++; + node->node_type = RTE_ACL_NODE_SINGLE; + } else if (num_ptrs <= RTE_ACL_QUAD_MAX) { + counts->quad++; + counts->quad_vectors += node->fanout; + node->node_type = RTE_ACL_NODE_QRANGE; + } else { + counts->dfa++; + node->node_type = RTE_ACL_NODE_DFA; + if (force_dfa != 0) { + /* always expand to a max number of nodes. 
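+ * Every 64-entry group maps to itself, so the DFA node is stored in full.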
*/ + for (n = 0; n != RTE_DIM(node->dfa_gr64); n++) + node->dfa_gr64[n] = n; + node->fanout = n; + } else { + acl_node_fill_dfa(node, dfa, no_match, 0); + node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64); + } + counts->dfa_gr64 += node->fanout; + } + + /* + * recursively count the types of all children + */ + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) + acl_count_trie_types(counts, node->ptrs[n].ptr, + no_match, 0); + } +} + +static void +acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match, + int resolved) +{ + uint32_t x; + int32_t m; + uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE]; + + acl_node_fill_dfa(node, dfa, no_match, resolved); + + /* + * Rather than going from 0 to 256, the range count and + * the layout are from 80-ff then 0-7f due to signed compare + * for SSE (cmpgt). + */ + if (node->node_type == RTE_ACL_NODE_QRANGE) { + + m = 0; + node_a = node_array; + index = dfa[QRANGE_MIN]; + *node_a++ = index; + + for (x = QRANGE_MIN + 1; x < UINT8_MAX + 1; x++) { + if (dfa[x] != index) { + index = dfa[x]; + *node_a++ = index; + node->transitions[m++] = (uint8_t)(x - 1); + } + } + + for (x = 0; x < INT8_MAX + 1; x++) { + if (dfa[x] != index) { + index = dfa[x]; + *node_a++ = index; + node->transitions[m++] = (uint8_t)(x - 1); + } + } + + /* fill unused locations with max value - nothing is greater */ + for (; m < RTE_ACL_QUAD_SIZE; m++) + node->transitions[m] = INT8_MAX; + + RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE); + + } else if (node->node_type == RTE_ACL_NODE_DFA && resolved) { + acl_dfa_fill_gr64(node, dfa, node_array); + } +} + +/* + * Routine that allocates space for this node and recursively calls + * to allocate space for each child. Once all the children are allocated, + * then resolve all transitions for this node. 
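+ * Space is claimed from the per-type running indices (DFA, single,
+ * quad-range and match), and node_index encodes the node type together
+ * with the node's position in the node array.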
+ */ +static void +acl_gen_node(struct rte_acl_node *node, uint64_t *node_array, + uint64_t no_match, struct rte_acl_indices *index, int num_categories) +{ + uint32_t n, sz, *qtrp; + uint64_t *array_ptr; + struct rte_acl_match_results *match; + + if (node->node_index != RTE_ACL_NODE_UNDEFINED) + return; + + array_ptr = NULL; + + switch (node->node_type) { + case RTE_ACL_NODE_DFA: + array_ptr = &node_array[index->dfa_index]; + node->node_index = acl_dfa_gen_idx(node, index->dfa_index); + sz = node->fanout * RTE_ACL_DFA_GR64_SIZE; + index->dfa_index += sz; + for (n = 0; n < sz; n++) + array_ptr[n] = no_match; + break; + case RTE_ACL_NODE_SINGLE: + node->node_index = RTE_ACL_QUAD_SINGLE | index->single_index | + node->node_type; + array_ptr = &node_array[index->single_index]; + index->single_index += 1; + array_ptr[0] = no_match; + break; + case RTE_ACL_NODE_QRANGE: + array_ptr = &node_array[index->quad_index]; + acl_add_ptrs(node, array_ptr, no_match, 0); + qtrp = (uint32_t *)node->transitions; + node->node_index = qtrp[0]; + node->node_index <<= sizeof(index->quad_index) * CHAR_BIT; + node->node_index |= index->quad_index | node->node_type; + index->quad_index += node->fanout; + break; + case RTE_ACL_NODE_MATCH: + match = ((struct rte_acl_match_results *) + (node_array + index->match_start)); + for (n = 0; n != RTE_DIM(match->results); n++) + RTE_ACL_VERIFY(match->results[0] == 0); + memcpy(match + index->match_index, node->mrt, + sizeof(*node->mrt)); + node->node_index = index->match_index | node->node_type; + index->match_index += 1; + break; + case RTE_ACL_NODE_UNDEFINED: + RTE_ACL_VERIFY(node->node_type != + (uint32_t)RTE_ACL_NODE_UNDEFINED); + break; + } + + /* recursively allocate space for all children */ + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) + acl_gen_node(node->ptrs[n].ptr, + node_array, + no_match, + index, + num_categories); + } + + /* All children are resolved, resolve this node's pointers */ + switch (node->node_type) { + case RTE_ACL_NODE_DFA: + acl_add_ptrs(node, array_ptr, no_match, 1); + break; + case RTE_ACL_NODE_SINGLE: + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) + array_ptr[0] = node->ptrs[n].ptr->node_index; + } + break; + case RTE_ACL_NODE_QRANGE: + acl_add_ptrs(node, array_ptr, no_match, 1); + break; + case RTE_ACL_NODE_MATCH: + break; + case RTE_ACL_NODE_UNDEFINED: + RTE_ACL_VERIFY(node->node_type != + (uint32_t)RTE_ACL_NODE_UNDEFINED); + break; + } +} + +static void +acl_calc_counts_indices(struct acl_node_counters *counts, + struct rte_acl_indices *indices, + struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries, + uint64_t no_match) +{ + uint32_t n; + + memset(indices, 0, sizeof(*indices)); + memset(counts, 0, sizeof(*counts)); + + /* Get stats on nodes */ + for (n = 0; n < num_tries; n++) { + acl_count_trie_types(counts, node_bld_trie[n].trie, + no_match, 1); + } + + indices->dfa_index = RTE_ACL_DFA_SIZE + 1; + indices->quad_index = indices->dfa_index + + counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE; + indices->single_index = indices->quad_index + counts->quad_vectors; + indices->match_start = indices->single_index + counts->single + 1; + indices->match_start = RTE_ALIGN(indices->match_start, + (XMM_SIZE / sizeof(uint64_t))); + indices->match_index = 1; +} + +/* + * Generate the runtime structure using build structure + */ +int +rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie, + struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries, + uint32_t num_categories, uint32_t 
data_index_sz, size_t max_size) +{ + void *mem; + size_t total_size; + uint64_t *node_array, no_match; + uint32_t n, match_index; + struct rte_acl_match_results *match; + struct acl_node_counters counts; + struct rte_acl_indices indices; + + no_match = RTE_ACL_NODE_MATCH; + + /* Fill counts and indices arrays from the nodes. */ + acl_calc_counts_indices(&counts, &indices, + node_bld_trie, num_tries, no_match); + + /* Allocate runtime memory (align to cache boundary) */ + total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) + + indices.match_start * sizeof(uint64_t) + + (counts.match + 1) * sizeof(struct rte_acl_match_results) + + XMM_SIZE; + + if (total_size > max_size) { + RTE_LOG(DEBUG, ACL, + "Gen phase for ACL ctx \"%s\" exceeds max_size limit, " + "bytes required: %zu, allowed: %zu\n", + ctx->name, total_size, max_size); + return -ERANGE; + } + + mem = rte_zmalloc_socket(ctx->name, total_size, RTE_CACHE_LINE_SIZE, + ctx->socket_id); + if (mem == NULL) { + RTE_LOG(ERR, ACL, + "allocation of %zu bytes on socket %d for %s failed\n", + total_size, ctx->socket_id, ctx->name); + return -ENOMEM; + } + + /* Fill the runtime structure */ + match_index = indices.match_start; + node_array = (uint64_t *)((uintptr_t)mem + + RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE)); + + /* + * Setup the NOMATCH node (a SINGLE at the + * highest index, that points to itself) + */ + + node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE; + + for (n = 0; n < RTE_ACL_DFA_SIZE; n++) + node_array[n] = no_match; + + /* NOMATCH result at index 0 */ + match = ((struct rte_acl_match_results *)(node_array + match_index)); + memset(match, 0, sizeof(*match)); + + for (n = 0; n < num_tries; n++) { + + acl_gen_node(node_bld_trie[n].trie, node_array, no_match, + &indices, num_categories); + + if (node_bld_trie[n].trie->node_index == no_match) + trie[n].root_index = 0; + else + trie[n].root_index = node_bld_trie[n].trie->node_index; + } + + ctx->mem = mem; + ctx->mem_sz = total_size; + ctx->data_indexes = mem; + ctx->num_tries = num_tries; + ctx->num_categories = num_categories; + ctx->match_index = match_index; + ctx->no_match = no_match; + ctx->idle = node_array[RTE_ACL_DFA_SIZE]; + ctx->trans_table = node_array; + memcpy(ctx->trie, trie, sizeof(ctx->trie)); + + acl_gen_log_stats(ctx, &counts, &indices, max_size); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run.h b/src/spdk/dpdk/lib/librte_acl/acl_run.h new file mode 100644 index 00000000..bf7842d8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _ACL_RUN_H_ +#define _ACL_RUN_H_ + +#include +#include "acl.h" + +#define MAX_SEARCHES_AVX16 16 +#define MAX_SEARCHES_SSE8 8 +#define MAX_SEARCHES_ALTIVEC8 8 +#define MAX_SEARCHES_SSE4 4 +#define MAX_SEARCHES_ALTIVEC4 4 +#define MAX_SEARCHES_SCALAR 2 + +#define GET_NEXT_4BYTES(prm, idx) \ + (*((const int32_t *)((prm)[(idx)].data + *(prm)[idx].data_index++))) + + +#define RTE_ACL_NODE_INDEX ((uint32_t)~RTE_ACL_NODE_TYPE) + +#define SCALAR_QRANGE_MULT 0x01010101 +#define SCALAR_QRANGE_MASK 0x7f7f7f7f +#define SCALAR_QRANGE_MIN 0x80808080 + +/* + * Structure to manage N parallel trie traversals. + * The runtime trie traversal routines can process 8, 4, or 2 tries + * in parallel. Each packet may require multiple trie traversals (up to 4). 
+ * This structure is used to fill the slots (0 to n-1) for parallel processing + * with the trie traversals needed for each packet. + */ +struct acl_flow_data { + uint32_t num_packets; + /* number of packets processed */ + uint32_t started; + /* number of trie traversals in progress */ + uint32_t trie; + /* current trie index (0 to N-1) */ + uint32_t cmplt_size; + /* maximum number of packets to process */ + uint32_t total_packets; + /* number of result categories per packet. */ + uint32_t categories; + const uint64_t *trans; + const uint8_t **data; + uint32_t *results; + struct completion *last_cmplt; + struct completion *cmplt_array; +}; + +/* + * Structure to maintain running results for + * a single packet (up to 4 tries). + */ +struct completion { + uint32_t *results; /* running results. */ + int32_t priority[RTE_ACL_MAX_CATEGORIES]; /* running priorities. */ + uint32_t count; /* num of remaining tries */ + /* true for allocated struct */ +} __attribute__((aligned(XMM_SIZE))); + +/* + * One parms structure for each slot in the search engine. + */ +struct parms { + const uint8_t *data; + /* input data for this packet */ + const uint32_t *data_index; + /* data indirection for this trie */ + struct completion *cmplt; + /* completion data for this packet */ +}; + +/* + * Define an global idle node for unused engine slots + */ +static const uint32_t idle[UINT8_MAX + 1]; + +/* + * Allocate a completion structure to manage the tries for a packet. + */ +static inline struct completion * +alloc_completion(struct completion *p, uint32_t size, uint32_t tries, + uint32_t *results) +{ + uint32_t n; + + for (n = 0; n < size; n++) { + + if (p[n].count == 0) { + + /* mark as allocated and set number of tries. */ + p[n].count = tries; + p[n].results = results; + return &(p[n]); + } + } + + /* should never get here */ + return NULL; +} + +/* + * Resolve priority for a single result trie. + */ +static inline void +resolve_single_priority(uint64_t transition, int n, + const struct rte_acl_ctx *ctx, struct parms *parms, + const struct rte_acl_match_results *p) +{ + if (parms[n].cmplt->count == ctx->num_tries || + parms[n].cmplt->priority[0] <= + p[transition].priority[0]) { + + parms[n].cmplt->priority[0] = p[transition].priority[0]; + parms[n].cmplt->results[0] = p[transition].results[0]; + } +} + +/* + * Routine to fill a slot in the parallel trie traversal array (parms) from + * the list of packets (flows). + */ +static inline uint64_t +acl_start_next_trie(struct acl_flow_data *flows, struct parms *parms, int n, + const struct rte_acl_ctx *ctx) +{ + uint64_t transition; + + /* if there are any more packets to process */ + if (flows->num_packets < flows->total_packets) { + parms[n].data = flows->data[flows->num_packets]; + parms[n].data_index = ctx->trie[flows->trie].data_index; + + /* if this is the first trie for this packet */ + if (flows->trie == 0) { + flows->last_cmplt = alloc_completion(flows->cmplt_array, + flows->cmplt_size, ctx->num_tries, + flows->results + + flows->num_packets * flows->categories); + } + + /* set completion parameters and starting index for this slot */ + parms[n].cmplt = flows->last_cmplt; + transition = + flows->trans[parms[n].data[*parms[n].data_index++] + + ctx->trie[flows->trie].root_index]; + + /* + * if this is the last trie for this packet, + * then setup next packet. 
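+ * (reset the trie index to 0 and move on to the next packet).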
+ */ + flows->trie++; + if (flows->trie >= ctx->num_tries) { + flows->trie = 0; + flows->num_packets++; + } + + /* keep track of number of active trie traversals */ + flows->started++; + + /* no more tries to process, set slot to an idle position */ + } else { + transition = ctx->idle; + parms[n].data = (const uint8_t *)idle; + parms[n].data_index = idle; + } + return transition; +} + +static inline void +acl_set_flow(struct acl_flow_data *flows, struct completion *cmplt, + uint32_t cmplt_size, const uint8_t **data, uint32_t *results, + uint32_t data_num, uint32_t categories, const uint64_t *trans) +{ + flows->num_packets = 0; + flows->started = 0; + flows->trie = 0; + flows->last_cmplt = NULL; + flows->cmplt_array = cmplt; + flows->total_packets = data_num; + flows->categories = categories; + flows->cmplt_size = cmplt_size; + flows->data = data; + flows->results = results; + flows->trans = trans; +} + +typedef void (*resolve_priority_t) +(uint64_t transition, int n, const struct rte_acl_ctx *ctx, + struct parms *parms, const struct rte_acl_match_results *p, + uint32_t categories); + +/* + * Detect matches. If a match node transition is found, then this trie + * traversal is complete and fill the slot with the next trie + * to be processed. + */ +static inline uint64_t +acl_match_check(uint64_t transition, int slot, + const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, resolve_priority_t resolve_priority) +{ + const struct rte_acl_match_results *p; + + p = (const struct rte_acl_match_results *) + (flows->trans + ctx->match_index); + + if (transition & RTE_ACL_NODE_MATCH) { + + /* Remove flags from index and decrement active traversals */ + transition &= RTE_ACL_NODE_INDEX; + flows->started--; + + /* Resolve priorities for this trie and running results */ + if (flows->categories == 1) + resolve_single_priority(transition, slot, ctx, + parms, p); + else + resolve_priority(transition, slot, ctx, parms, + p, flows->categories); + + /* Count down completed tries for this search request */ + parms[slot].cmplt->count--; + + /* Fill the slot with the next trie or idle trie */ + transition = acl_start_next_trie(flows, parms, slot, ctx); + } + + return transition; +} + +#endif /* _ACL_RUN_H_ */ diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_altivec.c b/src/spdk/dpdk/lib/librte_acl/acl_run_altivec.c new file mode 100644 index 00000000..35235260 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_altivec.c @@ -0,0 +1,47 @@ +/*- + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2016. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "acl_run_altivec.h" + +int +rte_acl_classify_altivec(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + if (likely(num >= MAX_SEARCHES_ALTIVEC8)) + return search_altivec_8(ctx, data, results, num, categories); + else if (num >= MAX_SEARCHES_ALTIVEC4) + return search_altivec_4(ctx, data, results, num, categories); + else + return rte_acl_classify_scalar(ctx, data, results, num, + categories); +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_altivec.h b/src/spdk/dpdk/lib/librte_acl/acl_run_altivec.h new file mode 100644 index 00000000..62fd6a22 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_altivec.h @@ -0,0 +1,329 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2016. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "acl_run.h" +#include "acl_vect.h" + +struct _altivec_acl_const { + rte_xmm_t xmm_shuffle_input; + rte_xmm_t xmm_index_mask; + rte_xmm_t xmm_ones_16; + rte_xmm_t range_base; +} altivec_acl_const __attribute__((aligned(RTE_CACHE_LINE_SIZE))) = { + { + .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c} + }, + { + .u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX} + }, + { + .u16 = {1, 1, 1, 1, 1, 1, 1, 1} + }, + { + .u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c} + }, +}; + +/* + * Resolve priority for multiple results (altivec version). + * This consists comparing the priority of the current traversal with the + * running set of results for the packet. + * For each result, keep a running array of the result (rule number) and + * its priority for each category. + */ +static inline void +resolve_priority_altivec(uint64_t transition, int n, + const struct rte_acl_ctx *ctx, struct parms *parms, + const struct rte_acl_match_results *p, uint32_t categories) +{ + uint32_t x; + xmm_t results, priority, results1, priority1; + vector bool int selector; + xmm_t *saved_results, *saved_priority; + + for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) { + + saved_results = (xmm_t *)(&parms[n].cmplt->results[x]); + saved_priority = + (xmm_t *)(&parms[n].cmplt->priority[x]); + + /* get results and priorities for completed trie */ + results = *(const xmm_t *)&p[transition].results[x]; + priority = *(const xmm_t *)&p[transition].priority[x]; + + /* if this is not the first completed trie */ + if (parms[n].cmplt->count != ctx->num_tries) { + + /* get running best results and their priorities */ + results1 = *saved_results; + priority1 = *saved_priority; + + /* select results that are highest priority */ + selector = vec_cmpgt(priority1, priority); + results = vec_sel(results, results1, selector); + priority = vec_sel(priority, priority1, + selector); + } + + /* save running best results and their priorities */ + *saved_results = results; + *saved_priority = priority; + } +} + +/* + * Check for any match in 4 transitions + */ +static __rte_always_inline uint32_t +check_any_match_x4(uint64_t val[]) +{ + return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH; +} + +static __rte_always_inline void +acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, uint64_t transitions[]) +{ + while (check_any_match_x4(transitions)) { + transitions[0] = acl_match_check(transitions[0], slot, ctx, + parms, flows, resolve_priority_altivec); + transitions[1] = acl_match_check(transitions[1], slot + 1, ctx, + parms, flows, resolve_priority_altivec); + transitions[2] = acl_match_check(transitions[2], slot + 2, ctx, + parms, flows, resolve_priority_altivec); + transitions[3] = acl_match_check(transitions[3], slot + 3, ctx, + parms, flows, resolve_priority_altivec); + } +} + +/* + * Process 4 transitions (in 2 XMM registers) in parallel + */ +static inline __attribute__((optimize("O2"))) xmm_t +transition4(xmm_t next_input, const uint64_t *trans, + xmm_t *indices1, xmm_t *indices2) +{ + xmm_t addr, tr_lo, tr_hi; + xmm_t in, node_type, r, t; + xmm_t dfa_ofs, quad_ofs; + xmm_t *index_mask, *tp; + vector bool int dfa_msk; + vector signed char zeroes = {}; + union { + uint64_t d64[2]; + uint32_t d32[4]; + } v; + + /* Move low 32 into tr_lo and high 32 into tr_hi */ + tr_lo = (xmm_t){(*indices1)[0], (*indices1)[2], + (*indices2)[0], (*indices2)[2]}; + tr_hi = (xmm_t){(*indices1)[1], (*indices1)[3], 
+ (*indices2)[1], (*indices2)[3]}; + + /* Calculate the address (array index) for all 4 transitions. */ + index_mask = (xmm_t *)&altivec_acl_const.xmm_index_mask.u32; + t = vec_xor(*index_mask, *index_mask); + in = vec_perm(next_input, (xmm_t){}, + *(vector unsigned char *)&altivec_acl_const.xmm_shuffle_input); + + /* Calc node type and node addr */ + node_type = vec_and(vec_nor(*index_mask, *index_mask), tr_lo); + addr = vec_and(tr_lo, *index_mask); + + /* mask for DFA type(0) nodes */ + dfa_msk = vec_cmpeq(node_type, t); + + /* DFA calculations. */ + r = vec_sr(in, (vector unsigned int){30, 30, 30, 30}); + tp = (xmm_t *)&altivec_acl_const.range_base.u32; + r = vec_add(r, *tp); + t = vec_sr(in, (vector unsigned int){24, 24, 24, 24}); + r = vec_perm(tr_hi, (xmm_t){(uint16_t)0 << 16}, + (vector unsigned char)r); + + dfa_ofs = vec_sub(t, r); + + /* QUAD/SINGLE caluclations. */ + t = (xmm_t)vec_cmpgt((vector signed char)in, (vector signed char)tr_hi); + t = (xmm_t)vec_sel( + vec_sel( + (vector signed char)vec_sub( + zeroes, (vector signed char)t), + (vector signed char)t, + vec_cmpgt((vector signed char)t, zeroes)), + zeroes, + vec_cmpeq((vector signed char)t, zeroes)); + + t = (xmm_t)vec_msum((vector signed char)t, + (vector unsigned char)t, (xmm_t){}); + quad_ofs = (xmm_t)vec_msum((vector signed short)t, + *(vector signed short *)&altivec_acl_const.xmm_ones_16.u16, + (xmm_t){}); + + /* blend DFA and QUAD/SINGLE. */ + t = vec_sel(quad_ofs, dfa_ofs, dfa_msk); + + /* calculate address for next transitions. */ + addr = vec_add(addr, t); + + v.d64[0] = (uint64_t)trans[addr[0]]; + v.d64[1] = (uint64_t)trans[addr[1]]; + *indices1 = (xmm_t){v.d32[0], v.d32[1], v.d32[2], v.d32[3]}; + v.d64[0] = (uint64_t)trans[addr[2]]; + v.d64[1] = (uint64_t)trans[addr[3]]; + *indices2 = (xmm_t){v.d32[0], v.d32[1], v.d32[2], v.d32[3]}; + + return vec_sr(next_input, + (vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, CHAR_BIT}); +} + +/* + * Execute trie traversal with 8 traversals in parallel + */ +static inline int +search_altivec_8(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_ALTIVEC8]; + struct completion cmplt[MAX_SEARCHES_ALTIVEC8]; + struct parms parms[MAX_SEARCHES_ALTIVEC8]; + xmm_t input0, input1; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < MAX_SEARCHES_ALTIVEC8; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, (uint64_t *)&index_array[0]); + acl_match_check_x4(4, ctx, parms, &flows, (uint64_t *)&index_array[4]); + + while (flows.started > 0) { + + /* Gather 4 bytes of input data for each stream. */ + input0 = (xmm_t){GET_NEXT_4BYTES(parms, 0), + GET_NEXT_4BYTES(parms, 1), + GET_NEXT_4BYTES(parms, 2), + GET_NEXT_4BYTES(parms, 3)}; + + input1 = (xmm_t){GET_NEXT_4BYTES(parms, 4), + GET_NEXT_4BYTES(parms, 5), + GET_NEXT_4BYTES(parms, 6), + GET_NEXT_4BYTES(parms, 7)}; + + /* Process the 4 bytes of input on each stream. 
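+ * Each transition4() call consumes one input byte per stream; the four
+ * rounds of calls below use up the 4 bytes gathered above.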
*/ + + input0 = transition4(input0, flows.trans, + (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); + input1 = transition4(input1, flows.trans, + (xmm_t *)&index_array[4], (xmm_t *)&index_array[6]); + + input0 = transition4(input0, flows.trans, + (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); + input1 = transition4(input1, flows.trans, + (xmm_t *)&index_array[4], (xmm_t *)&index_array[6]); + + input0 = transition4(input0, flows.trans, + (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); + input1 = transition4(input1, flows.trans, + (xmm_t *)&index_array[4], (xmm_t *)&index_array[6]); + + input0 = transition4(input0, flows.trans, + (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); + input1 = transition4(input1, flows.trans, + (xmm_t *)&index_array[4], (xmm_t *)&index_array[6]); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + (uint64_t *)&index_array[0]); + acl_match_check_x4(4, ctx, parms, &flows, + (uint64_t *)&index_array[4]); + } + + return 0; +} + +/* + * Execute trie traversal with 4 traversals in parallel + */ +static inline int +search_altivec_4(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, int total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_ALTIVEC4]; + struct completion cmplt[MAX_SEARCHES_ALTIVEC4]; + struct parms parms[MAX_SEARCHES_ALTIVEC4]; + xmm_t input; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < MAX_SEARCHES_ALTIVEC4; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, index_array); + + while (flows.started > 0) { + + /* Gather 4 bytes of input data for each stream. */ + input = (xmm_t){GET_NEXT_4BYTES(parms, 0), + GET_NEXT_4BYTES(parms, 1), + GET_NEXT_4BYTES(parms, 2), + GET_NEXT_4BYTES(parms, 3)}; + + /* Process the 4 bytes of input on each stream. */ + input = transition4(input, flows.trans, + (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); + input = transition4(input, flows.trans, + (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); + input = transition4(input, flows.trans, + (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); + input = transition4(input, flows.trans, + (xmm_t *)&index_array[0], (xmm_t *)&index_array[2]); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, index_array); + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_avx2.c b/src/spdk/dpdk/lib/librte_acl/acl_run_avx2.c new file mode 100644 index 00000000..02f8ce52 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_avx2.c @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + + +#include "acl_run_avx2.h" + +/* + * Note, that to be able to use AVX2 classify method, + * both compiler and target cpu have to support AVX2 instructions. 
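+ * Bursts smaller than MAX_SEARCHES_AVX16 fall back to the SSE or scalar
+ * classify routines.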
+ */ +int +rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + if (likely(num >= MAX_SEARCHES_AVX16)) + return search_avx2x16(ctx, data, results, num, categories); + else if (num >= MAX_SEARCHES_SSE8) + return search_sse_8(ctx, data, results, num, categories); + else if (num >= MAX_SEARCHES_SSE4) + return search_sse_4(ctx, data, results, num, categories); + else + return rte_acl_classify_scalar(ctx, data, results, num, + categories); +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_avx2.h b/src/spdk/dpdk/lib/librte_acl/acl_run_avx2.h new file mode 100644 index 00000000..d06d2e87 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_avx2.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include "acl_run_sse.h" + +static const rte_ymm_t ymm_match_mask = { + .u32 = { + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + }, +}; + +static const rte_ymm_t ymm_index_mask = { + .u32 = { + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + }, +}; + +static const rte_ymm_t ymm_shuffle_input = { + .u32 = { + 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, + 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, + }, +}; + +static const rte_ymm_t ymm_ones_16 = { + .u16 = { + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + }, +}; + +static const rte_ymm_t ymm_range_base = { + .u32 = { + 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, + 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, + }, +}; + +/* + * Process 8 transitions in parallel. + * tr_lo contains low 32 bits for 8 transition. + * tr_hi contains high 32 bits for 8 transition. + * next_input contains up to 4 input bytes for 8 flows. + */ +static __rte_always_inline ymm_t +transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi) +{ + const int32_t *tr; + ymm_t addr; + + tr = (const int32_t *)(uintptr_t)trans; + + /* Calculate the address (array index) for all 8 transitions. */ + ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input, + ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y, + *tr_lo, *tr_hi); + + /* load lower 32 bits of 8 transactions at once. */ + *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0])); + + next_input = _mm256_srli_epi32(next_input, CHAR_BIT); + + /* load high 32 bits of 8 transactions at once. */ + *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0])); + + return next_input; +} + +/* + * Process matches for 8 flows. + * tr_lo contains low 32 bits for 8 transition. + * tr_hi contains high 32 bits for 8 transition. + */ +static inline void +acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx, + struct parms *parms, struct acl_flow_data *flows, uint32_t slot, + ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi) +{ + ymm_t t0, t1; + ymm_t lo, hi; + xmm_t l0, l1; + uint32_t i; + uint64_t tr[MAX_SEARCHES_SSE8]; + + l1 = _mm256_extracti128_si256(*tr_lo, 1); + l0 = _mm256_castsi256_si128(*tr_lo); + + for (i = 0; i != RTE_DIM(tr) / 2; i++) { + + /* + * Extract low 32bits of each transition. + * That's enough to process the match. 
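+ * (both the match flag and the node index live in the low 32 bits).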
+ */ + tr[i] = (uint32_t)_mm_cvtsi128_si32(l0); + tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1); + + l0 = _mm_srli_si128(l0, sizeof(uint32_t)); + l1 = _mm_srli_si128(l1, sizeof(uint32_t)); + + tr[i] = acl_match_check(tr[i], slot + i, + ctx, parms, flows, resolve_priority_sse); + tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4, + ctx, parms, flows, resolve_priority_sse); + } + + /* Collect new transitions into 2 YMM registers. */ + t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]); + t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]); + + /* For each transition: put low 32 into tr_lo and high 32 into tr_hi */ + ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi); + + /* Keep transitions wth NOMATCH intact. */ + *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches); + *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches); +} + +static inline void +acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, uint32_t slot, + ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask) +{ + uint32_t msk; + ymm_t matches, temp; + + /* test for match node */ + temp = _mm256_and_si256(match_mask, *tr_lo); + matches = _mm256_cmpeq_epi32(temp, match_mask); + msk = _mm256_movemask_epi8(matches); + + while (msk != 0) { + + acl_process_matches_avx2x8(ctx, parms, flows, slot, + matches, tr_lo, tr_hi); + temp = _mm256_and_si256(match_mask, *tr_lo); + matches = _mm256_cmpeq_epi32(temp, match_mask); + msk = _mm256_movemask_epi8(matches); + } +} + +/* + * Execute trie traversal for up to 16 flows in parallel. + */ +static inline int +search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t total_packets, uint32_t categories) +{ + uint32_t n; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_AVX16]; + struct completion cmplt[MAX_SEARCHES_AVX16]; + struct parms parms[MAX_SEARCHES_AVX16]; + ymm_t input[2], tr_lo[2], tr_hi[2]; + ymm_t t0, t1; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < RTE_DIM(cmplt); n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + t0 = _mm256_set_epi64x(index_array[5], index_array[4], + index_array[1], index_array[0]); + t1 = _mm256_set_epi64x(index_array[7], index_array[6], + index_array[3], index_array[2]); + + ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]); + + t0 = _mm256_set_epi64x(index_array[13], index_array[12], + index_array[9], index_array[8]); + t1 = _mm256_set_epi64x(index_array[15], index_array[14], + index_array[11], index_array[10]); + + ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]); + + /* Check for any matches. */ + acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0], + ymm_match_mask.y); + acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1], + ymm_match_mask.y); + + while (flows.started > 0) { + + uint32_t in[MAX_SEARCHES_SSE8]; + + /* Gather 4 bytes of input data for first 8 flows. */ + in[0] = GET_NEXT_4BYTES(parms, 0); + in[4] = GET_NEXT_4BYTES(parms, 4); + in[1] = GET_NEXT_4BYTES(parms, 1); + in[5] = GET_NEXT_4BYTES(parms, 5); + in[2] = GET_NEXT_4BYTES(parms, 2); + in[6] = GET_NEXT_4BYTES(parms, 6); + in[3] = GET_NEXT_4BYTES(parms, 3); + in[7] = GET_NEXT_4BYTES(parms, 7); + input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4], + in[3], in[2], in[1], in[0]); + + /* Gather 4 bytes of input data for last 8 flows. 
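+ * (slots 8-15, feeding the second pair of tr_lo/tr_hi registers).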
*/ + in[0] = GET_NEXT_4BYTES(parms, 8); + in[4] = GET_NEXT_4BYTES(parms, 12); + in[1] = GET_NEXT_4BYTES(parms, 9); + in[5] = GET_NEXT_4BYTES(parms, 13); + in[2] = GET_NEXT_4BYTES(parms, 10); + in[6] = GET_NEXT_4BYTES(parms, 14); + in[3] = GET_NEXT_4BYTES(parms, 11); + in[7] = GET_NEXT_4BYTES(parms, 15); + input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4], + in[3], in[2], in[1], in[0]); + + input[0] = transition8(input[0], flows.trans, + &tr_lo[0], &tr_hi[0]); + input[1] = transition8(input[1], flows.trans, + &tr_lo[1], &tr_hi[1]); + + input[0] = transition8(input[0], flows.trans, + &tr_lo[0], &tr_hi[0]); + input[1] = transition8(input[1], flows.trans, + &tr_lo[1], &tr_hi[1]); + + input[0] = transition8(input[0], flows.trans, + &tr_lo[0], &tr_hi[0]); + input[1] = transition8(input[1], flows.trans, + &tr_lo[1], &tr_hi[1]); + + input[0] = transition8(input[0], flows.trans, + &tr_lo[0], &tr_hi[0]); + input[1] = transition8(input[1], flows.trans, + &tr_lo[1], &tr_hi[1]); + + /* Check for any matches. */ + acl_match_check_avx2x8(ctx, parms, &flows, 0, + &tr_lo[0], &tr_hi[0], ymm_match_mask.y); + acl_match_check_avx2x8(ctx, parms, &flows, 8, + &tr_lo[1], &tr_hi[1], ymm_match_mask.y); + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_neon.c b/src/spdk/dpdk/lib/librte_acl/acl_run_neon.c new file mode 100644 index 00000000..7319d612 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_neon.c @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#include "acl_run_neon.h" + +int +rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + if (likely(num >= 8)) + return search_neon_8(ctx, data, results, num, categories); + else if (num >= 4) + return search_neon_4(ctx, data, results, num, categories); + else + return rte_acl_classify_scalar(ctx, data, results, num, + categories); +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_neon.h b/src/spdk/dpdk/lib/librte_acl/acl_run_neon.h new file mode 100644 index 00000000..01b9766d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_neon.h @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#include "acl_run.h" +#include "acl_vect.h" + +struct _neon_acl_const { + rte_xmm_t xmm_shuffle_input; + rte_xmm_t xmm_index_mask; + rte_xmm_t range_base; +} neon_acl_const __attribute__((aligned(RTE_CACHE_LINE_SIZE))) = { + { + .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c} + }, + { + .u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX} + }, + { + .u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c} + }, +}; + +/* + * Resolve priority for multiple results (neon version). + * This consists comparing the priority of the current traversal with the + * running set of results for the packet. + * For each result, keep a running array of the result (rule number) and + * its priority for each category. 
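+ * Categories are processed four at a time, one 128-bit NEON register of
+ * results and one of priorities per group.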
+ */ +static inline void +resolve_priority_neon(uint64_t transition, int n, const struct rte_acl_ctx *ctx, + struct parms *parms, + const struct rte_acl_match_results *p, + uint32_t categories) +{ + uint32_t x; + int32x4_t results, priority, results1, priority1; + uint32x4_t selector; + int32_t *saved_results, *saved_priority; + + for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) { + saved_results = (int32_t *)(&parms[n].cmplt->results[x]); + saved_priority = (int32_t *)(&parms[n].cmplt->priority[x]); + + /* get results and priorities for completed trie */ + results = vld1q_s32( + (const int32_t *)&p[transition].results[x]); + priority = vld1q_s32( + (const int32_t *)&p[transition].priority[x]); + + /* if this is not the first completed trie */ + if (parms[n].cmplt->count != ctx->num_tries) { + /* get running best results and their priorities */ + results1 = vld1q_s32(saved_results); + priority1 = vld1q_s32(saved_priority); + + /* select results that are highest priority */ + selector = vcgtq_s32(priority1, priority); + results = vbslq_s32(selector, results1, results); + priority = vbslq_s32(selector, priority1, priority); + } + + /* save running best results and their priorities */ + vst1q_s32(saved_results, results); + vst1q_s32(saved_priority, priority); + } +} + +/* + * Check for any match in 4 transitions + */ +static __rte_always_inline uint32_t +check_any_match_x4(uint64_t val[]) +{ + return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH; +} + +static __rte_always_inline void +acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, uint64_t transitions[]) +{ + while (check_any_match_x4(transitions)) { + transitions[0] = acl_match_check(transitions[0], slot, ctx, + parms, flows, resolve_priority_neon); + transitions[1] = acl_match_check(transitions[1], slot + 1, ctx, + parms, flows, resolve_priority_neon); + transitions[2] = acl_match_check(transitions[2], slot + 2, ctx, + parms, flows, resolve_priority_neon); + transitions[3] = acl_match_check(transitions[3], slot + 3, ctx, + parms, flows, resolve_priority_neon); + } +} + +/* + * Process 4 transitions (in 2 NEON Q registers) in parallel + */ +static __rte_always_inline int32x4_t +transition4(int32x4_t next_input, const uint64_t *trans, uint64_t transitions[]) +{ + int32x4x2_t tr_hi_lo; + int32x4_t t, in, r; + uint32x4_t index_msk, node_type, addr; + uint32x4_t dfa_msk, mask, quad_ofs, dfa_ofs; + + /* Move low 32 into tr_hi_lo.val[0] and high 32 into tr_hi_lo.val[1] */ + tr_hi_lo = vld2q_s32((const int32_t *)transitions); + + /* Calculate the address (array index) for all 4 transitions. */ + + index_msk = vld1q_u32((const uint32_t *)&neon_acl_const.xmm_index_mask); + + /* Calc node type and node addr */ + node_type = vbicq_s32(tr_hi_lo.val[0], index_msk); + addr = vandq_s32(tr_hi_lo.val[0], index_msk); + + /* t = 0 */ + t = veorq_s32(node_type, node_type); + + /* mask for DFA type(0) nodes */ + dfa_msk = vceqq_u32(node_type, t); + + mask = vld1q_s32((const int32_t *)&neon_acl_const.xmm_shuffle_input); + in = vqtbl1q_u8((uint8x16_t)next_input, (uint8x16_t)mask); + + /* DFA calculations. */ + r = vshrq_n_u32(in, 30); /* div by 64 */ + mask = vld1q_s32((const int32_t *)&neon_acl_const.range_base); + r = vaddq_u8(r, mask); + t = vshrq_n_u32(in, 24); + r = vqtbl1q_u8((uint8x16_t)tr_hi_lo.val[1], (uint8x16_t)r); + dfa_ofs = vsubq_s32(t, r); + + /* QUAD/SINGLE calculations. 
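+ * Count how many of the quad-range boundaries held in tr_hi the input
+ * byte exceeds; that count becomes the offset into the node.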
*/ + t = vcgtq_s8(in, tr_hi_lo.val[1]); + t = vabsq_s8(t); + t = vpaddlq_u8(t); + quad_ofs = vpaddlq_u16(t); + + /* blend DFA and QUAD/SINGLE. */ + t = vbslq_u8(dfa_msk, dfa_ofs, quad_ofs); + + /* calculate address for next transitions */ + addr = vaddq_u32(addr, t); + + /* Fill next transitions */ + transitions[0] = trans[vgetq_lane_u32(addr, 0)]; + transitions[1] = trans[vgetq_lane_u32(addr, 1)]; + transitions[2] = trans[vgetq_lane_u32(addr, 2)]; + transitions[3] = trans[vgetq_lane_u32(addr, 3)]; + + return vshrq_n_u32(next_input, CHAR_BIT); +} + +/* + * Execute trie traversal with 8 traversals in parallel + */ +static inline int +search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[8]; + struct completion cmplt[8]; + struct parms parms[8]; + int32x4_t input0, input1; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < 8; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]); + acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]); + + while (flows.started > 0) { + /* Gather 4 bytes of input data for each stream. */ + input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0); + input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0); + + input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1); + input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1); + + input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2); + input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2); + + input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3); + input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3); + + /* Process the 4 bytes of input on each stream. */ + + input0 = transition4(input0, flows.trans, &index_array[0]); + input1 = transition4(input1, flows.trans, &index_array[4]); + + input0 = transition4(input0, flows.trans, &index_array[0]); + input1 = transition4(input1, flows.trans, &index_array[4]); + + input0 = transition4(input0, flows.trans, &index_array[0]); + input1 = transition4(input1, flows.trans, &index_array[4]); + + input0 = transition4(input0, flows.trans, &index_array[0]); + input1 = transition4(input1, flows.trans, &index_array[4]); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]); + acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]); + } + + return 0; +} + +/* + * Execute trie traversal with 4 traversals in parallel + */ +static inline int +search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, int total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[4]; + struct completion cmplt[4]; + struct parms parms[4]; + int32x4_t input; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < 4; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, index_array); + + while (flows.started > 0) { + /* Gather 4 bytes of input data for each stream. 
*/ + input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0); + input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1); + input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2); + input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3); + + /* Process the 4 bytes of input on each stream. */ + input = transition4(input, flows.trans, index_array); + input = transition4(input, flows.trans, index_array); + input = transition4(input, flows.trans, index_array); + input = transition4(input, flows.trans, index_array); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, index_array); + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_scalar.c b/src/spdk/dpdk/lib/librte_acl/acl_run_scalar.c new file mode 100644 index 00000000..3d61e794 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_scalar.c @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include "acl_run.h" + +/* + * Resolve priority for multiple results (scalar version). + * This consists comparing the priority of the current traversal with the + * running set of results for the packet. + * For each result, keep a running array of the result (rule number) and + * its priority for each category. + */ +static inline void +resolve_priority_scalar(uint64_t transition, int n, + const struct rte_acl_ctx *ctx, struct parms *parms, + const struct rte_acl_match_results *p, uint32_t categories) +{ + uint32_t i; + int32_t *saved_priority; + uint32_t *saved_results; + const int32_t *priority; + const uint32_t *results; + + saved_results = parms[n].cmplt->results; + saved_priority = parms[n].cmplt->priority; + + /* results and priorities for completed trie */ + results = p[transition].results; + priority = p[transition].priority; + + /* if this is not the first completed trie */ + if (parms[n].cmplt->count != ctx->num_tries) { + for (i = 0; i < categories; i += RTE_ACL_RESULTS_MULTIPLIER) { + + if (saved_priority[i] <= priority[i]) { + saved_priority[i] = priority[i]; + saved_results[i] = results[i]; + } + if (saved_priority[i + 1] <= priority[i + 1]) { + saved_priority[i + 1] = priority[i + 1]; + saved_results[i + 1] = results[i + 1]; + } + if (saved_priority[i + 2] <= priority[i + 2]) { + saved_priority[i + 2] = priority[i + 2]; + saved_results[i + 2] = results[i + 2]; + } + if (saved_priority[i + 3] <= priority[i + 3]) { + saved_priority[i + 3] = priority[i + 3]; + saved_results[i + 3] = results[i + 3]; + } + } + } else { + for (i = 0; i < categories; i += RTE_ACL_RESULTS_MULTIPLIER) { + saved_priority[i] = priority[i]; + saved_priority[i + 1] = priority[i + 1]; + saved_priority[i + 2] = priority[i + 2]; + saved_priority[i + 3] = priority[i + 3]; + + saved_results[i] = results[i]; + saved_results[i + 1] = results[i + 1]; + saved_results[i + 2] = results[i + 2]; + saved_results[i + 3] = results[i + 3]; + } + } +} + +static inline uint32_t +scan_forward(uint32_t input, uint32_t max) +{ + return (input == 0) ? 
max : rte_bsf32(input); +} + +static inline uint64_t +scalar_transition(const uint64_t *trans_table, uint64_t transition, + uint8_t input) +{ + uint32_t addr, index, ranges, x, a, b, c; + + /* break transition into component parts */ + ranges = transition >> (sizeof(index) * CHAR_BIT); + index = transition & ~RTE_ACL_NODE_INDEX; + addr = transition ^ index; + + if (index != RTE_ACL_NODE_DFA) { + /* calc address for a QRANGE/SINGLE node */ + c = (uint32_t)input * SCALAR_QRANGE_MULT; + a = ranges | SCALAR_QRANGE_MIN; + a -= (c & SCALAR_QRANGE_MASK); + b = c & SCALAR_QRANGE_MIN; + a &= SCALAR_QRANGE_MIN; + a ^= (ranges ^ b) & (a ^ b); + x = scan_forward(a, 32) >> 3; + } else { + /* calc address for a DFA node */ + x = ranges >> (input / + RTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT); + x &= UINT8_MAX; + x = input - x; + } + + addr += x; + + /* pickup next transition */ + transition = *(trans_table + addr); + return transition; +} + +int +rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + int n; + uint64_t transition0, transition1; + uint32_t input0, input1; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_SCALAR]; + struct completion cmplt[MAX_SEARCHES_SCALAR]; + struct parms parms[MAX_SEARCHES_SCALAR]; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, num, + categories, ctx->trans_table); + + for (n = 0; n < MAX_SEARCHES_SCALAR; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + transition0 = index_array[0]; + transition1 = index_array[1]; + + while ((transition0 | transition1) & RTE_ACL_NODE_MATCH) { + transition0 = acl_match_check(transition0, + 0, ctx, parms, &flows, resolve_priority_scalar); + transition1 = acl_match_check(transition1, + 1, ctx, parms, &flows, resolve_priority_scalar); + } + + while (flows.started > 0) { + + input0 = GET_NEXT_4BYTES(parms, 0); + input1 = GET_NEXT_4BYTES(parms, 1); + + for (n = 0; n < 4; n++) { + + transition0 = scalar_transition(flows.trans, + transition0, (uint8_t)input0); + input0 >>= CHAR_BIT; + + transition1 = scalar_transition(flows.trans, + transition1, (uint8_t)input1); + input1 >>= CHAR_BIT; + } + + while ((transition0 | transition1) & RTE_ACL_NODE_MATCH) { + transition0 = acl_match_check(transition0, + 0, ctx, parms, &flows, resolve_priority_scalar); + transition1 = acl_match_check(transition1, + 1, ctx, parms, &flows, resolve_priority_scalar); + } + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_sse.c b/src/spdk/dpdk/lib/librte_acl/acl_run_sse.c new file mode 100644 index 00000000..1788200f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_sse.c @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include "acl_run_sse.h" + +int +rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + if (likely(num >= MAX_SEARCHES_SSE8)) + return search_sse_8(ctx, data, results, num, categories); + else if (num >= MAX_SEARCHES_SSE4) + return search_sse_4(ctx, data, results, num, categories); + else + return rte_acl_classify_scalar(ctx, data, results, num, + categories); +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_run_sse.h b/src/spdk/dpdk/lib/librte_acl/acl_run_sse.h new file mode 100644 index 00000000..93286a2c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_run_sse.h @@ -0,0 +1,328 @@ +/* SPDX-License-Identifier: 
BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include "acl_run.h" +#include "acl_vect.h" + +enum { + SHUFFLE32_SLOT1 = 0xe5, + SHUFFLE32_SLOT2 = 0xe6, + SHUFFLE32_SLOT3 = 0xe7, + SHUFFLE32_SWAP64 = 0x4e, +}; + +static const rte_xmm_t xmm_shuffle_input = { + .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}, +}; + +static const rte_xmm_t xmm_ones_16 = { + .u16 = {1, 1, 1, 1, 1, 1, 1, 1}, +}; + +static const rte_xmm_t xmm_match_mask = { + .u32 = { + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + }, +}; + +static const rte_xmm_t xmm_index_mask = { + .u32 = { + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + }, +}; + +static const rte_xmm_t xmm_range_base = { + .u32 = { + 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, + }, +}; + +/* + * Resolve priority for multiple results (sse version). + * This consists comparing the priority of the current traversal with the + * running set of results for the packet. + * For each result, keep a running array of the result (rule number) and + * its priority for each category. + */ +static inline void +resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx, + struct parms *parms, const struct rte_acl_match_results *p, + uint32_t categories) +{ + uint32_t x; + xmm_t results, priority, results1, priority1, selector; + xmm_t *saved_results, *saved_priority; + + for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) { + + saved_results = (xmm_t *)(&parms[n].cmplt->results[x]); + saved_priority = + (xmm_t *)(&parms[n].cmplt->priority[x]); + + /* get results and priorities for completed trie */ + results = _mm_loadu_si128( + (const xmm_t *)&p[transition].results[x]); + priority = _mm_loadu_si128( + (const xmm_t *)&p[transition].priority[x]); + + /* if this is not the first completed trie */ + if (parms[n].cmplt->count != ctx->num_tries) { + + /* get running best results and their priorities */ + results1 = _mm_loadu_si128(saved_results); + priority1 = _mm_loadu_si128(saved_priority); + + /* select results that are highest priority */ + selector = _mm_cmpgt_epi32(priority1, priority); + results = _mm_blendv_epi8(results, results1, selector); + priority = _mm_blendv_epi8(priority, priority1, + selector); + } + + /* save running best results and their priorities */ + _mm_storeu_si128(saved_results, results); + _mm_storeu_si128(saved_priority, priority); + } +} + +/* + * Extract transitions from an XMM register and check for any matches + */ +static void +acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx, + struct parms *parms, struct acl_flow_data *flows) +{ + uint64_t transition1, transition2; + + /* extract transition from low 64 bits. */ + transition1 = _mm_cvtsi128_si64(*indices); + + /* extract transition from high 64 bits. */ + *indices = _mm_shuffle_epi32(*indices, SHUFFLE32_SWAP64); + transition2 = _mm_cvtsi128_si64(*indices); + + transition1 = acl_match_check(transition1, slot, ctx, + parms, flows, resolve_priority_sse); + transition2 = acl_match_check(transition2, slot + 1, ctx, + parms, flows, resolve_priority_sse); + + /* update indices with new transitions. 
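+ * _mm_set_epi64x() takes its arguments high element first, so
+ * transition1 returns to the low 64 bits and transition2 to the high
+ * 64 bits, restoring the original slot order of *indices.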
*/ + *indices = _mm_set_epi64x(transition2, transition1); +} + +/* + * Check for any match in 4 transitions (contained in 2 SSE registers) + */ +static __rte_always_inline void +acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, xmm_t *indices1, xmm_t *indices2, + xmm_t match_mask) +{ + xmm_t temp; + + /* put low 32 bits of each transition into one register */ + temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2, + 0x88); + /* test for match node */ + temp = _mm_and_si128(match_mask, temp); + + while (!_mm_testz_si128(temp, temp)) { + acl_process_matches(indices1, slot, ctx, parms, flows); + acl_process_matches(indices2, slot + 2, ctx, parms, flows); + + temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, + (__m128)*indices2, + 0x88); + temp = _mm_and_si128(match_mask, temp); + } +} + +/* + * Process 4 transitions (in 2 XMM registers) in parallel + */ +static __rte_always_inline xmm_t +transition4(xmm_t next_input, const uint64_t *trans, + xmm_t *indices1, xmm_t *indices2) +{ + xmm_t addr, tr_lo, tr_hi; + uint64_t trans0, trans2; + + /* Shuffle low 32 into tr_lo and high 32 into tr_hi */ + ACL_TR_HILO(mm, __m128, *indices1, *indices2, tr_lo, tr_hi); + + /* Calculate the address (array index) for all 4 transitions. */ + ACL_TR_CALC_ADDR(mm, 128, addr, xmm_index_mask.x, next_input, + xmm_shuffle_input.x, xmm_ones_16.x, xmm_range_base.x, + tr_lo, tr_hi); + + /* Gather 64 bit transitions and pack back into 2 registers. */ + + trans0 = trans[_mm_cvtsi128_si32(addr)]; + + /* get slot 2 */ + + /* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */ + addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT2); + trans2 = trans[_mm_cvtsi128_si32(addr)]; + + /* get slot 1 */ + + /* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */ + addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT1); + *indices1 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans0); + + /* get slot 3 */ + + /* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */ + addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT3); + *indices2 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans2); + + return _mm_srli_epi32(next_input, CHAR_BIT); +} + +/* + * Execute trie traversal with 8 traversals in parallel + */ +static inline int +search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_SSE8]; + struct completion cmplt[MAX_SEARCHES_SSE8]; + struct parms parms[MAX_SEARCHES_SSE8]; + xmm_t input0, input1; + xmm_t indices1, indices2, indices3, indices4; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < MAX_SEARCHES_SSE8; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + /* + * indices1 contains index_array[0,1] + * indices2 contains index_array[2,3] + * indices3 contains index_array[4,5] + * indices4 contains index_array[6,7] + */ + + indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]); + indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]); + + indices3 = _mm_loadu_si128((xmm_t *) &index_array[4]); + indices4 = _mm_loadu_si128((xmm_t *) &index_array[6]); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + &indices1, &indices2, xmm_match_mask.x); + acl_match_check_x4(4, ctx, parms, &flows, + &indices3, &indices4, xmm_match_mask.x); + + while (flows.started > 0) { + + /* Gather 4 bytes of input data for each stream. 
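+ * Lanes 0-3 of input0 receive the next four input bytes of flows 0-3,
+ * lanes 0-3 of input1 those of flows 4-7; each transition4() call below
+ * consumes one byte per flow and shifts every lane right by CHAR_BIT.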
*/ + input0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0)); + input1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4)); + + input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 1), 1); + input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 5), 1); + + input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 2), 2); + input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 6), 2); + + input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 3), 3); + input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 7), 3); + + /* Process the 4 bytes of input on each stream. */ + + input0 = transition4(input0, flows.trans, + &indices1, &indices2); + input1 = transition4(input1, flows.trans, + &indices3, &indices4); + + input0 = transition4(input0, flows.trans, + &indices1, &indices2); + input1 = transition4(input1, flows.trans, + &indices3, &indices4); + + input0 = transition4(input0, flows.trans, + &indices1, &indices2); + input1 = transition4(input1, flows.trans, + &indices3, &indices4); + + input0 = transition4(input0, flows.trans, + &indices1, &indices2); + input1 = transition4(input1, flows.trans, + &indices3, &indices4); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + &indices1, &indices2, xmm_match_mask.x); + acl_match_check_x4(4, ctx, parms, &flows, + &indices3, &indices4, xmm_match_mask.x); + } + + return 0; +} + +/* + * Execute trie traversal with 4 traversals in parallel + */ +static inline int +search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, int total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_SSE4]; + struct completion cmplt[MAX_SEARCHES_SSE4]; + struct parms parms[MAX_SEARCHES_SSE4]; + xmm_t input, indices1, indices2; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < MAX_SEARCHES_SSE4; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]); + indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + &indices1, &indices2, xmm_match_mask.x); + + while (flows.started > 0) { + + /* Gather 4 bytes of input data for each stream. */ + input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0)); + input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 1), 1); + input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 2), 2); + input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 3), 3); + + /* Process the 4 bytes of input on each stream. */ + input = transition4(input, flows.trans, &indices1, &indices2); + input = transition4(input, flows.trans, &indices1, &indices2); + input = transition4(input, flows.trans, &indices1, &indices2); + input = transition4(input, flows.trans, &indices1, &indices2); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + &indices1, &indices2, xmm_match_mask.x); + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_acl/acl_vect.h b/src/spdk/dpdk/lib/librte_acl/acl_vect.h new file mode 100644 index 00000000..194fca90 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/acl_vect.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ACL_VECT_H_ +#define _RTE_ACL_VECT_H_ + +/** + * @file + * + * RTE ACL SSE/AVX related header. 
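+ *
+ * The ACL_TR_HILO() and ACL_TR_CALC_ADDR() macros defined here are
+ * parameterized by an intrinsic prefix (P) and a register width (S),
+ * e.g. ACL_TR_CALC_ADDR(mm, 128, ...) in acl_run_sse.h, so the same
+ * transition-address computation can be expanded for different SIMD
+ * register widths.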
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Takes 2 SIMD registers containing N transitions eachi (tr0, tr1). + * Shuffles it into different representation: + * lo - contains low 32 bits of given N transitions. + * hi - contains high 32 bits of given N transitions. + */ +#define ACL_TR_HILO(P, TC, tr0, tr1, lo, hi) do { \ + lo = (typeof(lo))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0x88); \ + hi = (typeof(hi))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0xdd); \ +} while (0) + + +/* + * Calculate the address of the next transition for + * all types of nodes. Note that only DFA nodes and range + * nodes actually transition to another node. Match + * nodes not supposed to be encountered here. + * For quad range nodes: + * Calculate number of range boundaries that are less than the + * input value. Range boundaries for each node are in signed 8 bit, + * ordered from -128 to 127. + * This is effectively a popcnt of bytes that are greater than the + * input byte. + * Single nodes are processed in the same ways as quad range nodes. +*/ +#define ACL_TR_CALC_ADDR(P, S, \ + addr, index_mask, next_input, shuffle_input, \ + ones_16, range_base, tr_lo, tr_hi) do { \ + \ + typeof(addr) in, node_type, r, t; \ + typeof(addr) dfa_msk, dfa_ofs, quad_ofs; \ + \ + t = _##P##_xor_si##S(index_mask, index_mask); \ + in = _##P##_shuffle_epi8(next_input, shuffle_input); \ + \ + /* Calc node type and node addr */ \ + node_type = _##P##_andnot_si##S(index_mask, tr_lo); \ + addr = _##P##_and_si##S(index_mask, tr_lo); \ + \ + /* mask for DFA type(0) nodes */ \ + dfa_msk = _##P##_cmpeq_epi32(node_type, t); \ + \ + /* DFA calculations. */ \ + r = _##P##_srli_epi32(in, 30); \ + r = _##P##_add_epi8(r, range_base); \ + t = _##P##_srli_epi32(in, 24); \ + r = _##P##_shuffle_epi8(tr_hi, r); \ + \ + dfa_ofs = _##P##_sub_epi32(t, r); \ + \ + /* QUAD/SINGLE caluclations. */ \ + t = _##P##_cmpgt_epi8(in, tr_hi); \ + t = _##P##_sign_epi8(t, t); \ + t = _##P##_maddubs_epi16(t, t); \ + quad_ofs = _##P##_madd_epi16(t, ones_16); \ + \ + /* blend DFA and QUAD/SINGLE. */ \ + t = _##P##_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk); \ + \ + /* calculate address for next transitions. */ \ + addr = _##P##_add_epi32(addr, t); \ +} while (0) + + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ACL_VECT_H_ */ diff --git a/src/spdk/dpdk/lib/librte_acl/meson.build b/src/spdk/dpdk/lib/librte_acl/meson.build new file mode 100644 index 00000000..aec792f5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/meson.build @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 2 +sources = files('acl_bld.c', 'acl_gen.c', 'acl_run_scalar.c', + 'rte_acl.c', 'tb_mem.c') +headers = files('rte_acl.h', 'rte_acl_osdep.h') + +if arch_subdir == 'x86' + sources += files('acl_run_sse.c') + + # compile AVX2 version if either: + # a. we have AVX supported in minimum instruction set baseline + # b. it's not minimum instruction set, but supported by compiler + # + # in former case, just add avx2 C file to files list + # in latter case, compile c file to static lib, using correct compiler + # flags, and then have the .o file from static lib linked into main lib. 
+ if dpdk_conf.has('RTE_MACHINE_CPUFLAG_AVX2') + sources += files('acl_run_avx2.c') + cflags += '-DCC_AVX2_SUPPORT' + elif cc.has_argument('-mavx2') + avx2_tmplib = static_library('avx2_tmp', + 'acl_run_avx2.c', + dependencies: static_rte_eal, + c_args: '-mavx2') + objs += avx2_tmplib.extract_objects('acl_run_avx2.c') + cflags += '-DCC_AVX2_SUPPORT' + endif + +endif diff --git a/src/spdk/dpdk/lib/librte_acl/rte_acl.c b/src/spdk/dpdk/lib/librte_acl/rte_acl.c new file mode 100644 index 00000000..2f1243cd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/rte_acl.c @@ -0,0 +1,375 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include "acl.h" + +TAILQ_HEAD(rte_acl_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_acl_tailq = { + .name = "RTE_ACL", +}; +EAL_REGISTER_TAILQ(rte_acl_tailq) + +/* + * If the compiler doesn't support AVX2 instructions, + * then the dummy one would be used instead for AVX2 classify method. + */ +int __attribute__ ((weak)) +rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx, + __rte_unused const uint8_t **data, + __rte_unused uint32_t *results, + __rte_unused uint32_t num, + __rte_unused uint32_t categories) +{ + return -ENOTSUP; +} + +int __attribute__ ((weak)) +rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx, + __rte_unused const uint8_t **data, + __rte_unused uint32_t *results, + __rte_unused uint32_t num, + __rte_unused uint32_t categories) +{ + return -ENOTSUP; +} + +int __attribute__ ((weak)) +rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx, + __rte_unused const uint8_t **data, + __rte_unused uint32_t *results, + __rte_unused uint32_t num, + __rte_unused uint32_t categories) +{ + return -ENOTSUP; +} + +int __attribute__ ((weak)) +rte_acl_classify_altivec(__rte_unused const struct rte_acl_ctx *ctx, + __rte_unused const uint8_t **data, + __rte_unused uint32_t *results, + __rte_unused uint32_t num, + __rte_unused uint32_t categories) +{ + return -ENOTSUP; +} + +static const rte_acl_classify_t classify_fns[] = { + [RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar, + [RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar, + [RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse, + [RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2, + [RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon, + [RTE_ACL_CLASSIFY_ALTIVEC] = rte_acl_classify_altivec, +}; + +/* by default, use always available scalar code path. */ +static enum rte_acl_classify_alg rte_acl_default_classify = + RTE_ACL_CLASSIFY_SCALAR; + +static void +rte_acl_set_default_classify(enum rte_acl_classify_alg alg) +{ + rte_acl_default_classify = alg; +} + +extern int +rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx, enum rte_acl_classify_alg alg) +{ + if (ctx == NULL || (uint32_t)alg >= RTE_DIM(classify_fns)) + return -EINVAL; + + ctx->alg = alg; + return 0; +} + +/* + * Select highest available classify method as default one. + * Note that CLASSIFY_AVX2 should be set as a default only + * if both conditions are met: + * at build time compiler supports AVX2 and target cpu supports AVX2. 
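+ * The default selected here can still be overridden per context with
+ * rte_acl_set_ctx_classify() or per call with rte_acl_classify_alg().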
+ */ +RTE_INIT(rte_acl_init) +{ + enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT; + +#if defined(RTE_ARCH_ARM64) + alg = RTE_ACL_CLASSIFY_NEON; +#elif defined(RTE_ARCH_ARM) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) + alg = RTE_ACL_CLASSIFY_NEON; +#elif defined(RTE_ARCH_PPC_64) + alg = RTE_ACL_CLASSIFY_ALTIVEC; +#else +#ifdef CC_AVX2_SUPPORT + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) + alg = RTE_ACL_CLASSIFY_AVX2; + else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1)) +#else + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1)) +#endif + alg = RTE_ACL_CLASSIFY_SSE; + +#endif + rte_acl_set_default_classify(alg); +} + +int +rte_acl_classify_alg(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories, + enum rte_acl_classify_alg alg) +{ + if (categories != 1 && + ((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0) + return -EINVAL; + + return classify_fns[alg](ctx, data, results, num, categories); +} + +int +rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + return rte_acl_classify_alg(ctx, data, results, num, categories, + ctx->alg); +} + +struct rte_acl_ctx * +rte_acl_find_existing(const char *name) +{ + struct rte_acl_ctx *ctx = NULL; + struct rte_acl_list *acl_list; + struct rte_tailq_entry *te; + + acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, acl_list, next) { + ctx = (struct rte_acl_ctx *) te->data; + if (strncmp(name, ctx->name, sizeof(ctx->name)) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + return ctx; +} + +void +rte_acl_free(struct rte_acl_ctx *ctx) +{ + struct rte_acl_list *acl_list; + struct rte_tailq_entry *te; + + if (ctx == NULL) + return; + + acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find our tailq entry */ + TAILQ_FOREACH(te, acl_list, next) { + if (te->data == (void *) ctx) + break; + } + if (te == NULL) { + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return; + } + + TAILQ_REMOVE(acl_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_free(ctx->mem); + rte_free(ctx); + rte_free(te); +} + +struct rte_acl_ctx * +rte_acl_create(const struct rte_acl_param *param) +{ + size_t sz; + struct rte_acl_ctx *ctx; + struct rte_acl_list *acl_list; + struct rte_tailq_entry *te; + char name[sizeof(ctx->name)]; + + acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list); + + /* check that input parameters are valid. */ + if (param == NULL || param->name == NULL) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(name, sizeof(name), "ACL_%s", param->name); + + /* calculate amount of memory required for pattern set. */ + sz = sizeof(*ctx) + param->max_rule_num * param->rule_size; + + /* get EAL TAILQ lock. */ + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* if we already have one with that name */ + TAILQ_FOREACH(te, acl_list, next) { + ctx = (struct rte_acl_ctx *) te->data; + if (strncmp(param->name, ctx->name, sizeof(ctx->name)) == 0) + break; + } + + /* if ACL with such name doesn't exist, then create a new one. 
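+ * Otherwise the context found in the tailq walk above is returned
+ * unchanged to the caller.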
*/ + if (te == NULL) { + ctx = NULL; + te = rte_zmalloc("ACL_TAILQ_ENTRY", sizeof(*te), 0); + + if (te == NULL) { + RTE_LOG(ERR, ACL, "Cannot allocate tailq entry!\n"); + goto exit; + } + + ctx = rte_zmalloc_socket(name, sz, RTE_CACHE_LINE_SIZE, param->socket_id); + + if (ctx == NULL) { + RTE_LOG(ERR, ACL, + "allocation of %zu bytes on socket %d for %s failed\n", + sz, param->socket_id, name); + rte_free(te); + goto exit; + } + /* init new allocated context. */ + ctx->rules = ctx + 1; + ctx->max_rules = param->max_rule_num; + ctx->rule_sz = param->rule_size; + ctx->socket_id = param->socket_id; + ctx->alg = rte_acl_default_classify; + snprintf(ctx->name, sizeof(ctx->name), "%s", param->name); + + te->data = (void *) ctx; + + TAILQ_INSERT_TAIL(acl_list, te, next); + } + +exit: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return ctx; +} + +static int +acl_add_rules(struct rte_acl_ctx *ctx, const void *rules, uint32_t num) +{ + uint8_t *pos; + + if (num + ctx->num_rules > ctx->max_rules) + return -ENOMEM; + + pos = ctx->rules; + pos += ctx->rule_sz * ctx->num_rules; + memcpy(pos, rules, num * ctx->rule_sz); + ctx->num_rules += num; + + return 0; +} + +static int +acl_check_rule(const struct rte_acl_rule_data *rd) +{ + if ((RTE_LEN2MASK(RTE_ACL_MAX_CATEGORIES, typeof(rd->category_mask)) & + rd->category_mask) == 0 || + rd->priority > RTE_ACL_MAX_PRIORITY || + rd->priority < RTE_ACL_MIN_PRIORITY) + return -EINVAL; + return 0; +} + +int +rte_acl_add_rules(struct rte_acl_ctx *ctx, const struct rte_acl_rule *rules, + uint32_t num) +{ + const struct rte_acl_rule *rv; + uint32_t i; + int32_t rc; + + if (ctx == NULL || rules == NULL || 0 == ctx->rule_sz) + return -EINVAL; + + for (i = 0; i != num; i++) { + rv = (const struct rte_acl_rule *) + ((uintptr_t)rules + i * ctx->rule_sz); + rc = acl_check_rule(&rv->data); + if (rc != 0) { + RTE_LOG(ERR, ACL, "%s(%s): rule #%u is invalid\n", + __func__, ctx->name, i + 1); + return rc; + } + } + + return acl_add_rules(ctx, rules, num); +} + +/* + * Reset all rules. + * Note that RT structures are not affected. + */ +void +rte_acl_reset_rules(struct rte_acl_ctx *ctx) +{ + if (ctx != NULL) + ctx->num_rules = 0; +} + +/* + * Reset all rules and destroys RT structures. + */ +void +rte_acl_reset(struct rte_acl_ctx *ctx) +{ + if (ctx != NULL) { + rte_acl_reset_rules(ctx); + rte_acl_build(ctx, &ctx->config); + } +} + +/* + * Dump ACL context to the stdout. + */ +void +rte_acl_dump(const struct rte_acl_ctx *ctx) +{ + if (!ctx) + return; + printf("acl context <%s>@%p\n", ctx->name, ctx); + printf(" socket_id=%"PRId32"\n", ctx->socket_id); + printf(" alg=%"PRId32"\n", ctx->alg); + printf(" max_rules=%"PRIu32"\n", ctx->max_rules); + printf(" rule_size=%"PRIu32"\n", ctx->rule_sz); + printf(" num_rules=%"PRIu32"\n", ctx->num_rules); + printf(" num_categories=%"PRIu32"\n", ctx->num_categories); + printf(" num_tries=%"PRIu32"\n", ctx->num_tries); +} + +/* + * Dump all ACL contexts to the stdout. 
+ */ +void +rte_acl_list_dump(void) +{ + struct rte_acl_ctx *ctx; + struct rte_acl_list *acl_list; + struct rte_tailq_entry *te; + + acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, acl_list, next) { + ctx = (struct rte_acl_ctx *) te->data; + rte_acl_dump(ctx); + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); +} diff --git a/src/spdk/dpdk/lib/librte_acl/rte_acl.h b/src/spdk/dpdk/lib/librte_acl/rte_acl.h new file mode 100644 index 00000000..34c3b9c6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/rte_acl.h @@ -0,0 +1,358 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ACL_H_ +#define _RTE_ACL_H_ + +/** + * @file + * + * RTE Classifier. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_ACL_MAX_CATEGORIES 16 + +#define RTE_ACL_RESULTS_MULTIPLIER (XMM_SIZE / sizeof(uint32_t)) + +#define RTE_ACL_MAX_LEVELS 64 +#define RTE_ACL_MAX_FIELDS 64 + +union rte_acl_field_types { + uint8_t u8; + uint16_t u16; + uint32_t u32; + uint64_t u64; +}; + +enum { + RTE_ACL_FIELD_TYPE_MASK = 0, + RTE_ACL_FIELD_TYPE_RANGE, + RTE_ACL_FIELD_TYPE_BITMASK +}; + +/** + * ACL Field definition. + * Each field in the ACL rule has an associate definition. + * It defines the type of field, its size, its offset in the input buffer, + * the field index, and the input index. + * For performance reasons, the inner loop of the search function is unrolled + * to process four input bytes at a time. This requires the input to be grouped + * into sets of 4 consecutive bytes. The loop processes the first input byte as + * part of the setup and then subsequent bytes must be in groups of 4 + * consecutive bytes. + */ +struct rte_acl_field_def { + uint8_t type; /**< type - RTE_ACL_FIELD_TYPE_*. */ + uint8_t size; /**< size of field 1,2,4, or 8. */ + uint8_t field_index; /**< index of field inside the rule. */ + uint8_t input_index; /**< 0-N input index. */ + uint32_t offset; /**< offset to start of field. */ +}; + +/** + * ACL build configuration. + * Defines the fields of an ACL trie and number of categories to build with. + */ +struct rte_acl_config { + uint32_t num_categories; /**< Number of categories to build with. */ + uint32_t num_fields; /**< Number of field definitions. */ + struct rte_acl_field_def defs[RTE_ACL_MAX_FIELDS]; + /**< array of field definitions. */ + size_t max_size; + /**< max memory limit for internal run-time structures. */ +}; + +/** + * Defines the value of a field for a rule. + */ +struct rte_acl_field { + union rte_acl_field_types value; + /**< a 1,2,4, or 8 byte value of the field. */ + union rte_acl_field_types mask_range; + /**< + * depending on field type: + * mask -> 1.2.3.4/32 value=0x1020304, mask_range=32, + * range -> 0 : 65535 value=0, mask_range=65535, + * bitmask -> 0x06/0xff value=6, mask_range=0xff. + */ +}; + +enum { + RTE_ACL_TYPE_SHIFT = 29, + RTE_ACL_MAX_INDEX = RTE_LEN2MASK(RTE_ACL_TYPE_SHIFT, uint32_t), + RTE_ACL_MAX_PRIORITY = RTE_ACL_MAX_INDEX, + RTE_ACL_MIN_PRIORITY = 0, +}; + +#define RTE_ACL_MASKLEN_TO_BITMASK(v, s) \ +((v) == 0 ? (v) : (typeof(v))((uint64_t)-1 << ((s) * CHAR_BIT - (v)))) + +/** + * Miscellaneous data for ACL rule. + */ +struct rte_acl_rule_data { + uint32_t category_mask; /**< Mask of categories for that rule. */ + int32_t priority; /**< Priority for that rule. */ + uint32_t userdata; /**< Associated with the rule user data. */ +}; + +/** + * Defines single ACL rule. 
+ * data - miscellaneous data for the rule. + * field[] - value and mask or range for each field. + */ +#define RTE_ACL_RULE_DEF(name, fld_num) struct name {\ + struct rte_acl_rule_data data; \ + struct rte_acl_field field[fld_num]; \ +} + +RTE_ACL_RULE_DEF(rte_acl_rule,); + +#define RTE_ACL_RULE_SZ(fld_num) \ + (sizeof(struct rte_acl_rule) + sizeof(struct rte_acl_field) * (fld_num)) + + +/** Max number of characters in name.*/ +#define RTE_ACL_NAMESIZE 32 + +/** + * Parameters used when creating the ACL context. + */ +struct rte_acl_param { + const char *name; /**< Name of the ACL context. */ + int socket_id; /**< Socket ID to allocate memory for. */ + uint32_t rule_size; /**< Size of each rule. */ + uint32_t max_rule_num; /**< Maximum number of rules. */ +}; + + +/** + * Create a new ACL context. + * + * @param param + * Parameters used to create and initialise the ACL context. + * @return + * Pointer to ACL context structure that is used in future ACL + * operations, or NULL on error, with error code set in rte_errno. + * Possible rte_errno errors include: + * - EINVAL - invalid parameter passed to function + */ +struct rte_acl_ctx * +rte_acl_create(const struct rte_acl_param *param); + +/** + * Find an existing ACL context object and return a pointer to it. + * + * @param name + * Name of the ACL context as passed to rte_acl_create() + * @return + * Pointer to ACL context or NULL if object not found + * with rte_errno set appropriately. Possible rte_errno values include: + * - ENOENT - value not available for return + */ +struct rte_acl_ctx * +rte_acl_find_existing(const char *name); + +/** + * De-allocate all memory used by ACL context. + * + * @param ctx + * ACL context to free + */ +void +rte_acl_free(struct rte_acl_ctx *ctx); + +/** + * Add rules to an existing ACL context. + * This function is not multi-thread safe. + * + * @param ctx + * ACL context to add patterns to. + * @param rules + * Array of rules to add to the ACL context. + * Note that all fields in rte_acl_rule structures are expected + * to be in host byte order. + * Each rule expected to be in the same format and not exceed size + * specified at ACL context creation time. + * @param num + * Number of elements in the input array of rules. + * @return + * - -ENOMEM if there is no space in the ACL context for these rules. + * - -EINVAL if the parameters are invalid. + * - Zero if operation completed successfully. + */ +int +rte_acl_add_rules(struct rte_acl_ctx *ctx, const struct rte_acl_rule *rules, + uint32_t num); + +/** + * Delete all rules from the ACL context. + * This function is not multi-thread safe. + * Note that internal run-time structures are not affected. + * + * @param ctx + * ACL context to delete rules from. + */ +void +rte_acl_reset_rules(struct rte_acl_ctx *ctx); + +/** + * Analyze set of rules and build required internal run-time structures. + * This function is not multi-thread safe. + * + * @param ctx + * ACL context to build. + * @param cfg + * Pointer to struct rte_acl_config - defines build parameters. + * @return + * - -ENOMEM if couldn't allocate enough memory. + * - -EINVAL if the parameters are invalid. + * - Negative error code if operation failed. + * - Zero if operation completed successfully. + */ +int +rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg); + +/** + * Delete all rules from the ACL context and + * destroy all internal run-time structures. + * This function is not multi-thread safe. + * + * @param ctx + * ACL context to reset. 
+ */ +void +rte_acl_reset(struct rte_acl_ctx *ctx); + +/** + * Available implementations of ACL classify. + */ +enum rte_acl_classify_alg { + RTE_ACL_CLASSIFY_DEFAULT = 0, + RTE_ACL_CLASSIFY_SCALAR = 1, /**< generic implementation. */ + RTE_ACL_CLASSIFY_SSE = 2, /**< requires SSE4.1 support. */ + RTE_ACL_CLASSIFY_AVX2 = 3, /**< requires AVX2 support. */ + RTE_ACL_CLASSIFY_NEON = 4, /**< requires NEON support. */ + RTE_ACL_CLASSIFY_ALTIVEC = 5, /**< requires ALTIVEC support. */ + RTE_ACL_CLASSIFY_NUM /* should always be the last one. */ +}; + +/** + * Perform search for a matching ACL rule for each input data buffer. + * Each input data buffer can have up to *categories* matches. + * That implies that results array should be big enough to hold + * (categories * num) elements. + * Also categories parameter should be either one or multiple of + * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than RTE_ACL_MAX_CATEGORIES. + * If more than one rule is applicable for given input buffer and + * given category, then rule with highest priority will be returned as a match. + * Note, that it is a caller's responsibility to ensure that input parameters + * are valid and point to correct memory locations. + * + * @param ctx + * ACL context to search with. + * @param data + * Array of pointers to input data buffers to perform search. + * Note that all fields in input data buffers supposed to be in network + * byte order (MSB). + * @param results + * Array of search results, *categories* results per each input data buffer. + * @param num + * Number of elements in the input data buffers array. + * @param categories + * Number of maximum possible matches for each input buffer, one possible + * match per category. + * @return + * zero on successful completion. + * -EINVAL for incorrect arguments. + */ +extern int +rte_acl_classify(const struct rte_acl_ctx *ctx, + const uint8_t **data, + uint32_t *results, uint32_t num, + uint32_t categories); + +/** + * Perform search using specified algorithm for a matching ACL rule for + * each input data buffer. + * Each input data buffer can have up to *categories* matches. + * That implies that results array should be big enough to hold + * (categories * num) elements. + * Also categories parameter should be either one or multiple of + * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than RTE_ACL_MAX_CATEGORIES. + * If more than one rule is applicable for given input buffer and + * given category, then rule with highest priority will be returned as a match. + * Note, that it is a caller's responsibility to ensure that input parameters + * are valid and point to correct memory locations. + * + * @param ctx + * ACL context to search with. + * @param data + * Array of pointers to input data buffers to perform search. + * Note that all fields in input data buffers supposed to be in network + * byte order (MSB). + * @param results + * Array of search results, *categories* results per each input data buffer. + * @param num + * Number of elements in the input data buffers array. + * @param categories + * Number of maximum possible matches for each input buffer, one possible + * match per category. + * @param alg + * Algorithm to be used for the search. + * It is the caller responsibility to ensure that the value refers to the + * existing algorithm, and that it could be run on the given CPU. + * @return + * zero on successful completion. + * -EINVAL for incorrect arguments. 
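+ *
+ * An illustrative call for a burst of buffers with a single category
+ * (BURST and the buffer setup are hypothetical, not part of this API):
+ *
+ *	const uint8_t *bufs[BURST];	/- field blobs, network byte order
+ *	uint32_t res[BURST];		/- one result per buffer (categories = 1)
+ *
+ *	int rc = rte_acl_classify_alg(ctx, bufs, res, BURST, 1,
+ *			RTE_ACL_CLASSIFY_SCALAR);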
+ */ +extern int +rte_acl_classify_alg(const struct rte_acl_ctx *ctx, + const uint8_t **data, + uint32_t *results, uint32_t num, + uint32_t categories, + enum rte_acl_classify_alg alg); + +/* + * Override the default classifier function for a given ACL context. + * @param ctx + * ACL context to change classify function for. + * @param alg + * New default classify algorithm for given ACL context. + * It is the caller responsibility to ensure that the value refers to the + * existing algorithm, and that it could be run on the given CPU. + * @return + * - -EINVAL if the parameters are invalid. + * - Zero if operation completed successfully. + */ +extern int +rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx, + enum rte_acl_classify_alg alg); + +/** + * Dump an ACL context structure to the console. + * + * @param ctx + * ACL context to dump. + */ +void +rte_acl_dump(const struct rte_acl_ctx *ctx); + +/** + * Dump all ACL context structures to the console. + */ +void +rte_acl_list_dump(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ACL_H_ */ diff --git a/src/spdk/dpdk/lib/librte_acl/rte_acl_osdep.h b/src/spdk/dpdk/lib/librte_acl/rte_acl_osdep.h new file mode 100644 index 00000000..68c16960 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/rte_acl_osdep.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ACL_OSDEP_H_ +#define _RTE_ACL_OSDEP_H_ + +/** + * @file + * + * RTE ACL DPDK/OS dependent file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Common defines. + */ + +#define DIM(x) RTE_DIM(x) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif /* _RTE_ACL_OSDEP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_acl/rte_acl_version.map b/src/spdk/dpdk/lib/librte_acl/rte_acl_version.map new file mode 100644 index 00000000..b09370a1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/rte_acl_version.map @@ -0,0 +1,19 @@ +DPDK_2.0 { + global: + + rte_acl_add_rules; + rte_acl_build; + rte_acl_classify; + rte_acl_classify_alg; + rte_acl_classify_scalar; + rte_acl_create; + rte_acl_dump; + rte_acl_find_existing; + rte_acl_free; + rte_acl_list_dump; + rte_acl_reset; + rte_acl_reset_rules; + rte_acl_set_ctx_classify; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_acl/tb_mem.c b/src/spdk/dpdk/lib/librte_acl/tb_mem.c new file mode 100644 index 00000000..f14d7b4f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/tb_mem.c @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include "tb_mem.h" + +/* + * Memory management routines for temporary memory. + * That memory is used only during build phase and is released after + * build is finished. + * Note, that tb_pool/tb_alloc() are not supposed to return NULL. + * Instead, in the case of failure to allocate memory, + * it would do siglongjmp(pool->fail). + * It is responsibility of the caller to save the proper context/environment, + * in the pool->fail before calling tb_alloc() for the given pool first time. 
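+ *
+ * An illustrative caller pattern (hypothetical snippet; the real
+ * context set-up lives in the ACL build code):
+ *
+ *	if (sigsetjmp(pool->fail, 0) != 0) {
+ *		tb_free_pool(pool);	/- a tb_alloc() below ran out of memory
+ *		return -ENOMEM;
+ *	}
+ *	node = tb_alloc(pool, sizeof(*node));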
+ */ + +static struct tb_mem_block * +tb_pool(struct tb_mem_pool *pool, size_t sz) +{ + struct tb_mem_block *block; + uint8_t *ptr; + size_t size; + + size = sz + pool->alignment - 1; + block = calloc(1, size + sizeof(*pool->block)); + if (block == NULL) { + RTE_LOG(ERR, MALLOC, "%s(%zu)\n failed, currently allocated " + "by pool: %zu bytes\n", __func__, sz, pool->alloc); + siglongjmp(pool->fail, -ENOMEM); + return NULL; + } + + block->pool = pool; + + block->next = pool->block; + pool->block = block; + + pool->alloc += size; + + ptr = (uint8_t *)(block + 1); + block->mem = RTE_PTR_ALIGN_CEIL(ptr, pool->alignment); + block->size = size - (block->mem - ptr); + + return block; +} + +void * +tb_alloc(struct tb_mem_pool *pool, size_t size) +{ + struct tb_mem_block *block; + void *ptr; + size_t new_sz; + + size = RTE_ALIGN_CEIL(size, pool->alignment); + + block = pool->block; + if (block == NULL || block->size < size) { + new_sz = (size > pool->min_alloc) ? size : pool->min_alloc; + block = tb_pool(pool, new_sz); + } + ptr = block->mem; + block->size -= size; + block->mem += size; + return ptr; +} + +void +tb_free_pool(struct tb_mem_pool *pool) +{ + struct tb_mem_block *next, *block; + + for (block = pool->block; block != NULL; block = next) { + next = block->next; + free(block); + } + pool->block = NULL; + pool->alloc = 0; +} diff --git a/src/spdk/dpdk/lib/librte_acl/tb_mem.h b/src/spdk/dpdk/lib/librte_acl/tb_mem.h new file mode 100644 index 00000000..50a803c7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_acl/tb_mem.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _TB_MEM_H_ +#define _TB_MEM_H_ + +/** + * @file + * + * RTE ACL temporary (build phase) memory management. + * Contains structures and functions to manage temporary (used by build only) + * memory. Memory allocated in large blocks to speed 'free' when trie is + * destructed (finish of build phase). + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +struct tb_mem_block { + struct tb_mem_block *next; + struct tb_mem_pool *pool; + size_t size; + uint8_t *mem; +}; + +struct tb_mem_pool { + struct tb_mem_block *block; + size_t alignment; + size_t min_alloc; + size_t alloc; + /* jump target in case of memory allocation failure. 
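+	 * Must be armed with sigsetjmp() by the user of the pool before the
+	 * first tb_alloc() call; tb_alloc() siglongjmp()s here with -ENOMEM
+	 * when allocation fails.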
*/ + sigjmp_buf fail; +}; + +void *tb_alloc(struct tb_mem_pool *pool, size_t size); +void tb_free_pool(struct tb_mem_pool *pool); + +#ifdef __cplusplus +} +#endif + +#endif /* _TB_MEM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bbdev/Makefile b/src/spdk/dpdk/lib/librte_bbdev/Makefile new file mode 100644 index 00000000..1451adb2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bbdev/Makefile @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_bbdev.a + +# library version +LIBABIVER := 1 + +# build flags +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf + +# library source files +SRCS-y += rte_bbdev.c + +# export include files +SYMLINK-y-include += rte_bbdev_op.h +SYMLINK-y-include += rte_bbdev.h +SYMLINK-y-include += rte_bbdev_pmd.h + +# versioning export map +EXPORT_MAP := rte_bbdev_version.map + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_bbdev/meson.build b/src/spdk/dpdk/lib/librte_bbdev/meson.build new file mode 100644 index 00000000..f6ca0ad7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bbdev/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +allow_experimental_apis = true +sources = files('rte_bbdev.c') +headers = files('rte_bbdev.h', + 'rte_bbdev_pmd.h', + 'rte_bbdev_op.h') +deps += ['mbuf'] diff --git a/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.c b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.c new file mode 100644 index 00000000..c4cc18d9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.c @@ -0,0 +1,1133 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_bbdev_op.h" +#include "rte_bbdev.h" +#include "rte_bbdev_pmd.h" + +#define DEV_NAME "BBDEV" + + +/* BBDev library logging ID */ +static int bbdev_logtype; + +/* Helper macro for logging */ +#define rte_bbdev_log(level, fmt, ...) \ + rte_log(RTE_LOG_ ## level, bbdev_logtype, fmt "\n", ##__VA_ARGS__) + +#define rte_bbdev_log_debug(fmt, ...) 
\ + rte_bbdev_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \ + ##__VA_ARGS__) + +/* Helper macro to check dev_id is valid */ +#define VALID_DEV_OR_RET_ERR(dev, dev_id) do { \ + if (dev == NULL) { \ + rte_bbdev_log(ERR, "device %u is invalid", dev_id); \ + return -ENODEV; \ + } \ +} while (0) + +/* Helper macro to check dev_ops is valid */ +#define VALID_DEV_OPS_OR_RET_ERR(dev, dev_id) do { \ + if (dev->dev_ops == NULL) { \ + rte_bbdev_log(ERR, "NULL dev_ops structure in device %u", \ + dev_id); \ + return -ENODEV; \ + } \ +} while (0) + +/* Helper macro to check that driver implements required function pointer */ +#define VALID_FUNC_OR_RET_ERR(func, dev_id) do { \ + if (func == NULL) { \ + rte_bbdev_log(ERR, "device %u does not support %s", \ + dev_id, #func); \ + return -ENOTSUP; \ + } \ +} while (0) + +/* Helper macro to check that queue is valid */ +#define VALID_QUEUE_OR_RET_ERR(queue_id, dev) do { \ + if (queue_id >= dev->data->num_queues) { \ + rte_bbdev_log(ERR, "Invalid queue_id %u for device %u", \ + queue_id, dev->data->dev_id); \ + return -ERANGE; \ + } \ +} while (0) + +/* List of callback functions registered by an application */ +struct rte_bbdev_callback { + TAILQ_ENTRY(rte_bbdev_callback) next; /* Callbacks list */ + rte_bbdev_cb_fn cb_fn; /* Callback address */ + void *cb_arg; /* Parameter for callback */ + void *ret_param; /* Return parameter */ + enum rte_bbdev_event_type event; /* Interrupt event type */ + uint32_t active; /* Callback is executing */ +}; + +/* spinlock for bbdev device callbacks */ +static rte_spinlock_t rte_bbdev_cb_lock = RTE_SPINLOCK_INITIALIZER; + +/* + * Global array of all devices. This is not static because it's used by the + * inline enqueue and dequeue functions + */ +struct rte_bbdev rte_bbdev_devices[RTE_BBDEV_MAX_DEVS]; + +/* Global array with rte_bbdev_data structures */ +static struct rte_bbdev_data *rte_bbdev_data; + +/* Memzone name for global bbdev data pool */ +static const char *MZ_RTE_BBDEV_DATA = "rte_bbdev_data"; + +/* Number of currently valid devices */ +static uint16_t num_devs; + +/* Return pointer to device structure, with validity check */ +static struct rte_bbdev * +get_dev(uint16_t dev_id) +{ + if (rte_bbdev_is_valid(dev_id)) + return &rte_bbdev_devices[dev_id]; + return NULL; +} + +/* Allocate global data array */ +static int +rte_bbdev_data_alloc(void) +{ + const unsigned int flags = 0; + const struct rte_memzone *mz; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + mz = rte_memzone_reserve(MZ_RTE_BBDEV_DATA, + RTE_BBDEV_MAX_DEVS * sizeof(*rte_bbdev_data), + rte_socket_id(), flags); + } else + mz = rte_memzone_lookup(MZ_RTE_BBDEV_DATA); + if (mz == NULL) { + rte_bbdev_log(CRIT, + "Cannot allocate memzone for bbdev port data"); + return -ENOMEM; + } + + rte_bbdev_data = mz->addr; + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + memset(rte_bbdev_data, 0, + RTE_BBDEV_MAX_DEVS * sizeof(*rte_bbdev_data)); + return 0; +} + +/* + * Find data alocated for the device or if not found return first unused bbdev + * data. If all structures are in use and none is used by the device return + * NULL. 
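+ * Matching by name lets another process attached to the shared
+ * rte_bbdev_data memzone reuse the slot that was populated earlier for
+ * the same device name, rather than claiming a fresh one.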
+ */ +static struct rte_bbdev_data * +find_bbdev_data(const char *name) +{ + uint16_t data_id; + + for (data_id = 0; data_id < RTE_BBDEV_MAX_DEVS; ++data_id) { + if (strlen(rte_bbdev_data[data_id].name) == 0) { + memset(&rte_bbdev_data[data_id], 0, + sizeof(struct rte_bbdev_data)); + return &rte_bbdev_data[data_id]; + } else if (strncmp(rte_bbdev_data[data_id].name, name, + RTE_BBDEV_NAME_MAX_LEN) == 0) + return &rte_bbdev_data[data_id]; + } + + return NULL; +} + +/* Find lowest device id with no attached device */ +static uint16_t +find_free_dev_id(void) +{ + uint16_t i; + for (i = 0; i < RTE_BBDEV_MAX_DEVS; i++) { + if (rte_bbdev_devices[i].state == RTE_BBDEV_UNUSED) + return i; + } + return RTE_BBDEV_MAX_DEVS; +} + +struct rte_bbdev * __rte_experimental +rte_bbdev_allocate(const char *name) +{ + int ret; + struct rte_bbdev *bbdev; + uint16_t dev_id; + + if (name == NULL) { + rte_bbdev_log(ERR, "Invalid null device name"); + return NULL; + } + + if (rte_bbdev_get_named_dev(name) != NULL) { + rte_bbdev_log(ERR, "Device \"%s\" is already allocated", name); + return NULL; + } + + dev_id = find_free_dev_id(); + if (dev_id == RTE_BBDEV_MAX_DEVS) { + rte_bbdev_log(ERR, "Reached maximum number of devices"); + return NULL; + } + + bbdev = &rte_bbdev_devices[dev_id]; + + if (rte_bbdev_data == NULL) { + ret = rte_bbdev_data_alloc(); + if (ret != 0) + return NULL; + } + + bbdev->data = find_bbdev_data(name); + if (bbdev->data == NULL) { + rte_bbdev_log(ERR, + "Max BBDevs already allocated in multi-process environment!"); + return NULL; + } + + rte_atomic16_inc(&bbdev->data->process_cnt); + bbdev->data->dev_id = dev_id; + bbdev->state = RTE_BBDEV_INITIALIZED; + + ret = snprintf(bbdev->data->name, RTE_BBDEV_NAME_MAX_LEN, "%s", name); + if ((ret < 0) || (ret >= RTE_BBDEV_NAME_MAX_LEN)) { + rte_bbdev_log(ERR, "Copying device name \"%s\" failed", name); + return NULL; + } + + /* init user callbacks */ + TAILQ_INIT(&(bbdev->list_cbs)); + + num_devs++; + + rte_bbdev_log_debug("Initialised device %s (id = %u). Num devices = %u", + name, dev_id, num_devs); + + return bbdev; +} + +int __rte_experimental +rte_bbdev_release(struct rte_bbdev *bbdev) +{ + uint16_t dev_id; + struct rte_bbdev_callback *cb, *next; + + if (bbdev == NULL) { + rte_bbdev_log(ERR, "NULL bbdev"); + return -ENODEV; + } + dev_id = bbdev->data->dev_id; + + /* free all callbacks from the device's list */ + for (cb = TAILQ_FIRST(&bbdev->list_cbs); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + TAILQ_REMOVE(&(bbdev->list_cbs), cb, next); + rte_free(cb); + } + + /* clear shared BBDev Data if no process is using the device anymore */ + if (rte_atomic16_dec_and_test(&bbdev->data->process_cnt)) + memset(bbdev->data, 0, sizeof(*bbdev->data)); + + memset(bbdev, 0, sizeof(*bbdev)); + num_devs--; + bbdev->state = RTE_BBDEV_UNUSED; + + rte_bbdev_log_debug( + "Un-initialised device id = %u. 
Num devices = %u", + dev_id, num_devs); + return 0; +} + +struct rte_bbdev * __rte_experimental +rte_bbdev_get_named_dev(const char *name) +{ + unsigned int i; + + if (name == NULL) { + rte_bbdev_log(ERR, "NULL driver name"); + return NULL; + } + + for (i = 0; i < RTE_BBDEV_MAX_DEVS; i++) { + struct rte_bbdev *dev = get_dev(i); + if (dev && (strncmp(dev->data->name, + name, RTE_BBDEV_NAME_MAX_LEN) == 0)) + return dev; + } + + return NULL; +} + +uint16_t __rte_experimental +rte_bbdev_count(void) +{ + return num_devs; +} + +bool __rte_experimental +rte_bbdev_is_valid(uint16_t dev_id) +{ + if ((dev_id < RTE_BBDEV_MAX_DEVS) && + rte_bbdev_devices[dev_id].state == RTE_BBDEV_INITIALIZED) + return true; + return false; +} + +uint16_t __rte_experimental +rte_bbdev_find_next(uint16_t dev_id) +{ + dev_id++; + for (; dev_id < RTE_BBDEV_MAX_DEVS; dev_id++) + if (rte_bbdev_is_valid(dev_id)) + break; + return dev_id; +} + +int __rte_experimental +rte_bbdev_setup_queues(uint16_t dev_id, uint16_t num_queues, int socket_id) +{ + unsigned int i; + int ret; + struct rte_bbdev_driver_info dev_info; + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + if (dev->data->started) { + rte_bbdev_log(ERR, + "Device %u cannot be configured when started", + dev_id); + return -EBUSY; + } + + /* Get device driver information to get max number of queues */ + VALID_FUNC_OR_RET_ERR(dev->dev_ops->info_get, dev_id); + memset(&dev_info, 0, sizeof(dev_info)); + dev->dev_ops->info_get(dev, &dev_info); + + if ((num_queues == 0) || (num_queues > dev_info.max_num_queues)) { + rte_bbdev_log(ERR, + "Device %u supports 0 < N <= %u queues, not %u", + dev_id, dev_info.max_num_queues, num_queues); + return -EINVAL; + } + + /* If re-configuration, get driver to free existing internal memory */ + if (dev->data->queues != NULL) { + VALID_FUNC_OR_RET_ERR(dev->dev_ops->queue_release, dev_id); + for (i = 0; i < dev->data->num_queues; i++) { + int ret = dev->dev_ops->queue_release(dev, i); + if (ret < 0) { + rte_bbdev_log(ERR, + "Device %u queue %u release failed", + dev_id, i); + return ret; + } + } + /* Call optional device close */ + if (dev->dev_ops->close) { + ret = dev->dev_ops->close(dev); + if (ret < 0) { + rte_bbdev_log(ERR, + "Device %u couldn't be closed", + dev_id); + return ret; + } + } + rte_free(dev->data->queues); + } + + /* Allocate queue pointers */ + dev->data->queues = rte_calloc_socket(DEV_NAME, num_queues, + sizeof(dev->data->queues[0]), RTE_CACHE_LINE_SIZE, + dev->data->socket_id); + if (dev->data->queues == NULL) { + rte_bbdev_log(ERR, + "calloc of %u queues for device %u on socket %i failed", + num_queues, dev_id, dev->data->socket_id); + return -ENOMEM; + } + + dev->data->num_queues = num_queues; + + /* Call optional device configuration */ + if (dev->dev_ops->setup_queues) { + ret = dev->dev_ops->setup_queues(dev, num_queues, socket_id); + if (ret < 0) { + rte_bbdev_log(ERR, + "Device %u memory configuration failed", + dev_id); + goto error; + } + } + + rte_bbdev_log_debug("Device %u set up with %u queues", dev_id, + num_queues); + return 0; + +error: + dev->data->num_queues = 0; + rte_free(dev->data->queues); + dev->data->queues = NULL; + return ret; +} + +int __rte_experimental +rte_bbdev_intr_enable(uint16_t dev_id) +{ + int ret; + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + if (dev->data->started) { + rte_bbdev_log(ERR, + "Device %u cannot be configured when 
started", + dev_id); + return -EBUSY; + } + + if (dev->dev_ops->intr_enable) { + ret = dev->dev_ops->intr_enable(dev); + if (ret < 0) { + rte_bbdev_log(ERR, + "Device %u interrupts configuration failed", + dev_id); + return ret; + } + rte_bbdev_log_debug("Enabled interrupts for dev %u", dev_id); + return 0; + } + + rte_bbdev_log(ERR, "Device %u doesn't support interrupts", dev_id); + return -ENOTSUP; +} + +int __rte_experimental +rte_bbdev_queue_configure(uint16_t dev_id, uint16_t queue_id, + const struct rte_bbdev_queue_conf *conf) +{ + int ret = 0; + struct rte_bbdev_driver_info dev_info; + struct rte_bbdev *dev = get_dev(dev_id); + const struct rte_bbdev_op_cap *p; + struct rte_bbdev_queue_conf *stored_conf; + const char *op_type_str; + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + VALID_QUEUE_OR_RET_ERR(queue_id, dev); + + if (dev->data->queues[queue_id].started || dev->data->started) { + rte_bbdev_log(ERR, + "Queue %u of device %u cannot be configured when started", + queue_id, dev_id); + return -EBUSY; + } + + VALID_FUNC_OR_RET_ERR(dev->dev_ops->queue_release, dev_id); + VALID_FUNC_OR_RET_ERR(dev->dev_ops->queue_setup, dev_id); + + /* Get device driver information to verify config is valid */ + VALID_FUNC_OR_RET_ERR(dev->dev_ops->info_get, dev_id); + memset(&dev_info, 0, sizeof(dev_info)); + dev->dev_ops->info_get(dev, &dev_info); + + /* Check configuration is valid */ + if (conf != NULL) { + if ((conf->op_type == RTE_BBDEV_OP_NONE) && + (dev_info.capabilities[0].type == + RTE_BBDEV_OP_NONE)) { + ret = 1; + } else { + for (p = dev_info.capabilities; + p->type != RTE_BBDEV_OP_NONE; p++) { + if (conf->op_type == p->type) { + ret = 1; + break; + } + } + } + if (ret == 0) { + rte_bbdev_log(ERR, "Invalid operation type"); + return -EINVAL; + } + if (conf->queue_size > dev_info.queue_size_lim) { + rte_bbdev_log(ERR, + "Size (%u) of queue %u of device %u must be: <= %u", + conf->queue_size, queue_id, dev_id, + dev_info.queue_size_lim); + return -EINVAL; + } + if (!rte_is_power_of_2(conf->queue_size)) { + rte_bbdev_log(ERR, + "Size (%u) of queue %u of device %u must be a power of 2", + conf->queue_size, queue_id, dev_id); + return -EINVAL; + } + if (conf->op_type == RTE_BBDEV_OP_TURBO_DEC && + conf->priority > dev_info.max_ul_queue_priority) { + rte_bbdev_log(ERR, + "Priority (%u) of queue %u of bdev %u must be <= %u", + conf->priority, queue_id, dev_id, + dev_info.max_ul_queue_priority); + return -EINVAL; + } + if (conf->op_type == RTE_BBDEV_OP_TURBO_ENC && + conf->priority > dev_info.max_dl_queue_priority) { + rte_bbdev_log(ERR, + "Priority (%u) of queue %u of bdev %u must be <= %u", + conf->priority, queue_id, dev_id, + dev_info.max_dl_queue_priority); + return -EINVAL; + } + } + + /* Release existing queue (in case of queue reconfiguration) */ + if (dev->data->queues[queue_id].queue_private != NULL) { + ret = dev->dev_ops->queue_release(dev, queue_id); + if (ret < 0) { + rte_bbdev_log(ERR, "Device %u queue %u release failed", + dev_id, queue_id); + return ret; + } + } + + /* Get driver to setup the queue */ + ret = dev->dev_ops->queue_setup(dev, queue_id, (conf != NULL) ? + conf : &dev_info.default_queue_conf); + if (ret < 0) { + rte_bbdev_log(ERR, + "Device %u queue %u setup failed", dev_id, + queue_id); + return ret; + } + + /* Store configuration */ + stored_conf = &dev->data->queues[queue_id].conf; + memcpy(stored_conf, + (conf != NULL) ? 
conf : &dev_info.default_queue_conf, + sizeof(*stored_conf)); + + op_type_str = rte_bbdev_op_type_str(stored_conf->op_type); + if (op_type_str == NULL) + return -EINVAL; + + rte_bbdev_log_debug("Configured dev%uq%u (size=%u, type=%s, prio=%u)", + dev_id, queue_id, stored_conf->queue_size, op_type_str, + stored_conf->priority); + + return 0; +} + +int __rte_experimental +rte_bbdev_start(uint16_t dev_id) +{ + int i; + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + if (dev->data->started) { + rte_bbdev_log_debug("Device %u is already started", dev_id); + return 0; + } + + if (dev->dev_ops->start) { + int ret = dev->dev_ops->start(dev); + if (ret < 0) { + rte_bbdev_log(ERR, "Device %u start failed", dev_id); + return ret; + } + } + + /* Store new state */ + for (i = 0; i < dev->data->num_queues; i++) + if (!dev->data->queues[i].conf.deferred_start) + dev->data->queues[i].started = true; + dev->data->started = true; + + rte_bbdev_log_debug("Started device %u", dev_id); + return 0; +} + +int __rte_experimental +rte_bbdev_stop(uint16_t dev_id) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + if (!dev->data->started) { + rte_bbdev_log_debug("Device %u is already stopped", dev_id); + return 0; + } + + if (dev->dev_ops->stop) + dev->dev_ops->stop(dev); + dev->data->started = false; + + rte_bbdev_log_debug("Stopped device %u", dev_id); + return 0; +} + +int __rte_experimental +rte_bbdev_close(uint16_t dev_id) +{ + int ret; + uint16_t i; + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + if (dev->data->started) { + ret = rte_bbdev_stop(dev_id); + if (ret < 0) { + rte_bbdev_log(ERR, "Device %u stop failed", dev_id); + return ret; + } + } + + /* Free memory used by queues */ + for (i = 0; i < dev->data->num_queues; i++) { + ret = dev->dev_ops->queue_release(dev, i); + if (ret < 0) { + rte_bbdev_log(ERR, "Device %u queue %u release failed", + dev_id, i); + return ret; + } + } + rte_free(dev->data->queues); + + if (dev->dev_ops->close) { + ret = dev->dev_ops->close(dev); + if (ret < 0) { + rte_bbdev_log(ERR, "Device %u close failed", dev_id); + return ret; + } + } + + /* Clear configuration */ + dev->data->queues = NULL; + dev->data->num_queues = 0; + + rte_bbdev_log_debug("Closed device %u", dev_id); + return 0; +} + +int __rte_experimental +rte_bbdev_queue_start(uint16_t dev_id, uint16_t queue_id) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + VALID_QUEUE_OR_RET_ERR(queue_id, dev); + + if (dev->data->queues[queue_id].started) { + rte_bbdev_log_debug("Queue %u of device %u already started", + queue_id, dev_id); + return 0; + } + + if (dev->dev_ops->queue_start) { + int ret = dev->dev_ops->queue_start(dev, queue_id); + if (ret < 0) { + rte_bbdev_log(ERR, "Device %u queue %u start failed", + dev_id, queue_id); + return ret; + } + } + dev->data->queues[queue_id].started = true; + + rte_bbdev_log_debug("Started queue %u of device %u", queue_id, dev_id); + return 0; +} + +int __rte_experimental +rte_bbdev_queue_stop(uint16_t dev_id, uint16_t queue_id) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + VALID_QUEUE_OR_RET_ERR(queue_id, dev); + + if (!dev->data->queues[queue_id].started) { + rte_bbdev_log_debug("Queue %u 
of device %u already stopped", + queue_id, dev_id); + return 0; + } + + if (dev->dev_ops->queue_stop) { + int ret = dev->dev_ops->queue_stop(dev, queue_id); + if (ret < 0) { + rte_bbdev_log(ERR, "Device %u queue %u stop failed", + dev_id, queue_id); + return ret; + } + } + dev->data->queues[queue_id].started = false; + + rte_bbdev_log_debug("Stopped queue %u of device %u", queue_id, dev_id); + return 0; +} + +/* Get device statistics */ +static void +get_stats_from_queues(struct rte_bbdev *dev, struct rte_bbdev_stats *stats) +{ + unsigned int q_id; + for (q_id = 0; q_id < dev->data->num_queues; q_id++) { + struct rte_bbdev_stats *q_stats = + &dev->data->queues[q_id].queue_stats; + + stats->enqueued_count += q_stats->enqueued_count; + stats->dequeued_count += q_stats->dequeued_count; + stats->enqueue_err_count += q_stats->enqueue_err_count; + stats->dequeue_err_count += q_stats->dequeue_err_count; + } + rte_bbdev_log_debug("Got stats on %u", dev->data->dev_id); +} + +static void +reset_stats_in_queues(struct rte_bbdev *dev) +{ + unsigned int q_id; + for (q_id = 0; q_id < dev->data->num_queues; q_id++) { + struct rte_bbdev_stats *q_stats = + &dev->data->queues[q_id].queue_stats; + + memset(q_stats, 0, sizeof(*q_stats)); + } + rte_bbdev_log_debug("Reset stats on %u", dev->data->dev_id); +} + +int __rte_experimental +rte_bbdev_stats_get(uint16_t dev_id, struct rte_bbdev_stats *stats) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + if (stats == NULL) { + rte_bbdev_log(ERR, "NULL stats structure"); + return -EINVAL; + } + + memset(stats, 0, sizeof(*stats)); + if (dev->dev_ops->stats_get != NULL) + dev->dev_ops->stats_get(dev, stats); + else + get_stats_from_queues(dev, stats); + + rte_bbdev_log_debug("Retrieved stats of device %u", dev_id); + return 0; +} + +int __rte_experimental +rte_bbdev_stats_reset(uint16_t dev_id) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + + if (dev->dev_ops->stats_reset != NULL) + dev->dev_ops->stats_reset(dev); + else + reset_stats_in_queues(dev); + + rte_bbdev_log_debug("Reset stats of device %u", dev_id); + return 0; +} + +int __rte_experimental +rte_bbdev_info_get(uint16_t dev_id, struct rte_bbdev_info *dev_info) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_FUNC_OR_RET_ERR(dev->dev_ops->info_get, dev_id); + + if (dev_info == NULL) { + rte_bbdev_log(ERR, "NULL dev info structure"); + return -EINVAL; + } + + /* Copy data maintained by device interface layer */ + memset(dev_info, 0, sizeof(*dev_info)); + dev_info->dev_name = dev->data->name; + dev_info->num_queues = dev->data->num_queues; + dev_info->bus = rte_bus_find_by_device(dev->device); + dev_info->socket_id = dev->data->socket_id; + dev_info->started = dev->data->started; + + /* Copy data maintained by device driver layer */ + dev->dev_ops->info_get(dev, &dev_info->drv); + + rte_bbdev_log_debug("Retrieved info of device %u", dev_id); + return 0; +} + +int __rte_experimental +rte_bbdev_queue_info_get(uint16_t dev_id, uint16_t queue_id, + struct rte_bbdev_queue_info *queue_info) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + VALID_QUEUE_OR_RET_ERR(queue_id, dev); + + if (queue_info == NULL) { + rte_bbdev_log(ERR, "NULL queue info structure"); + return -EINVAL; + } + + /* Copy data to output */ + memset(queue_info, 0, sizeof(*queue_info)); + queue_info->conf = 
dev->data->queues[queue_id].conf; + queue_info->started = dev->data->queues[queue_id].started; + + rte_bbdev_log_debug("Retrieved info of queue %u of device %u", + queue_id, dev_id); + return 0; +} + +/* Calculate size needed to store bbdev_op, depending on type */ +static unsigned int +get_bbdev_op_size(enum rte_bbdev_op_type type) +{ + unsigned int result = 0; + switch (type) { + case RTE_BBDEV_OP_NONE: + result = RTE_MAX(sizeof(struct rte_bbdev_dec_op), + sizeof(struct rte_bbdev_enc_op)); + break; + case RTE_BBDEV_OP_TURBO_DEC: + result = sizeof(struct rte_bbdev_dec_op); + break; + case RTE_BBDEV_OP_TURBO_ENC: + result = sizeof(struct rte_bbdev_enc_op); + break; + default: + break; + } + + return result; +} + +/* Initialise a bbdev_op structure */ +static void +bbdev_op_init(struct rte_mempool *mempool, void *arg, void *element, + __rte_unused unsigned int n) +{ + enum rte_bbdev_op_type type = *(enum rte_bbdev_op_type *)arg; + + if (type == RTE_BBDEV_OP_TURBO_DEC) { + struct rte_bbdev_dec_op *op = element; + memset(op, 0, mempool->elt_size); + op->mempool = mempool; + } else if (type == RTE_BBDEV_OP_TURBO_ENC) { + struct rte_bbdev_enc_op *op = element; + memset(op, 0, mempool->elt_size); + op->mempool = mempool; + } +} + +struct rte_mempool * __rte_experimental +rte_bbdev_op_pool_create(const char *name, enum rte_bbdev_op_type type, + unsigned int num_elements, unsigned int cache_size, + int socket_id) +{ + struct rte_bbdev_op_pool_private *priv; + struct rte_mempool *mp; + const char *op_type_str; + + if (name == NULL) { + rte_bbdev_log(ERR, "NULL name for op pool"); + return NULL; + } + + if (type >= RTE_BBDEV_OP_TYPE_COUNT) { + rte_bbdev_log(ERR, + "Invalid op type (%u), should be less than %u", + type, RTE_BBDEV_OP_TYPE_COUNT); + return NULL; + } + + mp = rte_mempool_create(name, num_elements, get_bbdev_op_size(type), + cache_size, sizeof(struct rte_bbdev_op_pool_private), + NULL, NULL, bbdev_op_init, &type, socket_id, 0); + if (mp == NULL) { + rte_bbdev_log(ERR, + "Failed to create op pool %s (num ops=%u, op size=%u) with error: %s", + name, num_elements, get_bbdev_op_size(type), + rte_strerror(rte_errno)); + return NULL; + } + + op_type_str = rte_bbdev_op_type_str(type); + if (op_type_str == NULL) + return NULL; + + rte_bbdev_log_debug( + "Op pool %s created for %u ops (type=%s, cache=%u, socket=%u, size=%u)", + name, num_elements, op_type_str, cache_size, socket_id, + get_bbdev_op_size(type)); + + priv = (struct rte_bbdev_op_pool_private *)rte_mempool_get_priv(mp); + priv->type = type; + + return mp; +} + +int __rte_experimental +rte_bbdev_callback_register(uint16_t dev_id, enum rte_bbdev_event_type event, + rte_bbdev_cb_fn cb_fn, void *cb_arg) +{ + struct rte_bbdev_callback *user_cb; + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + if (event >= RTE_BBDEV_EVENT_MAX) { + rte_bbdev_log(ERR, + "Invalid event type (%u), should be less than %u", + event, RTE_BBDEV_EVENT_MAX); + return -EINVAL; + } + + if (cb_fn == NULL) { + rte_bbdev_log(ERR, "NULL callback function"); + return -EINVAL; + } + + rte_spinlock_lock(&rte_bbdev_cb_lock); + + TAILQ_FOREACH(user_cb, &(dev->list_cbs), next) { + if (user_cb->cb_fn == cb_fn && + user_cb->cb_arg == cb_arg && + user_cb->event == event) + break; + } + + /* create a new callback. 
*/ + if (user_cb == NULL) { + user_cb = rte_zmalloc("INTR_USER_CALLBACK", + sizeof(struct rte_bbdev_callback), 0); + if (user_cb != NULL) { + user_cb->cb_fn = cb_fn; + user_cb->cb_arg = cb_arg; + user_cb->event = event; + TAILQ_INSERT_TAIL(&(dev->list_cbs), user_cb, next); + } + } + + rte_spinlock_unlock(&rte_bbdev_cb_lock); + return (user_cb == NULL) ? -ENOMEM : 0; +} + +int __rte_experimental +rte_bbdev_callback_unregister(uint16_t dev_id, enum rte_bbdev_event_type event, + rte_bbdev_cb_fn cb_fn, void *cb_arg) +{ + int ret = 0; + struct rte_bbdev_callback *cb, *next; + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + + if (event >= RTE_BBDEV_EVENT_MAX) { + rte_bbdev_log(ERR, + "Invalid event type (%u), should be less than %u", + event, RTE_BBDEV_EVENT_MAX); + return -EINVAL; + } + + if (cb_fn == NULL) { + rte_bbdev_log(ERR, + "NULL callback function cannot be unregistered"); + return -EINVAL; + } + + dev = &rte_bbdev_devices[dev_id]; + rte_spinlock_lock(&rte_bbdev_cb_lock); + + for (cb = TAILQ_FIRST(&dev->list_cbs); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn != cb_fn || cb->event != event || + (cb_arg != (void *)-1 && cb->cb_arg != cb_arg)) + continue; + + /* If this callback is not executing right now, remove it. */ + if (cb->active == 0) { + TAILQ_REMOVE(&(dev->list_cbs), cb, next); + rte_free(cb); + } else + ret = -EAGAIN; + } + + rte_spinlock_unlock(&rte_bbdev_cb_lock); + return ret; +} + +void __rte_experimental +rte_bbdev_pmd_callback_process(struct rte_bbdev *dev, + enum rte_bbdev_event_type event, void *ret_param) +{ + struct rte_bbdev_callback *cb_lst; + struct rte_bbdev_callback dev_cb; + + if (dev == NULL) { + rte_bbdev_log(ERR, "NULL device"); + return; + } + + if (dev->data == NULL) { + rte_bbdev_log(ERR, "NULL data structure"); + return; + } + + if (event >= RTE_BBDEV_EVENT_MAX) { + rte_bbdev_log(ERR, + "Invalid event type (%u), should be less than %u", + event, RTE_BBDEV_EVENT_MAX); + return; + } + + rte_spinlock_lock(&rte_bbdev_cb_lock); + TAILQ_FOREACH(cb_lst, &(dev->list_cbs), next) { + if (cb_lst->cb_fn == NULL || cb_lst->event != event) + continue; + dev_cb = *cb_lst; + cb_lst->active = 1; + if (ret_param != NULL) + dev_cb.ret_param = ret_param; + + rte_spinlock_unlock(&rte_bbdev_cb_lock); + dev_cb.cb_fn(dev->data->dev_id, dev_cb.event, + dev_cb.cb_arg, dev_cb.ret_param); + rte_spinlock_lock(&rte_bbdev_cb_lock); + cb_lst->active = 0; + } + rte_spinlock_unlock(&rte_bbdev_cb_lock); +} + +int __rte_experimental +rte_bbdev_queue_intr_enable(uint16_t dev_id, uint16_t queue_id) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + VALID_QUEUE_OR_RET_ERR(queue_id, dev); + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + VALID_FUNC_OR_RET_ERR(dev->dev_ops->queue_intr_enable, dev_id); + return dev->dev_ops->queue_intr_enable(dev, queue_id); +} + +int __rte_experimental +rte_bbdev_queue_intr_disable(uint16_t dev_id, uint16_t queue_id) +{ + struct rte_bbdev *dev = get_dev(dev_id); + VALID_DEV_OR_RET_ERR(dev, dev_id); + VALID_QUEUE_OR_RET_ERR(queue_id, dev); + VALID_DEV_OPS_OR_RET_ERR(dev, dev_id); + VALID_FUNC_OR_RET_ERR(dev->dev_ops->queue_intr_disable, dev_id); + return dev->dev_ops->queue_intr_disable(dev, queue_id); +} + +int __rte_experimental +rte_bbdev_queue_intr_ctl(uint16_t dev_id, uint16_t queue_id, int epfd, int op, + void *data) +{ + uint32_t vec; + struct rte_bbdev *dev = get_dev(dev_id); + struct rte_intr_handle *intr_handle; + int ret; + + VALID_DEV_OR_RET_ERR(dev, dev_id); + 
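+ /*
+  * The queue index must also fit within the EAL interrupt vector table
+  * (RTE_MAX_RXTX_INTR_VEC_ID) before it can be mapped to an interrupt
+  * vector via intr_handle->intr_vec[] below.
+  */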
VALID_QUEUE_OR_RET_ERR(queue_id, dev); + + intr_handle = dev->intr_handle; + if (!intr_handle || !intr_handle->intr_vec) { + rte_bbdev_log(ERR, "Device %u intr handle unset\n", dev_id); + return -ENOTSUP; + } + + if (queue_id >= RTE_MAX_RXTX_INTR_VEC_ID) { + rte_bbdev_log(ERR, "Device %u queue_id %u is too big\n", + dev_id, queue_id); + return -ENOTSUP; + } + + vec = intr_handle->intr_vec[queue_id]; + ret = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data); + if (ret && (ret != -EEXIST)) { + rte_bbdev_log(ERR, + "dev %u q %u int ctl error op %d epfd %d vec %u\n", + dev_id, queue_id, op, epfd, vec); + return ret; + } + + return 0; +} + + +const char * __rte_experimental +rte_bbdev_op_type_str(enum rte_bbdev_op_type op_type) +{ + static const char * const op_types[] = { + "RTE_BBDEV_OP_NONE", + "RTE_BBDEV_OP_TURBO_DEC", + "RTE_BBDEV_OP_TURBO_ENC", + }; + + if (op_type < RTE_BBDEV_OP_TYPE_COUNT) + return op_types[op_type]; + + rte_bbdev_log(ERR, "Invalid operation type"); + return NULL; +} + +RTE_INIT(rte_bbdev_init_log) +{ + bbdev_logtype = rte_log_register("lib.bbdev"); + if (bbdev_logtype >= 0) + rte_log_set_level(bbdev_logtype, RTE_LOG_NOTICE); +} diff --git a/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.h b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.h new file mode 100644 index 00000000..25ef409f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev.h @@ -0,0 +1,700 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_BBDEV_H_ +#define _RTE_BBDEV_H_ + +/** + * @file rte_bbdev.h + * + * Wireless base band device abstraction APIs. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * This API allows an application to discover, configure and use a device to + * process operations. An asynchronous API (enqueue, followed by later dequeue) + * is used for processing operations. + * + * The functions in this API are not thread-safe when called on the same + * target object (a device, or a queue on a device), with the exception that + * one thread can enqueue operations to a queue while another thread dequeues + * from the same queue. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_bbdev_op.h" + +#ifndef RTE_BBDEV_MAX_DEVS +#define RTE_BBDEV_MAX_DEVS 128 /**< Max number of devices */ +#endif + +/** Flags indiciate current state of BBDEV device */ +enum rte_bbdev_state { + RTE_BBDEV_UNUSED, + RTE_BBDEV_INITIALIZED +}; + +/** + * Get the total number of devices that have been successfully initialised. + * + * @return + * The total number of usable devices. + */ +uint16_t __rte_experimental +rte_bbdev_count(void); + +/** + * Check if a device is valid. + * + * @param dev_id + * The identifier of the device. + * + * @return + * true if device ID is valid and device is attached, false otherwise. + */ +bool __rte_experimental +rte_bbdev_is_valid(uint16_t dev_id); + +/** + * Get the next enabled device. + * + * @param dev_id + * The current device + * + * @return + * - The next device, or + * - RTE_BBDEV_MAX_DEVS if none found + */ +uint16_t __rte_experimental +rte_bbdev_find_next(uint16_t dev_id); + +/** Iterate through all enabled devices */ +#define RTE_BBDEV_FOREACH(i) for (i = rte_bbdev_find_next(-1); \ + i < RTE_BBDEV_MAX_DEVS; \ + i = rte_bbdev_find_next(i)) + +/** + * Setup up device queues. + * This function must be called on a device before setting up the queues and + * starting the device. 
It can also be called when a device is in the stopped + * state. If any device queues have been configured their configuration will be + * cleared by a call to this function. + * + * @param dev_id + * The identifier of the device. + * @param num_queues + * Number of queues to configure on device. + * @param socket_id + * ID of a socket which will be used to allocate memory. + * + * @return + * - 0 on success + * - -ENODEV if dev_id is invalid or the device is corrupted + * - -EINVAL if num_queues is invalid, 0 or greater than maximum + * - -EBUSY if the identified device has already started + * - -ENOMEM if unable to allocate memory + */ +int __rte_experimental +rte_bbdev_setup_queues(uint16_t dev_id, uint16_t num_queues, int socket_id); + +/** + * Enable interrupts. + * This function may be called before starting the device to enable the + * interrupts if they are available. + * + * @param dev_id + * The identifier of the device. + * + * @return + * - 0 on success + * - -ENODEV if dev_id is invalid or the device is corrupted + * - -EBUSY if the identified device has already started + * - -ENOTSUP if the interrupts are not supported by the device + */ +int __rte_experimental +rte_bbdev_intr_enable(uint16_t dev_id); + +/** Device queue configuration structure */ +struct rte_bbdev_queue_conf { + int socket; /**< NUMA socket used for memory allocation */ + uint32_t queue_size; /**< Size of queue */ + uint8_t priority; /**< Queue priority */ + bool deferred_start; /**< Do not start queue when device is started. */ + enum rte_bbdev_op_type op_type; /**< Operation type */ +}; + +/** + * Configure a queue on a device. + * This function can be called after device configuration, and before starting. + * It can also be called when the device or the queue is in the stopped state. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the queue. + * @param conf + * The queue configuration. If NULL, a default configuration will be used. + * + * @return + * - 0 on success + * - EINVAL if the identified queue size or priority are invalid + * - EBUSY if the identified queue or its device have already started + */ +int __rte_experimental +rte_bbdev_queue_configure(uint16_t dev_id, uint16_t queue_id, + const struct rte_bbdev_queue_conf *conf); + +/** + * Start a device. + * This is the last step needed before enqueueing operations is possible. + * + * @param dev_id + * The identifier of the device. + * + * @return + * - 0 on success + * - negative value on failure - as returned from PMD driver + */ +int __rte_experimental +rte_bbdev_start(uint16_t dev_id); + +/** + * Stop a device. + * The device can be reconfigured, and restarted after being stopped. + * + * @param dev_id + * The identifier of the device. + * + * @return + * - 0 on success + */ +int __rte_experimental +rte_bbdev_stop(uint16_t dev_id); + +/** + * Close a device. + * The device cannot be restarted without reconfiguration! + * + * @param dev_id + * The identifier of the device. + * + * @return + * - 0 on success + */ +int __rte_experimental +rte_bbdev_close(uint16_t dev_id); + +/** + * Start a specified queue on a device. + * This is only needed if the queue has been stopped, or if the deferred_start + * flag has been set when configuring the queue. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the queue. 
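+ *
+ * For illustration only: a queue configured with the deferred_start flag set
+ * is not started by rte_bbdev_start() and may be started explicitly
+ * afterwards. The dev_id, queue_id, socket_id and queue_size values below
+ * are placeholders chosen for this sketch, not requirements of the API
+ * (queue_size must simply be a power of 2 within the driver's limit):
+ *
+ * @code
+ * struct rte_bbdev_queue_conf conf = {
+ *         .socket = socket_id,
+ *         .queue_size = 32,
+ *         .priority = 0,
+ *         .deferred_start = true,
+ *         .op_type = RTE_BBDEV_OP_TURBO_DEC,
+ * };
+ *
+ * rte_bbdev_queue_configure(dev_id, queue_id, &conf);
+ * rte_bbdev_start(dev_id);
+ * rte_bbdev_queue_start(dev_id, queue_id);
+ * @endcode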
+ * + * @return + * - 0 on success + * - negative value on failure - as returned from PMD driver + */ +int __rte_experimental +rte_bbdev_queue_start(uint16_t dev_id, uint16_t queue_id); + +/** + * Stop a specified queue on a device, to allow re configuration. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the queue. + * + * @return + * - 0 on success + * - negative value on failure - as returned from PMD driver + */ +int __rte_experimental +rte_bbdev_queue_stop(uint16_t dev_id, uint16_t queue_id); + +/** Device statistics. */ +struct rte_bbdev_stats { + uint64_t enqueued_count; /**< Count of all operations enqueued */ + uint64_t dequeued_count; /**< Count of all operations dequeued */ + /** Total error count on operations enqueued */ + uint64_t enqueue_err_count; + /** Total error count on operations dequeued */ + uint64_t dequeue_err_count; + /** Offload time */ + uint64_t offload_time; +}; + +/** + * Retrieve the general I/O statistics of a device. + * + * @param dev_id + * The identifier of the device. + * @param stats + * Pointer to structure to where statistics will be copied. On error, this + * location may or may not have been modified. + * + * @return + * - 0 on success + * - EINVAL if invalid parameter pointer is provided + */ +int __rte_experimental +rte_bbdev_stats_get(uint16_t dev_id, struct rte_bbdev_stats *stats); + +/** + * Reset the statistics of a device. + * + * @param dev_id + * The identifier of the device. + * @return + * - 0 on success + */ +int __rte_experimental +rte_bbdev_stats_reset(uint16_t dev_id); + +/** Device information supplied by the device's driver */ +struct rte_bbdev_driver_info { + /** Driver name */ + const char *driver_name; + + /** Maximum number of queues supported by the device */ + unsigned int max_num_queues; + /** Queue size limit (queue size must also be power of 2) */ + uint32_t queue_size_lim; + /** Set if device off-loads operation to hardware */ + bool hardware_accelerated; + /** Max value supported by queue priority for DL */ + uint8_t max_dl_queue_priority; + /** Max value supported by queue priority for UL */ + uint8_t max_ul_queue_priority; + /** Set if device supports per-queue interrupts */ + bool queue_intr_supported; + /** Minimum alignment of buffers, in bytes */ + uint16_t min_alignment; + /** Default queue configuration used if none is supplied */ + struct rte_bbdev_queue_conf default_queue_conf; + /** Device operation capabilities */ + const struct rte_bbdev_op_cap *capabilities; + /** Device cpu_flag requirements */ + const enum rte_cpu_flag_t *cpu_flag_reqs; +}; + +/** Macro used at end of bbdev PMD list */ +#define RTE_BBDEV_END_OF_CAPABILITIES_LIST() \ + { RTE_BBDEV_OP_NONE } + +/** + * Device information structure used by an application to discover a devices + * capabilities and current configuration + */ +struct rte_bbdev_info { + int socket_id; /**< NUMA socket that device is on */ + const char *dev_name; /**< Unique device name */ + const struct rte_bus *bus; /**< Bus information */ + uint16_t num_queues; /**< Number of queues currently configured */ + bool started; /**< Set if device is currently started */ + struct rte_bbdev_driver_info drv; /**< Info from device driver */ +}; + +/** + * Retrieve information about a device. + * + * @param dev_id + * The identifier of the device. + * @param dev_info + * Pointer to structure to where information will be copied. On error, this + * location may or may not have been modified. 
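+ *
+ * A minimal query might look like the sketch below (illustrative only;
+ * dev_id is an application-provided identifier and error handling is
+ * omitted):
+ *
+ * @code
+ * struct rte_bbdev_info info;
+ *
+ * if (rte_bbdev_info_get(dev_id, &info) == 0)
+ *         printf("%s: %u of max %u queues configured\n", info.dev_name,
+ *                         info.num_queues, info.drv.max_num_queues);
+ * @endcode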
+ * + * @return + * - 0 on success + * - EINVAL if invalid parameter pointer is provided + */ +int __rte_experimental +rte_bbdev_info_get(uint16_t dev_id, struct rte_bbdev_info *dev_info); + +/** Queue information */ +struct rte_bbdev_queue_info { + /** Current device configuration */ + struct rte_bbdev_queue_conf conf; + /** Set if queue is currently started */ + bool started; +}; + +/** + * Retrieve information about a specific queue on a device. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the queue. + * @param queue_info + * Pointer to structure to where information will be copied. On error, this + * location may or may not have been modified. + * + * @return + * - 0 on success + * - EINVAL if invalid parameter pointer is provided + */ +int __rte_experimental +rte_bbdev_queue_info_get(uint16_t dev_id, uint16_t queue_id, + struct rte_bbdev_queue_info *queue_info); + +/** @internal The data structure associated with each queue of a device. */ +struct rte_bbdev_queue_data { + void *queue_private; /**< Driver-specific per-queue data */ + struct rte_bbdev_queue_conf conf; /**< Current configuration */ + struct rte_bbdev_stats queue_stats; /**< Queue statistics */ + bool started; /**< Queue state */ +}; + +/** @internal Enqueue encode operations for processing on queue of a device. */ +typedef uint16_t (*rte_bbdev_enqueue_enc_ops_t)( + struct rte_bbdev_queue_data *q_data, + struct rte_bbdev_enc_op **ops, + uint16_t num); + +/** @internal Enqueue decode operations for processing on queue of a device. */ +typedef uint16_t (*rte_bbdev_enqueue_dec_ops_t)( + struct rte_bbdev_queue_data *q_data, + struct rte_bbdev_dec_op **ops, + uint16_t num); + +/** @internal Dequeue encode operations from a queue of a device. */ +typedef uint16_t (*rte_bbdev_dequeue_enc_ops_t)( + struct rte_bbdev_queue_data *q_data, + struct rte_bbdev_enc_op **ops, uint16_t num); + +/** @internal Dequeue decode operations from a queue of a device. */ +typedef uint16_t (*rte_bbdev_dequeue_dec_ops_t)( + struct rte_bbdev_queue_data *q_data, + struct rte_bbdev_dec_op **ops, uint16_t num); + +#define RTE_BBDEV_NAME_MAX_LEN 64 /**< Max length of device name */ + +/** + * @internal The data associated with a device, with no function pointers. + * This structure is safe to place in shared memory to be common among + * different processes in a multi-process configuration. Drivers can access + * these fields, but should never write to them! + */ +struct rte_bbdev_data { + char name[RTE_BBDEV_NAME_MAX_LEN]; /**< Unique identifier name */ + void *dev_private; /**< Driver-specific private data */ + uint16_t num_queues; /**< Number of currently configured queues */ + struct rte_bbdev_queue_data *queues; /**< Queue structures */ + uint16_t dev_id; /**< Device ID */ + int socket_id; /**< NUMA socket that device is on */ + bool started; /**< Device run-time state */ + /** Counter of processes using the device */ + rte_atomic16_t process_cnt; +}; + +/* Forward declarations */ +struct rte_bbdev_ops; +struct rte_bbdev_callback; +struct rte_intr_handle; + +/** Structure to keep track of registered callbacks */ +TAILQ_HEAD(rte_bbdev_cb_list, rte_bbdev_callback); + +/** + * @internal The data structure associated with a device. Drivers can access + * these fields, but should only write to the *_ops fields. 
+ */ +struct __rte_cache_aligned rte_bbdev { + /**< Enqueue encode function */ + rte_bbdev_enqueue_enc_ops_t enqueue_enc_ops; + /**< Enqueue decode function */ + rte_bbdev_enqueue_dec_ops_t enqueue_dec_ops; + /**< Dequeue encode function */ + rte_bbdev_dequeue_enc_ops_t dequeue_enc_ops; + /**< Dequeue decode function */ + rte_bbdev_dequeue_dec_ops_t dequeue_dec_ops; + const struct rte_bbdev_ops *dev_ops; /**< Functions exported by PMD */ + struct rte_bbdev_data *data; /**< Pointer to device data */ + enum rte_bbdev_state state; /**< If device is currently used or not */ + struct rte_device *device; /**< Backing device */ + /** User application callback for interrupts if present */ + struct rte_bbdev_cb_list list_cbs; + struct rte_intr_handle *intr_handle; /**< Device interrupt handle */ +}; + +/** @internal array of all devices */ +extern struct rte_bbdev rte_bbdev_devices[]; + +/** + * Enqueue a burst of processed encode operations to a queue of the device. + * This functions only enqueues as many operations as currently possible and + * does not block until @p num_ops entries in the queue are available. + * This function does not provide any error notification to avoid the + * corresponding overhead. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the queue. + * @param ops + * Pointer array containing operations to be enqueued Must have at least + * @p num_ops entries + * @param num_ops + * The maximum number of operations to enqueue. + * + * @return + * The number of operations actually enqueued (this is the number of processed + * entries in the @p ops array). + */ +static inline uint16_t +rte_bbdev_enqueue_enc_ops(uint16_t dev_id, uint16_t queue_id, + struct rte_bbdev_enc_op **ops, uint16_t num_ops) +{ + struct rte_bbdev *dev = &rte_bbdev_devices[dev_id]; + struct rte_bbdev_queue_data *q_data = &dev->data->queues[queue_id]; + return dev->enqueue_enc_ops(q_data, ops, num_ops); +} + +/** + * Enqueue a burst of processed decode operations to a queue of the device. + * This functions only enqueues as many operations as currently possible and + * does not block until @p num_ops entries in the queue are available. + * This function does not provide any error notification to avoid the + * corresponding overhead. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the queue. + * @param ops + * Pointer array containing operations to be enqueued Must have at least + * @p num_ops entries + * @param num_ops + * The maximum number of operations to enqueue. + * + * @return + * The number of operations actually enqueued (this is the number of processed + * entries in the @p ops array). + */ +static inline uint16_t +rte_bbdev_enqueue_dec_ops(uint16_t dev_id, uint16_t queue_id, + struct rte_bbdev_dec_op **ops, uint16_t num_ops) +{ + struct rte_bbdev *dev = &rte_bbdev_devices[dev_id]; + struct rte_bbdev_queue_data *q_data = &dev->data->queues[queue_id]; + return dev->enqueue_dec_ops(q_data, ops, num_ops); +} + +/** + * Dequeue a burst of processed encode operations from a queue of the device. + * This functions returns only the current contents of the queue, and does not + * block until @ num_ops is available. + * This function does not provide any error notification to avoid the + * corresponding overhead. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the queue. + * @param ops + * Pointer array where operations will be dequeued to. 
Must have at least + * @p num_ops entries + * @param num_ops + * The maximum number of operations to dequeue. + * + * @return + * The number of operations actually dequeued (this is the number of entries + * copied into the @p ops array). + */ +static inline uint16_t +rte_bbdev_dequeue_enc_ops(uint16_t dev_id, uint16_t queue_id, + struct rte_bbdev_enc_op **ops, uint16_t num_ops) +{ + struct rte_bbdev *dev = &rte_bbdev_devices[dev_id]; + struct rte_bbdev_queue_data *q_data = &dev->data->queues[queue_id]; + return dev->dequeue_enc_ops(q_data, ops, num_ops); +} + +/** + * Dequeue a burst of processed decode operations from a queue of the device. + * This functions returns only the current contents of the queue, and does not + * block until @ num_ops is available. + * This function does not provide any error notification to avoid the + * corresponding overhead. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the queue. + * @param ops + * Pointer array where operations will be dequeued to. Must have at least + * @p num_ops entries + * @param num_ops + * The maximum number of operations to dequeue. + * + * @return + * The number of operations actually dequeued (this is the number of entries + * copied into the @p ops array). + */ + +static inline uint16_t +rte_bbdev_dequeue_dec_ops(uint16_t dev_id, uint16_t queue_id, + struct rte_bbdev_dec_op **ops, uint16_t num_ops) +{ + struct rte_bbdev *dev = &rte_bbdev_devices[dev_id]; + struct rte_bbdev_queue_data *q_data = &dev->data->queues[queue_id]; + return dev->dequeue_dec_ops(q_data, ops, num_ops); +} + +/** Definitions of device event types */ +enum rte_bbdev_event_type { + RTE_BBDEV_EVENT_UNKNOWN, /**< unknown event type */ + RTE_BBDEV_EVENT_ERROR, /**< error interrupt event */ + RTE_BBDEV_EVENT_DEQUEUE, /**< dequeue event */ + RTE_BBDEV_EVENT_MAX /**< max value of this enum */ +}; + +/** + * Typedef for application callback function registered by application + * software for notification of device events + * + * @param dev_id + * Device identifier + * @param event + * Device event to register for notification of. + * @param cb_arg + * User specified parameter to be passed to user's callback function. + * @param ret_param + * To pass data back to user application. + */ +typedef void (*rte_bbdev_cb_fn)(uint16_t dev_id, + enum rte_bbdev_event_type event, void *cb_arg, + void *ret_param); + +/** + * Register a callback function for specific device id. Multiple callbacks can + * be added and will be called in the order they are added when an event is + * triggered. Callbacks are called in a separate thread created by the DPDK EAL. + * + * @param dev_id + * Device id. + * @param event + * The event that the callback will be registered for. + * @param cb_fn + * User supplied callback function to be called. + * @param cb_arg + * Pointer to parameter that will be passed to the callback. + * + * @return + * Zero on success, negative value on failure. + */ +int __rte_experimental +rte_bbdev_callback_register(uint16_t dev_id, enum rte_bbdev_event_type event, + rte_bbdev_cb_fn cb_fn, void *cb_arg); + +/** + * Unregister a callback function for specific device id. + * + * @param dev_id + * The device identifier. + * @param event + * The event that the callback will be unregistered for. + * @param cb_fn + * User supplied callback function to be unregistered. + * @param cb_arg + * Pointer to the parameter supplied when registering the callback. 
+ * (void *)-1 means to remove all registered callbacks with the specified + * function address. + * + * @return + * - 0 on success + * - EINVAL if invalid parameter pointer is provided + * - EAGAIN if the provided callback pointer does not exist + */ +int __rte_experimental +rte_bbdev_callback_unregister(uint16_t dev_id, enum rte_bbdev_event_type event, + rte_bbdev_cb_fn cb_fn, void *cb_arg); + +/** + * Enable a one-shot interrupt on the next operation enqueued to a particular + * queue. The interrupt will be triggered when the operation is ready to be + * dequeued. To handle the interrupt, an epoll file descriptor must be + * registered using rte_bbdev_queue_intr_ctl(), and then an application + * thread/lcore can wait for the interrupt using rte_epoll_wait(). + * + * @param dev_id + * The device identifier. + * @param queue_id + * The index of the queue. + * + * @return + * - 0 on success + * - negative value on failure - as returned from PMD driver + */ +int __rte_experimental +rte_bbdev_queue_intr_enable(uint16_t dev_id, uint16_t queue_id); + +/** + * Disable a one-shot interrupt on the next operation enqueued to a particular + * queue (if it has been enabled). + * + * @param dev_id + * The device identifier. + * @param queue_id + * The index of the queue. + * + * @return + * - 0 on success + * - negative value on failure - as returned from PMD driver + */ +int __rte_experimental +rte_bbdev_queue_intr_disable(uint16_t dev_id, uint16_t queue_id); + +/** + * Control interface for per-queue interrupts. + * + * @param dev_id + * The device identifier. + * @param queue_id + * The index of the queue. + * @param epfd + * Epoll file descriptor that will be associated with the interrupt source. + * If the special value RTE_EPOLL_PER_THREAD is provided, a per thread epoll + * file descriptor created by the EAL is used (RTE_EPOLL_PER_THREAD can also + * be used when calling rte_epoll_wait()). + * @param op + * The operation be performed for the vector.RTE_INTR_EVENT_ADD or + * RTE_INTR_EVENT_DEL. + * @param data + * User context, that will be returned in the epdata.data field of the + * rte_epoll_event structure filled in by rte_epoll_wait(). 
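+ *
+ * A possible wait sequence is sketched below for illustration; it assumes
+ * operations are enqueued elsewhere, uses the EAL per-thread epoll instance
+ * and omits error handling (dev_id and queue_id are application values):
+ *
+ * @code
+ * struct rte_epoll_event ev;
+ *
+ * rte_bbdev_queue_intr_ctl(dev_id, queue_id, RTE_EPOLL_PER_THREAD,
+ *                 RTE_INTR_EVENT_ADD, NULL);
+ * rte_bbdev_queue_intr_enable(dev_id, queue_id);
+ * rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
+ * @endcode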
+ * + * @return + * - 0 on success + * - ENOTSUP if interrupts are not supported by the identified device + * - negative value on failure - as returned from PMD driver + */ +int __rte_experimental +rte_bbdev_queue_intr_ctl(uint16_t dev_id, uint16_t queue_id, int epfd, int op, + void *data); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BBDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_op.h b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_op.h new file mode 100644 index 00000000..83f62c2d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_op.h @@ -0,0 +1,598 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_BBDEV_OP_H_ +#define _RTE_BBDEV_OP_H_ + +/** + * @file rte_bbdev_op.h + * + * Defines wireless base band layer 1 operations and capabilities + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include +#include + +/* Number of columns in sub-block interleaver (36.212, section 5.1.4.1.1) */ +#define RTE_BBDEV_C_SUBBLOCK (32) +/* Maximum size of Transport Block (36.213, Table, Table 7.1.7.2.5-1) */ +#define RTE_BBDEV_MAX_TB_SIZE (391656) +/* Maximum size of Code Block (36.212, Table 5.1.3-3) */ +#define RTE_BBDEV_MAX_CB_SIZE (6144) +/* Minimum size of Code Block (36.212, Table 5.1.3-3) */ +#define RTE_BBDEV_MIN_CB_SIZE (40) +/* Maximum size of circular buffer */ +#define RTE_BBDEV_MAX_KW (18528) +/* + * Maximum number of Code Blocks in Transport Block. It is calculated based on + * maximum size of one Code Block and one Transport Block (considering CRC24A + * and CRC24B): + * (391656 + 24) / (6144 - 24) = 64 + */ +#define RTE_BBDEV_MAX_CODE_BLOCKS (64) + +/** Flags for turbo decoder operation and capability structure */ +enum rte_bbdev_op_td_flag_bitmasks { + /**< If sub block de-interleaving is to be performed. */ + RTE_BBDEV_TURBO_SUBBLOCK_DEINTERLEAVE = (1ULL << 0), + /**< To use CRC Type 24B (otherwise use CRC Type 24A). */ + RTE_BBDEV_TURBO_CRC_TYPE_24B = (1ULL << 1), + /**< If turbo equalization is to be performed. */ + RTE_BBDEV_TURBO_EQUALIZER = (1ULL << 2), + /**< If set, saturate soft output to +/-127 */ + RTE_BBDEV_TURBO_SOFT_OUT_SATURATE = (1ULL << 3), + /**< Set to 1 to start iteration from even, else odd; one iteration = + * max_iteration + 0.5 + */ + RTE_BBDEV_TURBO_HALF_ITERATION_EVEN = (1ULL << 4), + /**< If 0, TD stops after CRC matches; else if 1, runs to end of next + * odd iteration after CRC matches + */ + RTE_BBDEV_TURBO_CONTINUE_CRC_MATCH = (1ULL << 5), + /**< Set if soft output is required to be output */ + RTE_BBDEV_TURBO_SOFT_OUTPUT = (1ULL << 6), + /**< Set to enable early termination mode */ + RTE_BBDEV_TURBO_EARLY_TERMINATION = (1ULL << 7), + /**< Set if a device supports decoder dequeue interrupts */ + RTE_BBDEV_TURBO_DEC_INTERRUPTS = (1ULL << 9), + /**< Set if positive LLR encoded input is supported. Positive LLR value + * represents the level of confidence for bit '1', and vice versa for + * bit '0'. + * This is mutually exclusive with RTE_BBDEV_TURBO_NEG_LLR_1_BIT_IN + * when used to formalize the input data format. + */ + RTE_BBDEV_TURBO_POS_LLR_1_BIT_IN = (1ULL << 10), + /**< Set if negative LLR encoded input is supported. Negative LLR value + * represents the level of confidence for bit '1', and vice versa for + * bit '0'. + * This is mutually exclusive with RTE_BBDEV_TURBO_POS_LLR_1_BIT_IN + * when used to formalize the input data format. 
+ */ + RTE_BBDEV_TURBO_NEG_LLR_1_BIT_IN = (1ULL << 11), + /**< Set if positive LLR soft output is supported. Positive LLR value + * represents the level of confidence for bit '1', and vice versa for + * bit '0'. + * This is mutually exclusive with + * RTE_BBDEV_TURBO_NEG_LLR_1_BIT_SOFT_OUT when used to formalize + * the input data format. + */ + RTE_BBDEV_TURBO_POS_LLR_1_BIT_SOFT_OUT = (1ULL << 12), + /**< Set if negative LLR soft output is supported. Negative LLR value + * represents the level of confidence for bit '1', and vice versa for + * bit '0'. + * This is mutually exclusive with + * RTE_BBDEV_TURBO_POS_LLR_1_BIT_SOFT_OUT when used to formalize the + * input data format. + */ + RTE_BBDEV_TURBO_NEG_LLR_1_BIT_SOFT_OUT = (1ULL << 13), + /**< Set if driver supports flexible parallel MAP engine decoding. If + * not supported, num_maps (number of MAP engines) argument is unusable. + */ + RTE_BBDEV_TURBO_MAP_DEC = (1ULL << 14), + /**< Set if a device supports scatter-gather functionality */ + RTE_BBDEV_TURBO_DEC_SCATTER_GATHER = (1ULL << 15), + /**< Set to keep CRC24B bits appended while decoding. Only usable when + * decoding Transport Blocks (code_block_mode = 0). + */ + RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP = (1ULL << 16) +}; + +/** Flags for turbo encoder operation and capability structure */ +enum rte_bbdev_op_te_flag_bitmasks { + /**< Ignore rv_index and set K0 = 0 */ + RTE_BBDEV_TURBO_RV_INDEX_BYPASS = (1ULL << 0), + /**< If rate matching is to be performed */ + RTE_BBDEV_TURBO_RATE_MATCH = (1ULL << 1), + /**< This bit must be set to enable CRC-24B generation */ + RTE_BBDEV_TURBO_CRC_24B_ATTACH = (1ULL << 2), + /**< This bit must be set to enable CRC-24A generation */ + RTE_BBDEV_TURBO_CRC_24A_ATTACH = (1ULL << 3), + /**< Set if a device supports encoder dequeue interrupts */ + RTE_BBDEV_TURBO_ENC_INTERRUPTS = (1ULL << 4), + /**< Set if a device supports scatter-gather functionality */ + RTE_BBDEV_TURBO_ENC_SCATTER_GATHER = (1ULL << 5) +}; + +/**< Data input and output buffer for BBDEV operations */ +struct rte_bbdev_op_data { + /**< The mbuf data structure representing the data for BBDEV operation. + * + * This mbuf pointer can point to one Code Block (CB) data buffer or + * multiple CBs contiguously located next to each other. + * A Transport Block (TB) represents a whole piece of data that is + * divided into one or more CBs. Maximum number of CBs can be contained + * in one TB is defined by RTE_BBDEV_MAX_CODE_BLOCKS. + * + * An mbuf data structure cannot represent more than one TB. The + * smallest piece of data that can be contained in one mbuf is one CB. + * An mbuf can include one contiguous CB, subset of contiguous CBs that + * are belonging to one TB, or all contiguous CBs that are belonging to + * one TB. + * + * If a BBDEV PMD supports the extended capability "Scatter-Gather", + * then it is capable of collecting (gathering) non-contiguous + * (scattered) data from multiple locations in the memory. + * This capability is reported by the capability flags: + * - RTE_BBDEV_TURBO_ENC_SCATTER_GATHER and + * - RTE_BBDEV_TURBO_DEC_SCATTER_GATHER. + * Only if a BBDEV PMD supports this feature, chained mbuf data + * structures are accepted. A chained mbuf can represent one + * non-contiguous CB or multiple non-contiguous CBs. + * If BBDEV PMD does not support this feature, it will assume inbound + * mbuf data contains one segment. + * + * The output mbuf data though is always one segment, even if the input + * was a chained mbuf. 
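+ *
+ * For illustration only, a single contiguous code block held in one mbuf
+ * segment could be described as follows, where m is an application-owned
+ * mbuf and op_data an rte_bbdev_op_data instance:
+ *
+ * @code
+ * op_data.data = m;
+ * op_data.offset = 0;
+ * op_data.length = rte_pktmbuf_data_len(m);
+ * @endcode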
+ */ + struct rte_mbuf *data; + /**< The starting point of the BBDEV (encode/decode) operation, + * in bytes. + * + * BBDEV starts to read data past this offset. + * In case of chained mbuf, this offset applies only to the first mbuf + * segment. + */ + uint32_t offset; + /**< The total data length to be processed in one operation, in bytes. + * + * In case the mbuf data is representing one CB, this is the length of + * the CB undergoing the operation. + * If it's for multiple CBs, this is the total length of those CBs + * undergoing the operation. + * If it's for one TB, this is the total length of the TB under + * operation. + * + * In case of chained mbuf, this data length includes the lengths of the + * "scattered" data segments undergoing the operation. + */ + uint32_t length; +}; + +struct rte_bbdev_op_dec_cb_params { + /**< The K size of the input CB, in bits [40:6144], as specified in + * 3GPP TS 36.212. + * This size is inclusive of CRC bits, regardless whether it was + * pre-calculated by the application or not. + */ + uint16_t k; + /**< The E length of the CB rate matched LLR output, in bytes, as in + * 3GPP TS 36.212. + */ + uint32_t e; +}; + +struct rte_bbdev_op_dec_tb_params { + /**< The K- size of the input CB, in bits [40:6144], that is in the + * Turbo operation when r < C-, as in 3GPP TS 36.212. + */ + uint16_t k_neg; + /**< The K+ size of the input CB, in bits [40:6144], that is in the + * Turbo operation when r >= C-, as in 3GPP TS 36.212. + */ + uint16_t k_pos; + /**< The number of CBs that have K- size, [0:63] */ + uint8_t c_neg; + /**< The total number of CBs in the TB, [1:RTE_BBDEV_MAX_CODE_BLOCKS] */ + uint8_t c; + /**< The number of CBs that uses Ea before switching to Eb, [0:63] */ + uint8_t cab; + /**< The E size of the CB rate matched output to use in the Turbo + * operation when r < cab + */ + uint32_t ea; + /**< The E size of the CB rate matched output to use in the Turbo + * operation when r >= cab + */ + uint32_t eb; +}; + +/**< Operation structure for Turbo decode. + * An operation can perform on one CB at a time "CB-mode". + * An operation can perform on one or multiple CBs that are logically belonging + * to one TB "TB-mode". + * The provided K size parameter of the CB is its size out coming from the + * decode operation. + * CRC24A/B check is requested by the application by setting the flag + * RTE_BBDEV_TURBO_CRC_TYPE_24B for CRC24B check or CRC24A otherwise. + * In TB-mode, BBDEV concatenates the decoded CBs one next to the other with + * relevant CRC24B in between. + * + * The input encoded CB data is the Virtual Circular Buffer data stream, wk, + * with the null padding included as described in 3GPP TS 36.212 + * section 5.1.4.1.2 and shown in 3GPP TS 36.212 section 5.1.4.1 Figure 5.1.4-1. + * The size of the virtual circular buffer is 3*Kpi, where Kpi is the 32 byte + * aligned value of K, as specified in 3GPP TS 36.212 section 5.1.4.1.1. + * + * Each byte in the input circular buffer is the LLR value of each bit of the + * original CB. + * + * Hard output is a mandatory capability that all BBDEV PMDs support. This is + * the decoded CBs of K sizes (CRC24A/B is the last 24-bit in each decoded CB). + * Soft output is an optional capability for BBDEV PMDs. If supported, an LLR + * rate matched output is computed in the soft_output buffer structure. + * + * The output mbuf data structure is expected to be allocated by the + * application with enough room for the output data. 
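+ *
+ * A minimal CB-mode decode descriptor might be filled as sketched below.
+ * This is illustrative only: dec_op_pool, in_mbuf and out_mbuf are
+ * application-provided, the k/e values are placeholders and error handling
+ * is omitted:
+ *
+ * @code
+ * struct rte_bbdev_dec_op *op;
+ *
+ * rte_mempool_get(dec_op_pool, (void **)&op);
+ * op->turbo_dec.input.data = in_mbuf;
+ * op->turbo_dec.input.offset = 0;
+ * op->turbo_dec.input.length = rte_pktmbuf_data_len(in_mbuf);
+ * op->turbo_dec.hard_output.data = out_mbuf;
+ * op->turbo_dec.code_block_mode = 1;
+ * op->turbo_dec.cb_params.k = 6144;
+ * op->turbo_dec.cb_params.e = 18448;
+ * op->turbo_dec.iter_max = 8;
+ * @endcode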
+ */ +struct rte_bbdev_op_turbo_dec { + /**< The Virtual Circular Buffer, wk, size 3*Kpi for each CB */ + struct rte_bbdev_op_data input; + /**< The hard decisions buffer for the decoded output, + * size K for each CB + */ + struct rte_bbdev_op_data hard_output; + /**< The soft LLR output buffer - optional */ + struct rte_bbdev_op_data soft_output; + + uint32_t op_flags; /**< Flags from rte_bbdev_op_td_flag_bitmasks */ + uint8_t rv_index; /**< Rv index for rate matching [0:3] */ + /**< The minimum number of iterations to perform in decoding all CBs in + * this operation - input + */ + uint8_t iter_min:4; + /**< The maximum number of iterations to perform in decoding all CBs in + * this operation - input + */ + uint8_t iter_max:4; + /**< The maximum number of iterations that were perform in decoding all + * CBs in this decode operation - output + */ + uint8_t iter_count; + /**< 5 bit extrinsic scale (scale factor on extrinsic info) */ + uint8_t ext_scale; + /**< Number of MAP engines to use in decode, + * must be power of 2 (or 0 to auto-select) + */ + uint8_t num_maps; + + uint8_t code_block_mode; /**< [0 - TB : 1 - CB] */ + union { + /**< Struct which stores Code Block specific parameters */ + struct rte_bbdev_op_dec_cb_params cb_params; + /**< Struct which stores Transport Block specific parameters */ + struct rte_bbdev_op_dec_tb_params tb_params; + }; +}; + +struct rte_bbdev_op_enc_cb_params { + /**< The K size of the input CB, in bits [40:6144], as specified in + * 3GPP TS 36.212. + * This size is inclusive of CRC24A, regardless whether it was + * pre-calculated by the application or not. + */ + uint16_t k; + /**< The E length of the CB rate matched output, in bits, as in + * 3GPP TS 36.212. + */ + uint32_t e; + /**< The Ncb soft buffer size of the CB rate matched output [K:3*Kpi], + * in bits, as specified in 3GPP TS 36.212. + */ + uint16_t ncb; +}; + +struct rte_bbdev_op_enc_tb_params { + /**< The K- size of the input CB, in bits [40:6144], that is in the + * Turbo operation when r < C-, as in 3GPP TS 36.212. + * This size is inclusive of CRC24B, regardless whether it was + * pre-calculated and appended by the application or not. + */ + uint16_t k_neg; + /**< The K+ size of the input CB, in bits [40:6144], that is in the + * Turbo operation when r >= C-, as in 3GPP TS 36.212. + * This size is inclusive of CRC24B, regardless whether it was + * pre-calculated and appended by the application or not. + */ + uint16_t k_pos; + /**< The number of CBs that have K- size, [0:63] */ + uint8_t c_neg; + /**< The total number of CBs in the TB, [1:RTE_BBDEV_MAX_CODE_BLOCKS] */ + uint8_t c; + /**< The number of CBs that uses Ea before switching to Eb, [0:63] */ + uint8_t cab; + /**< The E size of the CB rate matched output to use in the Turbo + * operation when r < cab + */ + uint32_t ea; + /**< The E size of the CB rate matched output to use in the Turbo + * operation when r >= cab + */ + uint32_t eb; + /**< The Ncb soft buffer size for the rate matched CB that is used in + * the Turbo operation when r < C-, [K:3*Kpi] + */ + uint16_t ncb_neg; + /**< The Ncb soft buffer size for the rate matched CB that is used in + * the Turbo operation when r >= C-, [K:3*Kpi] + */ + uint16_t ncb_pos; + /**< The index of the first CB in the inbound mbuf data, default is 0 */ + uint8_t r; +}; + +/**< Operation structure for Turbo encode. + * An operation can perform on one CB at a time "CB-mode". + * An operation can perform on one or multiple CBs that are logically + * belonging to one TB "TB-mode". 
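+ *
+ * A minimal CB-mode encode descriptor might be filled as sketched below
+ * (illustrative only; enc_op_pool, in_mbuf and out_mbuf are
+ * application-provided and the k/e/ncb values are placeholders):
+ *
+ * @code
+ * struct rte_bbdev_enc_op *op;
+ *
+ * rte_mempool_get(enc_op_pool, (void **)&op);
+ * op->turbo_enc.input.data = in_mbuf;
+ * op->turbo_enc.input.offset = 0;
+ * op->turbo_enc.input.length = rte_pktmbuf_data_len(in_mbuf);
+ * op->turbo_enc.output.data = out_mbuf;
+ * op->turbo_enc.op_flags = RTE_BBDEV_TURBO_RATE_MATCH;
+ * op->turbo_enc.code_block_mode = 1;
+ * op->turbo_enc.cb_params.k = 6144;
+ * op->turbo_enc.cb_params.e = 18448;
+ * op->turbo_enc.cb_params.ncb = 18432;
+ * @endcode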
+ * + * In CB-mode, CRC24A/B is an optional operation. K size parameter is not + * affected by CRC24A/B inclusion, this only affects the inbound mbuf data + * length. Not all BBDEV PMDs are capable of CRC24A/B calculation. Flags + * RTE_BBDEV_TURBO_CRC_24A_ATTACH and RTE_BBDEV_TURBO_CRC_24B_ATTACH informs + * the application with relevant capability. These flags can be set in the + * op_flags parameter to indicate BBDEV to calculate and append CRC24A to CB + * before going forward with Turbo encoding. + * + * In TB-mode, CRC24A is assumed to be pre-calculated and appended to the + * inbound TB mbuf data buffer. + * + * The output mbuf data structure is expected to be allocated by the + * application with enough room for the output data. + */ +struct rte_bbdev_op_turbo_enc { + /**< The input CB or TB data */ + struct rte_bbdev_op_data input; + /**< The rate matched CB or TB output buffer */ + struct rte_bbdev_op_data output; + + uint32_t op_flags; /**< Flags from rte_bbdev_op_te_flag_bitmasks */ + uint8_t rv_index; /**< Rv index for rate matching [0:3] */ + + uint8_t code_block_mode; /**< [0 - TB : 1 - CB] */ + union { + /**< Struct which stores Code Block specific parameters */ + struct rte_bbdev_op_enc_cb_params cb_params; + /**< Struct which stores Transport Block specific parameters */ + struct rte_bbdev_op_enc_tb_params tb_params; + }; +}; + +/**< List of the capabilities for the Turbo Decoder */ +struct rte_bbdev_op_cap_turbo_dec { + /**< Flags from rte_bbdev_op_td_flag_bitmasks */ + uint32_t capability_flags; + /** Maximal LLR absolute value. Acceptable LLR values lie in range + * [-max_llr_modulus, max_llr_modulus]. + */ + int8_t max_llr_modulus; + uint8_t num_buffers_src; /**< Num input code block buffers */ + /**< Num hard output code block buffers */ + uint8_t num_buffers_hard_out; + /**< Num soft output code block buffers if supported by the driver */ + uint8_t num_buffers_soft_out; +}; + +/**< List of the capabilities for the Turbo Encoder */ +struct rte_bbdev_op_cap_turbo_enc { + /**< Flags from rte_bbdev_op_te_flag_bitmasks */ + uint32_t capability_flags; + uint8_t num_buffers_src; /**< Num input code block buffers */ + uint8_t num_buffers_dst; /**< Num output code block buffers */ +}; + +/** Different operation types supported by the device */ +enum rte_bbdev_op_type { + RTE_BBDEV_OP_NONE, /**< Dummy operation that does nothing */ + RTE_BBDEV_OP_TURBO_DEC, /**< Turbo decode */ + RTE_BBDEV_OP_TURBO_ENC, /**< Turbo encode */ + RTE_BBDEV_OP_TYPE_COUNT, /**< Count of different op types */ +}; + +/**< Bit indexes of possible errors reported through status field */ +enum { + RTE_BBDEV_DRV_ERROR, + RTE_BBDEV_DATA_ERROR, + RTE_BBDEV_CRC_ERROR, +}; + +/**< Structure specifying a single encode operation */ +struct rte_bbdev_enc_op { + int status; /**< Status of operation that was performed */ + struct rte_mempool *mempool; /**< Mempool which op instance is in */ + void *opaque_data; /**< Opaque pointer for user data */ + /**< Contains encoder specific parameters */ + struct rte_bbdev_op_turbo_enc turbo_enc; +}; + +/**< Structure specifying a single decode operation */ +struct rte_bbdev_dec_op { + int status; /**< Status of operation that was performed */ + struct rte_mempool *mempool; /**< Mempool which op instance is in */ + void *opaque_data; /**< Opaque pointer for user data */ + /**< Contains decoder specific parameters */ + struct rte_bbdev_op_turbo_dec turbo_dec; +}; + +/**< Operation capabilities supported by a device */ +struct rte_bbdev_op_cap { + enum rte_bbdev_op_type type; 
/**< Type of operation */ + union { + struct rte_bbdev_op_cap_turbo_dec turbo_dec; + struct rte_bbdev_op_cap_turbo_enc turbo_enc; + } cap; /**< Operation-type specific capabilities */ +}; + +/**< @internal Private data structure stored with operation pool. */ +struct rte_bbdev_op_pool_private { + enum rte_bbdev_op_type type; /**< Type of operations in a pool */ +}; + +/** + * Converts queue operation type from enum to string + * + * @param op_type + * Operation type as enum + * + * @returns + * Operation type as string or NULL if op_type is invalid + * + */ +const char* +rte_bbdev_op_type_str(enum rte_bbdev_op_type op_type); + +/** + * Creates a bbdev operation mempool + * + * @param name + * Pool name. + * @param type + * Operation type, use RTE_BBDEV_OP_NONE for a pool which supports all + * operation types. + * @param num_elements + * Number of elements in the pool. + * @param cache_size + * Number of elements to cache on an lcore, see rte_mempool_create() for + * further details about cache size. + * @param socket_id + * Socket to allocate memory on. + * + * @return + * - Pointer to a mempool on success, + * - NULL pointer on failure. + */ +struct rte_mempool * +rte_bbdev_op_pool_create(const char *name, enum rte_bbdev_op_type type, + unsigned int num_elements, unsigned int cache_size, + int socket_id); + +/** + * Bulk allocate encode operations from a mempool with parameter defaults reset. + * + * @param mempool + * Operation mempool, created by rte_bbdev_op_pool_create(). + * @param ops + * Output array to place allocated operations + * @param num_ops + * Number of operations to allocate + * + * @returns + * - 0 on success + * - EINVAL if invalid mempool is provided + */ +static inline int +rte_bbdev_enc_op_alloc_bulk(struct rte_mempool *mempool, + struct rte_bbdev_enc_op **ops, uint16_t num_ops) +{ + struct rte_bbdev_op_pool_private *priv; + int ret; + + /* Check type */ + priv = (struct rte_bbdev_op_pool_private *) + rte_mempool_get_priv(mempool); + if (unlikely(priv->type != RTE_BBDEV_OP_TURBO_ENC)) + return -EINVAL; + + /* Get elements */ + ret = rte_mempool_get_bulk(mempool, (void **)ops, num_ops); + if (unlikely(ret < 0)) + return ret; + + return 0; +} + +/** + * Bulk allocate decode operations from a mempool with parameter defaults reset. + * + * @param mempool + * Operation mempool, created by rte_bbdev_op_pool_create(). + * @param ops + * Output array to place allocated operations + * @param num_ops + * Number of operations to allocate + * + * @returns + * - 0 on success + * - EINVAL if invalid mempool is provided + */ +static inline int +rte_bbdev_dec_op_alloc_bulk(struct rte_mempool *mempool, + struct rte_bbdev_dec_op **ops, uint16_t num_ops) +{ + struct rte_bbdev_op_pool_private *priv; + int ret; + + /* Check type */ + priv = (struct rte_bbdev_op_pool_private *) + rte_mempool_get_priv(mempool); + if (unlikely(priv->type != RTE_BBDEV_OP_TURBO_DEC)) + return -EINVAL; + + /* Get elements */ + ret = rte_mempool_get_bulk(mempool, (void **)ops, num_ops); + if (unlikely(ret < 0)) + return ret; + + return 0; +} + +/** + * Free decode operation structures that were allocated by + * rte_bbdev_dec_op_alloc_bulk(). + * All structures must belong to the same mempool. 
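+ *
+ * A minimal, illustrative round trip (pool name, counts and socket choice are
+ * assumptions, not taken from this header):
+ *
+ * @code
+ * struct rte_mempool *mp = rte_bbdev_op_pool_create("dec_op_pool",
+ *         RTE_BBDEV_OP_TURBO_DEC, 1024, 128, rte_socket_id());
+ * struct rte_bbdev_dec_op *ops[16];
+ * if (mp != NULL && rte_bbdev_dec_op_alloc_bulk(mp, ops, 16) == 0) {
+ *         // ... fill ops[i]->turbo_dec, enqueue and dequeue here ...
+ *         rte_bbdev_dec_op_free_bulk(ops, 16);
+ * }
+ * @endcode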
+ * + * @param ops + * Operation structures + * @param num_ops + * Number of structures + */ +static inline void +rte_bbdev_dec_op_free_bulk(struct rte_bbdev_dec_op **ops, unsigned int num_ops) +{ + if (num_ops > 0) + rte_mempool_put_bulk(ops[0]->mempool, (void **)ops, num_ops); +} + +/** + * Free encode operation structures that were allocated by + * rte_bbdev_enc_op_alloc_bulk(). + * All structures must belong to the same mempool. + * + * @param ops + * Operation structures + * @param num_ops + * Number of structures + */ +static inline void +rte_bbdev_enc_op_free_bulk(struct rte_bbdev_enc_op **ops, unsigned int num_ops) +{ + if (num_ops > 0) + rte_mempool_put_bulk(ops[0]->mempool, (void **)ops, num_ops); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BBDEV_OP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h new file mode 100644 index 00000000..db9a04cd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_BBDEV_PMD_H_ +#define _RTE_BBDEV_PMD_H_ + +/** + * @file rte_bbdev_pmd.h + * + * Wireless base band driver-facing APIs. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * This API provides the mechanism for device drivers to register with the + * bbdev interface. User applications should not use this API. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "rte_bbdev.h" + +/** Suggested value for SW based devices */ +#define RTE_BBDEV_DEFAULT_MAX_NB_QUEUES RTE_MAX_LCORE + +/** Suggested value for SW based devices */ +#define RTE_BBDEV_QUEUE_SIZE_LIMIT 16384 + +/** + * @internal + * Allocates a new slot for a bbdev and returns the pointer to that slot + * for the driver to use. + * + * @param name + * Unique identifier name for each bbdev device + * + * @return + * - Slot in the rte_bbdev array for a new device; + */ +struct rte_bbdev * __rte_experimental +rte_bbdev_allocate(const char *name); + +/** + * @internal + * Release the specified bbdev. + * + * @param bbdev + * The *bbdev* pointer is the address of the *rte_bbdev* structure. + * @return + * - 0 on success, negative on error + */ +int __rte_experimental +rte_bbdev_release(struct rte_bbdev *bbdev); + +/** + * Get the device structure for a named device. + * + * @param name + * Name of the device + * + * @return + * - The device structure pointer, or + * - NULL otherwise + * + */ +struct rte_bbdev * __rte_experimental +rte_bbdev_get_named_dev(const char *name); + +/** + * Definitions of all functions exported by a driver through the the generic + * structure of type *rte_bbdev_ops* supplied in the *rte_bbdev* structure + * associated with a device. + */ + +/** @internal Function used to configure device memory. */ +typedef int (*rte_bbdev_setup_queues_t)(struct rte_bbdev *dev, + uint16_t num_queues, int socket_id); + +/** @internal Function used to configure interrupts for a device. */ +typedef int (*rte_bbdev_intr_enable_t)(struct rte_bbdev *dev); + +/** @internal Function to allocate and configure a device queue. */ +typedef int (*rte_bbdev_queue_setup_t)(struct rte_bbdev *dev, + uint16_t queue_id, const struct rte_bbdev_queue_conf *conf); + +/* + * @internal + * Function to release memory resources allocated for a device queue. 
+ */ +typedef int (*rte_bbdev_queue_release_t)(struct rte_bbdev *dev, + uint16_t queue_id); + +/** @internal Function to start a configured device. */ +typedef int (*rte_bbdev_start_t)(struct rte_bbdev *dev); + +/** @internal Function to stop a device. */ +typedef void (*rte_bbdev_stop_t)(struct rte_bbdev *dev); + +/** @internal Function to close a device. */ +typedef int (*rte_bbdev_close_t)(struct rte_bbdev *dev); + +/** @internal Function to start a device queue. */ +typedef int (*rte_bbdev_queue_start_t)(struct rte_bbdev *dev, + uint16_t queue_id); + +/** @internal Function to stop a device queue. */ +typedef int (*rte_bbdev_queue_stop_t)(struct rte_bbdev *dev, uint16_t queue_id); + +/** @internal Function to read stats from a device. */ +typedef void (*rte_bbdev_stats_get_t)(struct rte_bbdev *dev, + struct rte_bbdev_stats *stats); + +/** @internal Function to reset stats on a device. */ +typedef void (*rte_bbdev_stats_reset_t)(struct rte_bbdev *dev); + +/** @internal Function to retrieve specific information of a device. */ +typedef void (*rte_bbdev_info_get_t)(struct rte_bbdev *dev, + struct rte_bbdev_driver_info *dev_info); + +/* + * @internal + * Function to enable interrupt for next op on a queue of a device. + */ +typedef int (*rte_bbdev_queue_intr_enable_t)(struct rte_bbdev *dev, + uint16_t queue_id); + +/* + * @internal + * Function to disable interrupt for next op on a queue of a device. + */ +typedef int (*rte_bbdev_queue_intr_disable_t)(struct rte_bbdev *dev, + uint16_t queue_id); + +/** + * Operations implemented by drivers. Fields marked as "Required" must be + * provided by a driver for a device to have basic functionality. "Optional" + * fields are for non-vital operations + */ +struct rte_bbdev_ops { + /**< Allocate and configure device memory. Optional. */ + rte_bbdev_setup_queues_t setup_queues; + /**< Configure interrupts. Optional. */ + rte_bbdev_intr_enable_t intr_enable; + /**< Start device. Optional. */ + rte_bbdev_start_t start; + /**< Stop device. Optional. */ + rte_bbdev_stop_t stop; + /**< Close device. Optional. */ + rte_bbdev_close_t close; + + /**< Get device info. Required. */ + rte_bbdev_info_get_t info_get; + /** Get device statistics. Optional. */ + rte_bbdev_stats_get_t stats_get; + /** Reset device statistics. Optional. */ + rte_bbdev_stats_reset_t stats_reset; + + /** Set up a device queue. Required. */ + rte_bbdev_queue_setup_t queue_setup; + /** Release a queue. Required. */ + rte_bbdev_queue_release_t queue_release; + /** Start a queue. Optional. */ + rte_bbdev_queue_start_t queue_start; + /**< Stop a queue pair. Optional. */ + rte_bbdev_queue_stop_t queue_stop; + + /** Enable queue interrupt. Optional */ + rte_bbdev_queue_intr_enable_t queue_intr_enable; + /** Disable queue interrupt. Optional */ + rte_bbdev_queue_intr_disable_t queue_intr_disable; +}; + +/** + * Executes all the user application registered callbacks for the specific + * device and event type. + * + * @param dev + * Pointer to the device structure. + * @param event + * Event type. + * @param ret_param + * To pass data back to user application. 
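+ *
+ * Illustrative call only; the event value is assumed to come from
+ * enum rte_bbdev_event_type in rte_bbdev.h:
+ *
+ * @code
+ * rte_bbdev_pmd_callback_process(dev, RTE_BBDEV_EVENT_ERROR, NULL);
+ * @endcode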
+ */ +void __rte_experimental +rte_bbdev_pmd_callback_process(struct rte_bbdev *dev, + enum rte_bbdev_event_type event, void *ret_param); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BBDEV_PMD_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_version.map b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_version.map new file mode 100644 index 00000000..d3b81eab --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bbdev/rte_bbdev_version.map @@ -0,0 +1,37 @@ +EXPERIMENTAL { + global: + + rte_bbdev_allocate; + rte_bbdev_callback_register; + rte_bbdev_callback_unregister; + rte_bbdev_close; + rte_bbdev_count; + rte_bbdev_dequeue_dec_ops; + rte_bbdev_dequeue_enc_ops; + rte_bbdev_devices; + rte_bbdev_enqueue_dec_ops; + rte_bbdev_enqueue_enc_ops; + rte_bbdev_find_next; + rte_bbdev_get_named_dev; + rte_bbdev_info_get; + rte_bbdev_intr_enable; + rte_bbdev_is_valid; + rte_bbdev_op_pool_create; + rte_bbdev_op_type_str; + rte_bbdev_pmd_callback_process; + rte_bbdev_queue_configure; + rte_bbdev_queue_info_get; + rte_bbdev_queue_intr_ctl; + rte_bbdev_queue_intr_disable; + rte_bbdev_queue_intr_enable; + rte_bbdev_queue_start; + rte_bbdev_queue_stop; + rte_bbdev_release; + rte_bbdev_setup_queues; + rte_bbdev_start; + rte_bbdev_stats_get; + rte_bbdev_stats_reset; + rte_bbdev_stop; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_bitratestats/Makefile b/src/spdk/dpdk/lib/librte_bitratestats/Makefile new file mode 100644 index 00000000..59318388 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bitratestats/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_bitratestats.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +LDLIBS += -lrte_eal -lrte_metrics -lrte_ethdev + +EXPORT_MAP := rte_bitratestats_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_BITRATE) := rte_bitrate.c + +# Install header file +SYMLINK-$(CONFIG_RTE_LIBRTE_BITRATE)-include += rte_bitrate.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_bitratestats/meson.build b/src/spdk/dpdk/lib/librte_bitratestats/meson.build new file mode 100644 index 00000000..c35b62b3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bitratestats/meson.build @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 2 +sources = files('rte_bitrate.c') +headers = files('rte_bitrate.h') +deps += ['ethdev', 'metrics'] diff --git a/src/spdk/dpdk/lib/librte_bitratestats/rte_bitrate.c b/src/spdk/dpdk/lib/librte_bitratestats/rte_bitrate.c new file mode 100644 index 00000000..c4b28f62 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bitratestats/rte_bitrate.c @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +/* + * Persistent bit-rate data. 
+ * @internal + */ +struct rte_stats_bitrate { + uint64_t last_ibytes; + uint64_t last_obytes; + uint64_t peak_ibits; + uint64_t peak_obits; + uint64_t mean_ibits; + uint64_t mean_obits; + uint64_t ewma_ibits; + uint64_t ewma_obits; +}; + +struct rte_stats_bitrates { + struct rte_stats_bitrate port_stats[RTE_MAX_ETHPORTS]; + uint16_t id_stats_set; +}; + +struct rte_stats_bitrates * +rte_stats_bitrate_create(void) +{ + return rte_zmalloc(NULL, sizeof(struct rte_stats_bitrates), + RTE_CACHE_LINE_SIZE); +} + +int +rte_stats_bitrate_reg(struct rte_stats_bitrates *bitrate_data) +{ + const char * const names[] = { + "ewma_bits_in", "ewma_bits_out", + "mean_bits_in", "mean_bits_out", + "peak_bits_in", "peak_bits_out", + }; + int return_value; + + if (bitrate_data == NULL) + return -EINVAL; + + return_value = rte_metrics_reg_names(&names[0], ARRAY_SIZE(names)); + if (return_value >= 0) + bitrate_data->id_stats_set = return_value; + return return_value; +} + +int +rte_stats_bitrate_calc(struct rte_stats_bitrates *bitrate_data, + uint16_t port_id) +{ + struct rte_stats_bitrate *port_data; + struct rte_eth_stats eth_stats; + int ret_code; + uint64_t cnt_bits; + int64_t delta; + const int64_t alpha_percent = 20; + uint64_t values[6]; + + if (bitrate_data == NULL) + return -EINVAL; + + ret_code = rte_eth_stats_get(port_id, ð_stats); + if (ret_code != 0) + return ret_code; + + port_data = &bitrate_data->port_stats[port_id]; + + /* Incoming bitrate. This is an iteratively calculated EWMA + * (Exponentially Weighted Moving Average) that uses a + * weighting factor of alpha_percent. An unsmoothed mean + * for just the current time delta is also calculated for the + * benefit of people who don't understand signal processing. + */ + cnt_bits = (eth_stats.ibytes - port_data->last_ibytes) << 3; + port_data->last_ibytes = eth_stats.ibytes; + if (cnt_bits > port_data->peak_ibits) + port_data->peak_ibits = cnt_bits; + delta = cnt_bits; + delta -= port_data->ewma_ibits; + /* The +-50 fixes integer rounding during division */ + if (delta > 0) + delta = (delta * alpha_percent + 50) / 100; + else + delta = (delta * alpha_percent - 50) / 100; + port_data->ewma_ibits += delta; + /* Integer roundoff prevents EWMA between 0 and (100/alpha_percent) + * ever reaching zero in no-traffic conditions + */ + if (cnt_bits == 0 && delta == 0) + port_data->ewma_ibits = 0; + port_data->mean_ibits = cnt_bits; + + /* Outgoing bitrate (also EWMA) */ + cnt_bits = (eth_stats.obytes - port_data->last_obytes) << 3; + port_data->last_obytes = eth_stats.obytes; + if (cnt_bits > port_data->peak_obits) + port_data->peak_obits = cnt_bits; + delta = cnt_bits; + delta -= port_data->ewma_obits; + if (delta > 0) + delta = (delta * alpha_percent + 50) / 100; + else + delta = (delta * alpha_percent - 50) / 100; + port_data->ewma_obits += delta; + if (cnt_bits == 0 && delta == 0) + port_data->ewma_obits = 0; + port_data->mean_obits = cnt_bits; + + values[0] = port_data->ewma_ibits; + values[1] = port_data->ewma_obits; + values[2] = port_data->mean_ibits; + values[3] = port_data->mean_obits; + values[4] = port_data->peak_ibits; + values[5] = port_data->peak_obits; + rte_metrics_update_values(port_id, bitrate_data->id_stats_set, + values, ARRAY_SIZE(values)); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_bitratestats/rte_bitrate.h b/src/spdk/dpdk/lib/librte_bitratestats/rte_bitrate.h new file mode 100644 index 00000000..ef10f22f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bitratestats/rte_bitrate.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: 
BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_BITRATE_H_ +#define _RTE_BITRATE_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Bitrate statistics data structure. + * This data structure is intentionally opaque. + */ +struct rte_stats_bitrates; + + +/** + * Allocate a bitrate statistics structure + * + * @return + * - Pointer to structure on success + * - NULL on error (zmalloc failure) + */ +struct rte_stats_bitrates *rte_stats_bitrate_create(void); + + +/** + * Register bitrate statistics with the metric library. + * + * @param bitrate_data + * Pointer allocated by rte_stats_create() + * + * @return + * Zero on success + * Negative on error + */ +int rte_stats_bitrate_reg(struct rte_stats_bitrates *bitrate_data); + + +/** + * Calculate statistics for current time window. The period with which + * this function is called should be the intended sampling window width. + * + * @param bitrate_data + * Bitrate statistics data pointer + * + * @param port_id + * Port id to calculate statistics for + * + * @return + * - Zero on success + * - Negative value on error + */ +int rte_stats_bitrate_calc(struct rte_stats_bitrates *bitrate_data, + uint16_t port_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BITRATE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bitratestats/rte_bitratestats_version.map b/src/spdk/dpdk/lib/librte_bitratestats/rte_bitratestats_version.map new file mode 100644 index 00000000..fe745445 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bitratestats/rte_bitratestats_version.map @@ -0,0 +1,9 @@ +DPDK_17.05 { + global: + + rte_stats_bitrate_calc; + rte_stats_bitrate_create; + rte_stats_bitrate_reg; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_bpf/Makefile b/src/spdk/dpdk/lib/librte_bpf/Makefile new file mode 100644 index 00000000..c0e8aaa6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/Makefile @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_bpf.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +CFLAGS += -DALLOW_EXPERIMENTAL_API +LDLIBS += -lrte_net -lrte_eal +LDLIBS += -lrte_mempool -lrte_ring +LDLIBS += -lrte_mbuf -lrte_ethdev +ifeq ($(CONFIG_RTE_LIBRTE_BPF_ELF),y) +LDLIBS += -lelf +endif + +EXPORT_MAP := rte_bpf_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf.c +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_exec.c +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load.c +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_pkt.c +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_validate.c +ifeq ($(CONFIG_RTE_LIBRTE_BPF_ELF),y) +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load_elf.c +endif +ifeq ($(CONFIG_RTE_ARCH_X86_64),y) +SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_jit_x86.c +endif + +# install header files +SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += bpf_def.h +SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf.h +SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf_ethdev.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_bpf/bpf.c b/src/spdk/dpdk/lib/librte_bpf/bpf.c new file mode 100644 index 00000000..f590c8c3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bpf_impl.h" + +int rte_bpf_logtype; + +__rte_experimental void +rte_bpf_destroy(struct rte_bpf *bpf) +{ 
+ if (bpf != NULL) { + if (bpf->jit.func != NULL) + munmap(bpf->jit.func, bpf->jit.sz); + munmap(bpf, bpf->sz); + } +} + +__rte_experimental int +rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit) +{ + if (bpf == NULL || jit == NULL) + return -EINVAL; + + jit[0] = bpf->jit; + return 0; +} + +int +bpf_jit(struct rte_bpf *bpf) +{ + int32_t rc; + +#ifdef RTE_ARCH_X86_64 + rc = bpf_jit_x86(bpf); +#else + rc = -ENOTSUP; +#endif + + if (rc != 0) + RTE_BPF_LOG(WARNING, "%s(%p) failed, error code: %d;\n", + __func__, bpf, rc); + return rc; +} + +RTE_INIT(rte_bpf_init_log) +{ + rte_bpf_logtype = rte_log_register("lib.bpf"); + if (rte_bpf_logtype >= 0) + rte_log_set_level(rte_bpf_logtype, RTE_LOG_INFO); +} diff --git a/src/spdk/dpdk/lib/librte_bpf/bpf_def.h b/src/spdk/dpdk/lib/librte_bpf/bpf_def.h new file mode 100644 index 00000000..c10f3aec --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf_def.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef _RTE_BPF_DEF_H_ +#define _RTE_BPF_DEF_H_ + +/** + * @file + * + * classic BPF (cBPF) and extended BPF (eBPF) related defines. + * For more information regarding cBPF and eBPF ISA and their differences, + * please refer to: + * https://www.kernel.org/doc/Documentation/networking/filter.txt. + * As a rule of thumb for that file: + * all definitions used by both cBPF and eBPF start with bpf(BPF)_ prefix, + * while eBPF only ones start with ebpf(EBPF)) prefix. + */ + +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The instruction encodings. + */ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +#define EBPF_ALU64 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 +#define BPF_H 0x08 +#define BPF_B 0x10 +#define EBPF_DW 0x18 + +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +#define EBPF_XADD 0xc0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define EBPF_MOV 0xb0 +#define EBPF_ARSH 0xc0 +#define EBPF_END 0xd0 + +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 + +#define EBPF_JNE 0x50 +#define EBPF_JSGT 0x60 +#define EBPF_JSGE 0x70 +#define EBPF_CALL 0x80 +#define EBPF_EXIT 0x90 +#define EBPF_JLT 0xa0 +#define EBPF_JLE 0xb0 +#define EBPF_JSLT 0xc0 +#define EBPF_JSLE 0xd0 + +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +/* if BPF_OP(code) == EBPF_END */ +#define EBPF_TO_LE 0x00 /* convert to little-endian */ +#define EBPF_TO_BE 0x08 /* convert to big-endian */ + +/* + * eBPF registers + */ +enum { + EBPF_REG_0, /* return value from internal function/for eBPF program */ + EBPF_REG_1, /* 0-th argument to internal function */ + EBPF_REG_2, /* 1-th argument to internal function */ + EBPF_REG_3, /* 2-th argument to internal function */ + EBPF_REG_4, /* 3-th argument to 
internal function */ + EBPF_REG_5, /* 4-th argument to internal function */ + EBPF_REG_6, /* callee saved register */ + EBPF_REG_7, /* callee saved register */ + EBPF_REG_8, /* callee saved register */ + EBPF_REG_9, /* callee saved register */ + EBPF_REG_10, /* stack pointer (read-only) */ + EBPF_REG_NUM, +}; + +/* + * eBPF instruction format + */ +struct ebpf_insn { + uint8_t code; + uint8_t dst_reg:4; + uint8_t src_reg:4; + int16_t off; + int32_t imm; +}; + +/* + * eBPF allows functions with R1-R5 as arguments. + */ +#define EBPF_FUNC_MAX_ARGS (EBPF_REG_6 - EBPF_REG_1) + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_BPF_DEF_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bpf/bpf_exec.c b/src/spdk/dpdk/lib/librte_bpf/bpf_exec.c new file mode 100644 index 00000000..6a79139c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf_exec.c @@ -0,0 +1,453 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "bpf_impl.h" + +#define BPF_JMP_UNC(ins) ((ins) += (ins)->off) + +#define BPF_JMP_CND_REG(reg, ins, op, type) \ + ((ins) += \ + ((type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) ? \ + (ins)->off : 0) + +#define BPF_JMP_CND_IMM(reg, ins, op, type) \ + ((ins) += \ + ((type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) ? \ + (ins)->off : 0) + +#define BPF_NEG_ALU(reg, ins, type) \ + ((reg)[(ins)->dst_reg] = (type)(-(reg)[(ins)->dst_reg])) + +#define EBPF_MOV_ALU_REG(reg, ins, type) \ + ((reg)[(ins)->dst_reg] = (type)(reg)[(ins)->src_reg]) + +#define BPF_OP_ALU_REG(reg, ins, op, type) \ + ((reg)[(ins)->dst_reg] = \ + (type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) + +#define EBPF_MOV_ALU_IMM(reg, ins, type) \ + ((reg)[(ins)->dst_reg] = (type)(ins)->imm) + +#define BPF_OP_ALU_IMM(reg, ins, op, type) \ + ((reg)[(ins)->dst_reg] = \ + (type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) + +#define BPF_DIV_ZERO_CHECK(bpf, reg, ins, type) do { \ + if ((type)(reg)[(ins)->src_reg] == 0) { \ + RTE_BPF_LOG(ERR, \ + "%s(%p): division by 0 at pc: %#zx;\n", \ + __func__, bpf, \ + (uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \ + return 0; \ + } \ +} while (0) + +#define BPF_LD_REG(reg, ins, type) \ + ((reg)[(ins)->dst_reg] = \ + *(type *)(uintptr_t)((reg)[(ins)->src_reg] + (ins)->off)) + +#define BPF_ST_IMM(reg, ins, type) \ + (*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \ + (type)(ins)->imm) + +#define BPF_ST_REG(reg, ins, type) \ + (*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \ + (type)(reg)[(ins)->src_reg]) + +#define BPF_ST_XADD_REG(reg, ins, tp) \ + (rte_atomic##tp##_add((rte_atomic##tp##_t *) \ + (uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off), \ + reg[ins->src_reg])) + +static inline void +bpf_alu_be(uint64_t reg[EBPF_REG_NUM], const struct ebpf_insn *ins) +{ + uint64_t *v; + + v = reg + ins->dst_reg; + switch (ins->imm) { + case 16: + *v = rte_cpu_to_be_16(*v); + break; + case 32: + *v = rte_cpu_to_be_32(*v); + break; + case 64: + *v = rte_cpu_to_be_64(*v); + break; + } +} + +static inline void +bpf_alu_le(uint64_t reg[EBPF_REG_NUM], const struct ebpf_insn *ins) +{ + uint64_t *v; + + v = reg + ins->dst_reg; + switch (ins->imm) { + case 16: + *v = rte_cpu_to_le_16(*v); + break; + case 32: + *v = rte_cpu_to_le_32(*v); + break; + case 64: + *v = rte_cpu_to_le_64(*v); + break; + } +} + +static inline uint64_t +bpf_exec(const struct rte_bpf *bpf, uint64_t reg[EBPF_REG_NUM]) +{ + const struct 
ebpf_insn *ins; + + for (ins = bpf->prm.ins; ; ins++) { + switch (ins->code) { + /* 32 bit ALU IMM operations */ + case (BPF_ALU | BPF_ADD | BPF_K): + BPF_OP_ALU_IMM(reg, ins, +, uint32_t); + break; + case (BPF_ALU | BPF_SUB | BPF_K): + BPF_OP_ALU_IMM(reg, ins, -, uint32_t); + break; + case (BPF_ALU | BPF_AND | BPF_K): + BPF_OP_ALU_IMM(reg, ins, &, uint32_t); + break; + case (BPF_ALU | BPF_OR | BPF_K): + BPF_OP_ALU_IMM(reg, ins, |, uint32_t); + break; + case (BPF_ALU | BPF_LSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, <<, uint32_t); + break; + case (BPF_ALU | BPF_RSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, >>, uint32_t); + break; + case (BPF_ALU | BPF_XOR | BPF_K): + BPF_OP_ALU_IMM(reg, ins, ^, uint32_t); + break; + case (BPF_ALU | BPF_MUL | BPF_K): + BPF_OP_ALU_IMM(reg, ins, *, uint32_t); + break; + case (BPF_ALU | BPF_DIV | BPF_K): + BPF_OP_ALU_IMM(reg, ins, /, uint32_t); + break; + case (BPF_ALU | BPF_MOD | BPF_K): + BPF_OP_ALU_IMM(reg, ins, %, uint32_t); + break; + case (BPF_ALU | EBPF_MOV | BPF_K): + EBPF_MOV_ALU_IMM(reg, ins, uint32_t); + break; + /* 32 bit ALU REG operations */ + case (BPF_ALU | BPF_ADD | BPF_X): + BPF_OP_ALU_REG(reg, ins, +, uint32_t); + break; + case (BPF_ALU | BPF_SUB | BPF_X): + BPF_OP_ALU_REG(reg, ins, -, uint32_t); + break; + case (BPF_ALU | BPF_AND | BPF_X): + BPF_OP_ALU_REG(reg, ins, &, uint32_t); + break; + case (BPF_ALU | BPF_OR | BPF_X): + BPF_OP_ALU_REG(reg, ins, |, uint32_t); + break; + case (BPF_ALU | BPF_LSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, <<, uint32_t); + break; + case (BPF_ALU | BPF_RSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, >>, uint32_t); + break; + case (BPF_ALU | BPF_XOR | BPF_X): + BPF_OP_ALU_REG(reg, ins, ^, uint32_t); + break; + case (BPF_ALU | BPF_MUL | BPF_X): + BPF_OP_ALU_REG(reg, ins, *, uint32_t); + break; + case (BPF_ALU | BPF_DIV | BPF_X): + BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t); + BPF_OP_ALU_REG(reg, ins, /, uint32_t); + break; + case (BPF_ALU | BPF_MOD | BPF_X): + BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t); + BPF_OP_ALU_REG(reg, ins, %, uint32_t); + break; + case (BPF_ALU | EBPF_MOV | BPF_X): + EBPF_MOV_ALU_REG(reg, ins, uint32_t); + break; + case (BPF_ALU | BPF_NEG): + BPF_NEG_ALU(reg, ins, uint32_t); + break; + case (BPF_ALU | EBPF_END | EBPF_TO_BE): + bpf_alu_be(reg, ins); + break; + case (BPF_ALU | EBPF_END | EBPF_TO_LE): + bpf_alu_le(reg, ins); + break; + /* 64 bit ALU IMM operations */ + case (EBPF_ALU64 | BPF_ADD | BPF_K): + BPF_OP_ALU_IMM(reg, ins, +, uint64_t); + break; + case (EBPF_ALU64 | BPF_SUB | BPF_K): + BPF_OP_ALU_IMM(reg, ins, -, uint64_t); + break; + case (EBPF_ALU64 | BPF_AND | BPF_K): + BPF_OP_ALU_IMM(reg, ins, &, uint64_t); + break; + case (EBPF_ALU64 | BPF_OR | BPF_K): + BPF_OP_ALU_IMM(reg, ins, |, uint64_t); + break; + case (EBPF_ALU64 | BPF_LSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, <<, uint64_t); + break; + case (EBPF_ALU64 | BPF_RSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, >>, uint64_t); + break; + case (EBPF_ALU64 | EBPF_ARSH | BPF_K): + BPF_OP_ALU_IMM(reg, ins, >>, int64_t); + break; + case (EBPF_ALU64 | BPF_XOR | BPF_K): + BPF_OP_ALU_IMM(reg, ins, ^, uint64_t); + break; + case (EBPF_ALU64 | BPF_MUL | BPF_K): + BPF_OP_ALU_IMM(reg, ins, *, uint64_t); + break; + case (EBPF_ALU64 | BPF_DIV | BPF_K): + BPF_OP_ALU_IMM(reg, ins, /, uint64_t); + break; + case (EBPF_ALU64 | BPF_MOD | BPF_K): + BPF_OP_ALU_IMM(reg, ins, %, uint64_t); + break; + case (EBPF_ALU64 | EBPF_MOV | BPF_K): + EBPF_MOV_ALU_IMM(reg, ins, uint64_t); + break; + /* 64 bit ALU REG operations */ + case (EBPF_ALU64 | BPF_ADD | BPF_X): + 
BPF_OP_ALU_REG(reg, ins, +, uint64_t); + break; + case (EBPF_ALU64 | BPF_SUB | BPF_X): + BPF_OP_ALU_REG(reg, ins, -, uint64_t); + break; + case (EBPF_ALU64 | BPF_AND | BPF_X): + BPF_OP_ALU_REG(reg, ins, &, uint64_t); + break; + case (EBPF_ALU64 | BPF_OR | BPF_X): + BPF_OP_ALU_REG(reg, ins, |, uint64_t); + break; + case (EBPF_ALU64 | BPF_LSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, <<, uint64_t); + break; + case (EBPF_ALU64 | BPF_RSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, >>, uint64_t); + break; + case (EBPF_ALU64 | EBPF_ARSH | BPF_X): + BPF_OP_ALU_REG(reg, ins, >>, int64_t); + break; + case (EBPF_ALU64 | BPF_XOR | BPF_X): + BPF_OP_ALU_REG(reg, ins, ^, uint64_t); + break; + case (EBPF_ALU64 | BPF_MUL | BPF_X): + BPF_OP_ALU_REG(reg, ins, *, uint64_t); + break; + case (EBPF_ALU64 | BPF_DIV | BPF_X): + BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t); + BPF_OP_ALU_REG(reg, ins, /, uint64_t); + break; + case (EBPF_ALU64 | BPF_MOD | BPF_X): + BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t); + BPF_OP_ALU_REG(reg, ins, %, uint64_t); + break; + case (EBPF_ALU64 | EBPF_MOV | BPF_X): + EBPF_MOV_ALU_REG(reg, ins, uint64_t); + break; + case (EBPF_ALU64 | BPF_NEG): + BPF_NEG_ALU(reg, ins, uint64_t); + break; + /* load instructions */ + case (BPF_LDX | BPF_MEM | BPF_B): + BPF_LD_REG(reg, ins, uint8_t); + break; + case (BPF_LDX | BPF_MEM | BPF_H): + BPF_LD_REG(reg, ins, uint16_t); + break; + case (BPF_LDX | BPF_MEM | BPF_W): + BPF_LD_REG(reg, ins, uint32_t); + break; + case (BPF_LDX | BPF_MEM | EBPF_DW): + BPF_LD_REG(reg, ins, uint64_t); + break; + /* load 64 bit immediate value */ + case (BPF_LD | BPF_IMM | EBPF_DW): + reg[ins->dst_reg] = (uint32_t)ins[0].imm | + (uint64_t)(uint32_t)ins[1].imm << 32; + ins++; + break; + /* store instructions */ + case (BPF_STX | BPF_MEM | BPF_B): + BPF_ST_REG(reg, ins, uint8_t); + break; + case (BPF_STX | BPF_MEM | BPF_H): + BPF_ST_REG(reg, ins, uint16_t); + break; + case (BPF_STX | BPF_MEM | BPF_W): + BPF_ST_REG(reg, ins, uint32_t); + break; + case (BPF_STX | BPF_MEM | EBPF_DW): + BPF_ST_REG(reg, ins, uint64_t); + break; + case (BPF_ST | BPF_MEM | BPF_B): + BPF_ST_IMM(reg, ins, uint8_t); + break; + case (BPF_ST | BPF_MEM | BPF_H): + BPF_ST_IMM(reg, ins, uint16_t); + break; + case (BPF_ST | BPF_MEM | BPF_W): + BPF_ST_IMM(reg, ins, uint32_t); + break; + case (BPF_ST | BPF_MEM | EBPF_DW): + BPF_ST_IMM(reg, ins, uint64_t); + break; + /* atomic add instructions */ + case (BPF_STX | EBPF_XADD | BPF_W): + BPF_ST_XADD_REG(reg, ins, 32); + break; + case (BPF_STX | EBPF_XADD | EBPF_DW): + BPF_ST_XADD_REG(reg, ins, 64); + break; + /* jump instructions */ + case (BPF_JMP | BPF_JA): + BPF_JMP_UNC(ins); + break; + /* jump IMM instructions */ + case (BPF_JMP | BPF_JEQ | BPF_K): + BPF_JMP_CND_IMM(reg, ins, ==, uint64_t); + break; + case (BPF_JMP | EBPF_JNE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, !=, uint64_t); + break; + case (BPF_JMP | BPF_JGT | BPF_K): + BPF_JMP_CND_IMM(reg, ins, >, uint64_t); + break; + case (BPF_JMP | EBPF_JLT | BPF_K): + BPF_JMP_CND_IMM(reg, ins, <, uint64_t); + break; + case (BPF_JMP | BPF_JGE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, >=, uint64_t); + break; + case (BPF_JMP | EBPF_JLE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, <=, uint64_t); + break; + case (BPF_JMP | EBPF_JSGT | BPF_K): + BPF_JMP_CND_IMM(reg, ins, >, int64_t); + break; + case (BPF_JMP | EBPF_JSLT | BPF_K): + BPF_JMP_CND_IMM(reg, ins, <, int64_t); + break; + case (BPF_JMP | EBPF_JSGE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, >=, int64_t); + break; + case (BPF_JMP | EBPF_JSLE | BPF_K): + BPF_JMP_CND_IMM(reg, ins, <=, int64_t); 
+ break; + case (BPF_JMP | BPF_JSET | BPF_K): + BPF_JMP_CND_IMM(reg, ins, &, uint64_t); + break; + /* jump REG instructions */ + case (BPF_JMP | BPF_JEQ | BPF_X): + BPF_JMP_CND_REG(reg, ins, ==, uint64_t); + break; + case (BPF_JMP | EBPF_JNE | BPF_X): + BPF_JMP_CND_REG(reg, ins, !=, uint64_t); + break; + case (BPF_JMP | BPF_JGT | BPF_X): + BPF_JMP_CND_REG(reg, ins, >, uint64_t); + break; + case (BPF_JMP | EBPF_JLT | BPF_X): + BPF_JMP_CND_REG(reg, ins, <, uint64_t); + break; + case (BPF_JMP | BPF_JGE | BPF_X): + BPF_JMP_CND_REG(reg, ins, >=, uint64_t); + break; + case (BPF_JMP | EBPF_JLE | BPF_X): + BPF_JMP_CND_REG(reg, ins, <=, uint64_t); + break; + case (BPF_JMP | EBPF_JSGT | BPF_X): + BPF_JMP_CND_REG(reg, ins, >, int64_t); + break; + case (BPF_JMP | EBPF_JSLT | BPF_X): + BPF_JMP_CND_REG(reg, ins, <, int64_t); + break; + case (BPF_JMP | EBPF_JSGE | BPF_X): + BPF_JMP_CND_REG(reg, ins, >=, int64_t); + break; + case (BPF_JMP | EBPF_JSLE | BPF_X): + BPF_JMP_CND_REG(reg, ins, <=, int64_t); + break; + case (BPF_JMP | BPF_JSET | BPF_X): + BPF_JMP_CND_REG(reg, ins, &, uint64_t); + break; + /* call instructions */ + case (BPF_JMP | EBPF_CALL): + reg[EBPF_REG_0] = bpf->prm.xsym[ins->imm].func.val( + reg[EBPF_REG_1], reg[EBPF_REG_2], + reg[EBPF_REG_3], reg[EBPF_REG_4], + reg[EBPF_REG_5]); + break; + /* return instruction */ + case (BPF_JMP | EBPF_EXIT): + return reg[EBPF_REG_0]; + default: + RTE_BPF_LOG(ERR, + "%s(%p): invalid opcode %#x at pc: %#zx;\n", + __func__, bpf, ins->code, + (uintptr_t)ins - (uintptr_t)bpf->prm.ins); + return 0; + } + } + + /* should never be reached */ + RTE_VERIFY(0); + return 0; +} + +__rte_experimental uint32_t +rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[], + uint32_t num) +{ + uint32_t i; + uint64_t reg[EBPF_REG_NUM]; + uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)]; + + for (i = 0; i != num; i++) { + + reg[EBPF_REG_1] = (uintptr_t)ctx[i]; + reg[EBPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack)); + + rc[i] = bpf_exec(bpf, reg); + } + + return i; +} + +__rte_experimental uint64_t +rte_bpf_exec(const struct rte_bpf *bpf, void *ctx) +{ + uint64_t rc; + + rte_bpf_exec_burst(bpf, &ctx, &rc, 1); + return rc; +} diff --git a/src/spdk/dpdk/lib/librte_bpf/bpf_impl.h b/src/spdk/dpdk/lib/librte_bpf/bpf_impl.h new file mode 100644 index 00000000..b577e2cb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf_impl.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _BPF_H_ +#define _BPF_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_BPF_STACK_SIZE 0x200 + +struct rte_bpf { + struct rte_bpf_prm prm; + struct rte_bpf_jit jit; + size_t sz; + uint32_t stack_sz; +}; + +extern int bpf_validate(struct rte_bpf *bpf); + +extern int bpf_jit(struct rte_bpf *bpf); + +#ifdef RTE_ARCH_X86_64 +extern int bpf_jit_x86(struct rte_bpf *); +#endif + +extern int rte_bpf_logtype; + +#define RTE_BPF_LOG(lvl, fmt, args...) 
\ + rte_log(RTE_LOG_## lvl, rte_bpf_logtype, fmt, ##args) + +static inline size_t +bpf_size(uint32_t bpf_op_sz) +{ + if (bpf_op_sz == BPF_B) + return sizeof(uint8_t); + else if (bpf_op_sz == BPF_H) + return sizeof(uint16_t); + else if (bpf_op_sz == BPF_W) + return sizeof(uint32_t); + else if (bpf_op_sz == EBPF_DW) + return sizeof(uint64_t); + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _BPF_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bpf/bpf_jit_x86.c b/src/spdk/dpdk/lib/librte_bpf/bpf_jit_x86.c new file mode 100644 index 00000000..68ea389f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf_jit_x86.c @@ -0,0 +1,1356 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "bpf_impl.h" + +#define GET_BPF_OP(op) (BPF_OP(op) >> 4) + +enum { + RAX = 0, /* scratch, return value */ + RCX = 1, /* scratch, 4th arg */ + RDX = 2, /* scratch, 3rd arg */ + RBX = 3, /* callee saved */ + RSP = 4, /* stack pointer */ + RBP = 5, /* frame pointer, callee saved */ + RSI = 6, /* scratch, 2nd arg */ + RDI = 7, /* scratch, 1st arg */ + R8 = 8, /* scratch, 5th arg */ + R9 = 9, /* scratch, 6th arg */ + R10 = 10, /* scratch */ + R11 = 11, /* scratch */ + R12 = 12, /* callee saved */ + R13 = 13, /* callee saved */ + R14 = 14, /* callee saved */ + R15 = 15, /* callee saved */ +}; + +#define IS_EXT_REG(r) ((r) >= R8) + +enum { + REX_PREFIX = 0x40, /* fixed value 0100 */ + REX_W = 0x8, /* 64bit operand size */ + REX_R = 0x4, /* extension of the ModRM.reg field */ + REX_X = 0x2, /* extension of the SIB.index field */ + REX_B = 0x1, /* extension of the ModRM.rm field */ +}; + +enum { + MOD_INDIRECT = 0, + MOD_IDISP8 = 1, + MOD_IDISP32 = 2, + MOD_DIRECT = 3, +}; + +enum { + SIB_SCALE_1 = 0, + SIB_SCALE_2 = 1, + SIB_SCALE_4 = 2, + SIB_SCALE_8 = 3, +}; + +/* + * eBPF to x86_64 register mappings. + */ +static const uint32_t ebpf2x86[] = { + [EBPF_REG_0] = RAX, + [EBPF_REG_1] = RDI, + [EBPF_REG_2] = RSI, + [EBPF_REG_3] = RDX, + [EBPF_REG_4] = RCX, + [EBPF_REG_5] = R8, + [EBPF_REG_6] = RBX, + [EBPF_REG_7] = R13, + [EBPF_REG_8] = R14, + [EBPF_REG_9] = R15, + [EBPF_REG_10] = RBP, +}; + +/* + * r10 and r11 are used as a scratch temporary registers. + */ +enum { + REG_DIV_IMM = R9, + REG_TMP0 = R11, + REG_TMP1 = R10, +}; + +/* + * callee saved registers list. + * keep RBP as the last one. + */ +static const uint32_t save_regs[] = {RBX, R12, R13, R14, R15, RBP}; + +struct bpf_jit_state { + uint32_t idx; + size_t sz; + struct { + uint32_t num; + int32_t off; + } exit; + uint32_t reguse; + int32_t *off; + uint8_t *ins; +}; + +#define INUSE(v, r) (((v) >> (r)) & 1) +#define USED(v, r) ((v) |= 1 << (r)) + +union bpf_jit_imm { + uint32_t u32; + uint8_t u8[4]; +}; + +/* + * In many cases for imm8 we can produce shorter code. 
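+ *
+ * Worked example (illustrative, not from the original source): an offset of
+ * -8 satisfies "v == (int8_t)v", so imm_size() below returns 1 and a single
+ * disp8 byte is emitted; an offset of 0x200 does not, so 4 bytes are used.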
+ */ +static size_t +imm_size(int32_t v) +{ + if (v == (int8_t)v) + return sizeof(int8_t); + return sizeof(int32_t); +} + +static void +emit_bytes(struct bpf_jit_state *st, const uint8_t ins[], uint32_t sz) +{ + uint32_t i; + + if (st->ins != NULL) { + for (i = 0; i != sz; i++) + st->ins[st->sz + i] = ins[i]; + } + st->sz += sz; +} + +static void +emit_imm(struct bpf_jit_state *st, const uint32_t imm, uint32_t sz) +{ + union bpf_jit_imm v; + + v.u32 = imm; + emit_bytes(st, v.u8, sz); +} + +/* + * emit REX byte + */ +static void +emit_rex(struct bpf_jit_state *st, uint32_t op, uint32_t reg, uint32_t rm) +{ + uint8_t rex; + + /* mark operand registers as used*/ + USED(st->reguse, reg); + USED(st->reguse, rm); + + rex = 0; + if (BPF_CLASS(op) == EBPF_ALU64 || + op == (BPF_ST | BPF_MEM | EBPF_DW) || + op == (BPF_STX | BPF_MEM | EBPF_DW) || + op == (BPF_STX | EBPF_XADD | EBPF_DW) || + op == (BPF_LD | BPF_IMM | EBPF_DW) || + (BPF_CLASS(op) == BPF_LDX && + BPF_MODE(op) == BPF_MEM && + BPF_SIZE(op) != BPF_W)) + rex |= REX_W; + + if (IS_EXT_REG(reg)) + rex |= REX_R; + + if (IS_EXT_REG(rm)) + rex |= REX_B; + + /* store using SIL, DIL */ + if (op == (BPF_STX | BPF_MEM | BPF_B) && (reg == RDI || reg == RSI)) + rex |= REX_PREFIX; + + if (rex != 0) { + rex |= REX_PREFIX; + emit_bytes(st, &rex, sizeof(rex)); + } +} + +/* + * emit MODRegRM byte + */ +static void +emit_modregrm(struct bpf_jit_state *st, uint32_t mod, uint32_t reg, uint32_t rm) +{ + uint8_t v; + + v = mod << 6 | (reg & 7) << 3 | (rm & 7); + emit_bytes(st, &v, sizeof(v)); +} + +/* + * emit SIB byte + */ +static void +emit_sib(struct bpf_jit_state *st, uint32_t scale, uint32_t idx, uint32_t base) +{ + uint8_t v; + + v = scale << 6 | (idx & 7) << 3 | (base & 7); + emit_bytes(st, &v, sizeof(v)); +} + +/* + * emit xchg %, % + */ +static void +emit_xchg_reg(struct bpf_jit_state *st, uint32_t sreg, uint32_t dreg) +{ + const uint8_t ops = 0x87; + + emit_rex(st, EBPF_ALU64, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); +} + +/* + * emit neg % + */ +static void +emit_neg(struct bpf_jit_state *st, uint32_t op, uint32_t dreg) +{ + const uint8_t ops = 0xF7; + const uint8_t mods = 3; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); +} + +/* + * emit mov %, % + */ +static void +emit_mov_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + const uint8_t ops = 0x89; + + /* if operands are 32-bit, then it can be used to clear upper 32-bit */ + if (sreg != dreg || BPF_CLASS(op) == BPF_ALU) { + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); + } +} + +/* + * emit movzwl %, % + */ +static void +emit_movzwl(struct bpf_jit_state *st, uint32_t sreg, uint32_t dreg) +{ + static const uint8_t ops[] = {0x0F, 0xB7}; + + emit_rex(st, BPF_ALU, sreg, dreg); + emit_bytes(st, ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); +} + +/* + * emit ror , % + */ +static void +emit_ror_imm(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm) +{ + const uint8_t prfx = 0x66; + const uint8_t ops = 0xC1; + const uint8_t mods = 1; + + emit_bytes(st, &prfx, sizeof(prfx)); + emit_rex(st, BPF_ALU, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); + emit_imm(st, imm, imm_size(imm)); +} + +/* + * emit bswap % + */ +static void +emit_be2le_48(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm) +{ + uint32_t rop; + + const uint8_t 
ops = 0x0F; + const uint8_t mods = 1; + + rop = (imm == 64) ? EBPF_ALU64 : BPF_ALU; + emit_rex(st, rop, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); +} + +static void +emit_be2le(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm) +{ + if (imm == 16) { + emit_ror_imm(st, dreg, 8); + emit_movzwl(st, dreg, dreg); + } else + emit_be2le_48(st, dreg, imm); +} + +/* + * In general it is NOP for x86. + * Just clear the upper bits. + */ +static void +emit_le2be(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm) +{ + if (imm == 16) + emit_movzwl(st, dreg, dreg); + else if (imm == 32) + emit_mov_reg(st, BPF_ALU | EBPF_MOV | BPF_X, dreg, dreg); +} + +/* + * emit one of: + * add , % + * and , % + * or , % + * sub , % + * xor , % + */ +static void +emit_alu_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm) +{ + uint8_t mod, opcode; + uint32_t bop, imsz; + + const uint8_t op8 = 0x83; + const uint8_t op32 = 0x81; + static const uint8_t mods[] = { + [GET_BPF_OP(BPF_ADD)] = 0, + [GET_BPF_OP(BPF_AND)] = 4, + [GET_BPF_OP(BPF_OR)] = 1, + [GET_BPF_OP(BPF_SUB)] = 5, + [GET_BPF_OP(BPF_XOR)] = 6, + }; + + bop = GET_BPF_OP(op); + mod = mods[bop]; + + imsz = imm_size(imm); + opcode = (imsz == 1) ? op8 : op32; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &opcode, sizeof(opcode)); + emit_modregrm(st, MOD_DIRECT, mod, dreg); + emit_imm(st, imm, imsz); +} + +/* + * emit one of: + * add %, % + * and %, % + * or %, % + * sub %, % + * xor %, % + */ +static void +emit_alu_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + uint32_t bop; + + static const uint8_t ops[] = { + [GET_BPF_OP(BPF_ADD)] = 0x01, + [GET_BPF_OP(BPF_AND)] = 0x21, + [GET_BPF_OP(BPF_OR)] = 0x09, + [GET_BPF_OP(BPF_SUB)] = 0x29, + [GET_BPF_OP(BPF_XOR)] = 0x31, + }; + + bop = GET_BPF_OP(op); + + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops[bop], sizeof(ops[bop])); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); +} + +static void +emit_shift(struct bpf_jit_state *st, uint32_t op, uint32_t dreg) +{ + uint8_t mod; + uint32_t bop, opx; + + static const uint8_t ops[] = {0xC1, 0xD3}; + static const uint8_t mods[] = { + [GET_BPF_OP(BPF_LSH)] = 4, + [GET_BPF_OP(BPF_RSH)] = 5, + [GET_BPF_OP(EBPF_ARSH)] = 7, + }; + + bop = GET_BPF_OP(op); + mod = mods[bop]; + opx = (BPF_SRC(op) == BPF_X); + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops[opx], sizeof(ops[opx])); + emit_modregrm(st, MOD_DIRECT, mod, dreg); +} + +/* + * emit one of: + * shl , % + * shr , % + * sar , % + */ +static void +emit_shift_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, + uint32_t imm) +{ + emit_shift(st, op, dreg); + emit_imm(st, imm, imm_size(imm)); +} + +/* + * emit one of: + * shl % + * shr % + * sar % + * note that rcx is implicitly used as a source register, so few extra + * instructions for register spillage might be necessary. + */ +static void +emit_shift_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + if (sreg != RCX) + emit_xchg_reg(st, RCX, sreg); + + emit_shift(st, op, (dreg == RCX) ? 
sreg : dreg); + + if (sreg != RCX) + emit_xchg_reg(st, RCX, sreg); +} + +/* + * emit mov , % + */ +static void +emit_mov_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm) +{ + const uint8_t ops = 0xC7; + + if (imm == 0) { + /* replace 'mov 0, %' with 'xor %, %' */ + op = BPF_CLASS(op) | BPF_XOR | BPF_X; + emit_alu_reg(st, op, dreg, dreg); + return; + } + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, 0, dreg); + emit_imm(st, imm, sizeof(imm)); +} + +/* + * emit mov , % + */ +static void +emit_ld_imm64(struct bpf_jit_state *st, uint32_t dreg, uint32_t imm0, + uint32_t imm1) +{ + const uint8_t ops = 0xB8; + + if (imm1 == 0) { + emit_mov_imm(st, EBPF_ALU64 | EBPF_MOV | BPF_K, dreg, imm0); + return; + } + + emit_rex(st, EBPF_ALU64, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, 0, dreg); + + emit_imm(st, imm0, sizeof(imm0)); + emit_imm(st, imm1, sizeof(imm1)); +} + +/* + * note that rax:rdx are implicitly used as source/destination registers, + * so some reg spillage is necessary. + * emit: + * mov %rax, %r11 + * mov %rdx, %r10 + * mov %, %rax + * either: + * mov %, %rdx + * OR + * mov , %rdx + * mul %rdx + * mov %r10, %rdx + * mov %rax, % + * mov %r11, %rax + */ +static void +emit_mul(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, uint32_t dreg, + uint32_t imm) +{ + const uint8_t ops = 0xF7; + const uint8_t mods = 4; + + /* save rax & rdx */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RAX, REG_TMP0); + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RDX, REG_TMP1); + + /* rax = dreg */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, dreg, RAX); + + if (BPF_SRC(op) == BPF_X) + /* rdx = sreg */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, + sreg == RAX ? REG_TMP0 : sreg, RDX); + else + /* rdx = imm */ + emit_mov_imm(st, EBPF_ALU64 | EBPF_MOV | BPF_K, RDX, imm); + + emit_rex(st, op, RAX, RDX); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, RDX); + + if (dreg != RDX) + /* restore rdx */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, REG_TMP1, RDX); + + if (dreg != RAX) { + /* dreg = rax */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RAX, dreg); + /* restore rax */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, REG_TMP0, RAX); + } +} + +/* + * emit mov (%), % + * note that for non 64-bit ops, higher bits have to be cleared. + */ +static void +emit_ld_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, uint32_t dreg, + int32_t ofs) +{ + uint32_t mods, opsz; + const uint8_t op32 = 0x8B; + const uint8_t op16[] = {0x0F, 0xB7}; + const uint8_t op8[] = {0x0F, 0xB6}; + + emit_rex(st, op, dreg, sreg); + + opsz = BPF_SIZE(op); + if (opsz == BPF_B) + emit_bytes(st, op8, sizeof(op8)); + else if (opsz == BPF_H) + emit_bytes(st, op16, sizeof(op16)); + else + emit_bytes(st, &op32, sizeof(op32)); + + mods = (imm_size(ofs) == 1) ? 
MOD_IDISP8 : MOD_IDISP32; + + emit_modregrm(st, mods, dreg, sreg); + if (sreg == RSP || sreg == R12) + emit_sib(st, SIB_SCALE_1, sreg, sreg); + emit_imm(st, ofs, imm_size(ofs)); +} + +/* + * emit one of: + * mov %, (%) + * mov , (%) + */ +static void +emit_st_common(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg, uint32_t imm, int32_t ofs) +{ + uint32_t mods, imsz, opsz, opx; + const uint8_t prfx16 = 0x66; + + /* 8 bit instruction opcodes */ + static const uint8_t op8[] = {0xC6, 0x88}; + + /* 16/32/64 bit instruction opcodes */ + static const uint8_t ops[] = {0xC7, 0x89}; + + /* is the instruction has immediate value or src reg? */ + opx = (BPF_CLASS(op) == BPF_STX); + + opsz = BPF_SIZE(op); + if (opsz == BPF_H) + emit_bytes(st, &prfx16, sizeof(prfx16)); + + emit_rex(st, op, sreg, dreg); + + if (opsz == BPF_B) + emit_bytes(st, &op8[opx], sizeof(op8[opx])); + else + emit_bytes(st, &ops[opx], sizeof(ops[opx])); + + imsz = imm_size(ofs); + mods = (imsz == 1) ? MOD_IDISP8 : MOD_IDISP32; + + emit_modregrm(st, mods, sreg, dreg); + + if (dreg == RSP || dreg == R12) + emit_sib(st, SIB_SCALE_1, dreg, dreg); + + emit_imm(st, ofs, imsz); + + if (opx == 0) { + imsz = RTE_MIN(bpf_size(opsz), sizeof(imm)); + emit_imm(st, imm, imsz); + } +} + +static void +emit_st_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm, + int32_t ofs) +{ + emit_st_common(st, op, 0, dreg, imm, ofs); +} + +static void +emit_st_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, uint32_t dreg, + int32_t ofs) +{ + emit_st_common(st, op, sreg, dreg, 0, ofs); +} + +/* + * emit lock add %, (%) + */ +static void +emit_st_xadd(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg, int32_t ofs) +{ + uint32_t imsz, mods; + + const uint8_t lck = 0xF0; /* lock prefix */ + const uint8_t ops = 0x01; /* add opcode */ + + imsz = imm_size(ofs); + mods = (imsz == 1) ? MOD_IDISP8 : MOD_IDISP32; + + emit_bytes(st, &lck, sizeof(lck)); + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, mods, sreg, dreg); + emit_imm(st, ofs, imsz); +} + +/* + * emit: + * mov , (%rax) + * call *%rax + */ +static void +emit_call(struct bpf_jit_state *st, uintptr_t trg) +{ + const uint8_t ops = 0xFF; + const uint8_t mods = 2; + + emit_ld_imm64(st, RAX, trg, trg >> 32); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, RAX); +} + +/* + * emit jmp + * where 'ofs' is the target offset for the native code. + */ +static void +emit_abs_jmp(struct bpf_jit_state *st, int32_t ofs) +{ + int32_t joff; + uint32_t imsz; + + const uint8_t op8 = 0xEB; + const uint8_t op32 = 0xE9; + + const int32_t sz8 = sizeof(op8) + sizeof(uint8_t); + const int32_t sz32 = sizeof(op32) + sizeof(uint32_t); + + /* max possible jmp instruction size */ + const int32_t iszm = RTE_MAX(sz8, sz32); + + joff = ofs - st->sz; + imsz = RTE_MAX(imm_size(joff), imm_size(joff + iszm)); + + if (imsz == 1) { + emit_bytes(st, &op8, sizeof(op8)); + joff -= sz8; + } else { + emit_bytes(st, &op32, sizeof(op32)); + joff -= sz32; + } + + emit_imm(st, joff, imsz); +} + +/* + * emit jmp + * where 'ofs' is the target offset for the BPF bytecode. 
+ */ +static void +emit_jmp(struct bpf_jit_state *st, int32_t ofs) +{ + emit_abs_jmp(st, st->off[st->idx + ofs]); +} + +/* + * emit one of: + * cmovz %, <%dreg> + * cmovne %, <%dreg> + * cmova %, <%dreg> + * cmovb %, <%dreg> + * cmovae %, <%dreg> + * cmovbe %, <%dreg> + * cmovg %, <%dreg> + * cmovl %, <%dreg> + * cmovge %, <%dreg> + * cmovle %, <%dreg> + */ +static void +emit_movcc_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + uint32_t bop; + + static const uint8_t ops[][2] = { + [GET_BPF_OP(BPF_JEQ)] = {0x0F, 0x44}, /* CMOVZ */ + [GET_BPF_OP(EBPF_JNE)] = {0x0F, 0x45}, /* CMOVNE */ + [GET_BPF_OP(BPF_JGT)] = {0x0F, 0x47}, /* CMOVA */ + [GET_BPF_OP(EBPF_JLT)] = {0x0F, 0x42}, /* CMOVB */ + [GET_BPF_OP(BPF_JGE)] = {0x0F, 0x43}, /* CMOVAE */ + [GET_BPF_OP(EBPF_JLE)] = {0x0F, 0x46}, /* CMOVBE */ + [GET_BPF_OP(EBPF_JSGT)] = {0x0F, 0x4F}, /* CMOVG */ + [GET_BPF_OP(EBPF_JSLT)] = {0x0F, 0x4C}, /* CMOVL */ + [GET_BPF_OP(EBPF_JSGE)] = {0x0F, 0x4D}, /* CMOVGE */ + [GET_BPF_OP(EBPF_JSLE)] = {0x0F, 0x4E}, /* CMOVLE */ + [GET_BPF_OP(BPF_JSET)] = {0x0F, 0x45}, /* CMOVNE */ + }; + + bop = GET_BPF_OP(op); + + emit_rex(st, op, dreg, sreg); + emit_bytes(st, ops[bop], sizeof(ops[bop])); + emit_modregrm(st, MOD_DIRECT, dreg, sreg); +} + +/* + * emit one of: + * je + * jne + * ja + * jb + * jae + * jbe + * jg + * jl + * jge + * jle + * where 'ofs' is the target offset for the native code. + */ +static void +emit_abs_jcc(struct bpf_jit_state *st, uint32_t op, int32_t ofs) +{ + uint32_t bop, imsz; + int32_t joff; + + static const uint8_t op8[] = { + [GET_BPF_OP(BPF_JEQ)] = 0x74, /* JE */ + [GET_BPF_OP(EBPF_JNE)] = 0x75, /* JNE */ + [GET_BPF_OP(BPF_JGT)] = 0x77, /* JA */ + [GET_BPF_OP(EBPF_JLT)] = 0x72, /* JB */ + [GET_BPF_OP(BPF_JGE)] = 0x73, /* JAE */ + [GET_BPF_OP(EBPF_JLE)] = 0x76, /* JBE */ + [GET_BPF_OP(EBPF_JSGT)] = 0x7F, /* JG */ + [GET_BPF_OP(EBPF_JSLT)] = 0x7C, /* JL */ + [GET_BPF_OP(EBPF_JSGE)] = 0x7D, /*JGE */ + [GET_BPF_OP(EBPF_JSLE)] = 0x7E, /* JLE */ + [GET_BPF_OP(BPF_JSET)] = 0x75, /*JNE */ + }; + + static const uint8_t op32[][2] = { + [GET_BPF_OP(BPF_JEQ)] = {0x0F, 0x84}, /* JE */ + [GET_BPF_OP(EBPF_JNE)] = {0x0F, 0x85}, /* JNE */ + [GET_BPF_OP(BPF_JGT)] = {0x0F, 0x87}, /* JA */ + [GET_BPF_OP(EBPF_JLT)] = {0x0F, 0x82}, /* JB */ + [GET_BPF_OP(BPF_JGE)] = {0x0F, 0x83}, /* JAE */ + [GET_BPF_OP(EBPF_JLE)] = {0x0F, 0x86}, /* JBE */ + [GET_BPF_OP(EBPF_JSGT)] = {0x0F, 0x8F}, /* JG */ + [GET_BPF_OP(EBPF_JSLT)] = {0x0F, 0x8C}, /* JL */ + [GET_BPF_OP(EBPF_JSGE)] = {0x0F, 0x8D}, /*JGE */ + [GET_BPF_OP(EBPF_JSLE)] = {0x0F, 0x8E}, /* JLE */ + [GET_BPF_OP(BPF_JSET)] = {0x0F, 0x85}, /*JNE */ + }; + + const int32_t sz8 = sizeof(op8[0]) + sizeof(uint8_t); + const int32_t sz32 = sizeof(op32[0]) + sizeof(uint32_t); + + /* max possible jcc instruction size */ + const int32_t iszm = RTE_MAX(sz8, sz32); + + joff = ofs - st->sz; + imsz = RTE_MAX(imm_size(joff), imm_size(joff + iszm)); + + bop = GET_BPF_OP(op); + + if (imsz == 1) { + emit_bytes(st, &op8[bop], sizeof(op8[bop])); + joff -= sz8; + } else { + emit_bytes(st, op32[bop], sizeof(op32[bop])); + joff -= sz32; + } + + emit_imm(st, joff, imsz); +} + +/* + * emit one of: + * je + * jne + * ja + * jb + * jae + * jbe + * jg + * jl + * jge + * jle + * where 'ofs' is the target offset for the BPF bytecode. 
+ */ +static void +emit_jcc(struct bpf_jit_state *st, uint32_t op, int32_t ofs) +{ + emit_abs_jcc(st, op, st->off[st->idx + ofs]); +} + + +/* + * emit cmp , % + */ +static void +emit_cmp_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm) +{ + uint8_t ops; + uint32_t imsz; + + const uint8_t op8 = 0x83; + const uint8_t op32 = 0x81; + const uint8_t mods = 7; + + imsz = imm_size(imm); + ops = (imsz == 1) ? op8 : op32; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); + emit_imm(st, imm, imsz); +} + +/* + * emit test , % + */ +static void +emit_tst_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, uint32_t imm) +{ + const uint8_t ops = 0xF7; + const uint8_t mods = 0; + + emit_rex(st, op, 0, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, dreg); + emit_imm(st, imm, imm_size(imm)); +} + +static void +emit_jcc_imm(struct bpf_jit_state *st, uint32_t op, uint32_t dreg, + uint32_t imm, int32_t ofs) +{ + if (BPF_OP(op) == BPF_JSET) + emit_tst_imm(st, EBPF_ALU64, dreg, imm); + else + emit_cmp_imm(st, EBPF_ALU64, dreg, imm); + + emit_jcc(st, op, ofs); +} + +/* + * emit test %, % + */ +static void +emit_tst_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + const uint8_t ops = 0x85; + + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); +} + +/* + * emit cmp %, % + */ +static void +emit_cmp_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg) +{ + const uint8_t ops = 0x39; + + emit_rex(st, op, sreg, dreg); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, sreg, dreg); + +} + +static void +emit_jcc_reg(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, + uint32_t dreg, int32_t ofs) +{ + if (BPF_OP(op) == BPF_JSET) + emit_tst_reg(st, EBPF_ALU64, sreg, dreg); + else + emit_cmp_reg(st, EBPF_ALU64, sreg, dreg); + + emit_jcc(st, op, ofs); +} + +/* + * note that rax:rdx are implicitly used as source/destination registers, + * so some reg spillage is necessary. 
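+ * when the divisor comes from a register, a runtime zero check is emitted first: on zero the return value is set to 0 and control jumps to the epilog.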
+ * emit: + * mov %rax, %r11 + * mov %rdx, %r10 + * mov %, %rax + * xor %rdx, %rdx + * for divisor as immediate value: + * mov , %r9 + * div % + * mov %r10, %rdx + * mov %rax, % + * mov %r11, %rax + * either: + * mov %rax, % + * OR + * mov %rdx, % + * mov %r11, %rax + * mov %r10, %rdx + */ +static void +emit_div(struct bpf_jit_state *st, uint32_t op, uint32_t sreg, uint32_t dreg, + uint32_t imm) +{ + uint32_t sr; + + const uint8_t ops = 0xF7; + const uint8_t mods = 6; + + if (BPF_SRC(op) == BPF_X) { + + /* check that src divisor is not zero */ + emit_tst_reg(st, BPF_CLASS(op), sreg, sreg); + + /* exit with return value zero */ + emit_movcc_reg(st, BPF_CLASS(op) | BPF_JEQ | BPF_X, sreg, RAX); + emit_abs_jcc(st, BPF_JMP | BPF_JEQ | BPF_K, st->exit.off); + } + + /* save rax & rdx */ + if (dreg != RAX) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RAX, REG_TMP0); + if (dreg != RDX) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RDX, REG_TMP1); + + /* fill rax & rdx */ + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, dreg, RAX); + emit_mov_imm(st, EBPF_ALU64 | EBPF_MOV | BPF_K, RDX, 0); + + if (BPF_SRC(op) == BPF_X) { + sr = sreg; + if (sr == RAX) + sr = REG_TMP0; + else if (sr == RDX) + sr = REG_TMP1; + } else { + sr = REG_DIV_IMM; + emit_mov_imm(st, EBPF_ALU64 | EBPF_MOV | BPF_K, sr, imm); + } + + emit_rex(st, op, 0, sr); + emit_bytes(st, &ops, sizeof(ops)); + emit_modregrm(st, MOD_DIRECT, mods, sr); + + if (BPF_OP(op) == BPF_DIV) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RAX, dreg); + else + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RDX, dreg); + + if (dreg != RAX) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, REG_TMP0, RAX); + if (dreg != RDX) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, REG_TMP1, RDX); +} + +static void +emit_prolog(struct bpf_jit_state *st, int32_t stack_size) +{ + uint32_t i; + int32_t spil, ofs; + + spil = 0; + for (i = 0; i != RTE_DIM(save_regs); i++) + spil += INUSE(st->reguse, save_regs[i]); + + /* we can avoid touching the stack at all */ + if (spil == 0) + return; + + + emit_alu_imm(st, EBPF_ALU64 | BPF_SUB | BPF_K, RSP, + spil * sizeof(uint64_t)); + + ofs = 0; + for (i = 0; i != RTE_DIM(save_regs); i++) { + if (INUSE(st->reguse, save_regs[i]) != 0) { + emit_st_reg(st, BPF_STX | BPF_MEM | EBPF_DW, + save_regs[i], RSP, ofs); + ofs += sizeof(uint64_t); + } + } + + if (INUSE(st->reguse, RBP) != 0) { + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RSP, RBP); + emit_alu_imm(st, EBPF_ALU64 | BPF_SUB | BPF_K, RSP, stack_size); + } +} + +/* + * emit ret + */ +static void +emit_ret(struct bpf_jit_state *st) +{ + const uint8_t ops = 0xC3; + + emit_bytes(st, &ops, sizeof(ops)); +} + +static void +emit_epilog(struct bpf_jit_state *st) +{ + uint32_t i; + int32_t spil, ofs; + + /* if we allready have an epilog generate a jump to it */ + if (st->exit.num++ != 0) { + emit_abs_jmp(st, st->exit.off); + return; + } + + /* store offset of epilog block */ + st->exit.off = st->sz; + + spil = 0; + for (i = 0; i != RTE_DIM(save_regs); i++) + spil += INUSE(st->reguse, save_regs[i]); + + if (spil != 0) { + + if (INUSE(st->reguse, RBP) != 0) + emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, + RBP, RSP); + + ofs = 0; + for (i = 0; i != RTE_DIM(save_regs); i++) { + if (INUSE(st->reguse, save_regs[i]) != 0) { + emit_ld_reg(st, BPF_LDX | BPF_MEM | EBPF_DW, + RSP, save_regs[i], ofs); + ofs += sizeof(uint64_t); + } + } + + emit_alu_imm(st, EBPF_ALU64 | BPF_ADD | BPF_K, RSP, + spil * sizeof(uint64_t)); + } + + emit_ret(st); +} + +/* + * walk through bpf code and 
translate them x86_64 one. + */ +static int +emit(struct bpf_jit_state *st, const struct rte_bpf *bpf) +{ + uint32_t i, dr, op, sr; + const struct ebpf_insn *ins; + + /* reset state fields */ + st->sz = 0; + st->exit.num = 0; + + emit_prolog(st, bpf->stack_sz); + + for (i = 0; i != bpf->prm.nb_ins; i++) { + + st->idx = i; + st->off[i] = st->sz; + + ins = bpf->prm.ins + i; + + dr = ebpf2x86[ins->dst_reg]; + sr = ebpf2x86[ins->src_reg]; + op = ins->code; + + switch (op) { + /* 32 bit ALU IMM operations */ + case (BPF_ALU | BPF_ADD | BPF_K): + case (BPF_ALU | BPF_SUB | BPF_K): + case (BPF_ALU | BPF_AND | BPF_K): + case (BPF_ALU | BPF_OR | BPF_K): + case (BPF_ALU | BPF_XOR | BPF_K): + emit_alu_imm(st, op, dr, ins->imm); + break; + case (BPF_ALU | BPF_LSH | BPF_K): + case (BPF_ALU | BPF_RSH | BPF_K): + emit_shift_imm(st, op, dr, ins->imm); + break; + case (BPF_ALU | EBPF_MOV | BPF_K): + emit_mov_imm(st, op, dr, ins->imm); + break; + /* 32 bit ALU REG operations */ + case (BPF_ALU | BPF_ADD | BPF_X): + case (BPF_ALU | BPF_SUB | BPF_X): + case (BPF_ALU | BPF_AND | BPF_X): + case (BPF_ALU | BPF_OR | BPF_X): + case (BPF_ALU | BPF_XOR | BPF_X): + emit_alu_reg(st, op, sr, dr); + break; + case (BPF_ALU | BPF_LSH | BPF_X): + case (BPF_ALU | BPF_RSH | BPF_X): + emit_shift_reg(st, op, sr, dr); + break; + case (BPF_ALU | EBPF_MOV | BPF_X): + emit_mov_reg(st, op, sr, dr); + break; + case (BPF_ALU | BPF_NEG): + emit_neg(st, op, dr); + break; + case (BPF_ALU | EBPF_END | EBPF_TO_BE): + emit_be2le(st, dr, ins->imm); + break; + case (BPF_ALU | EBPF_END | EBPF_TO_LE): + emit_le2be(st, dr, ins->imm); + break; + /* 64 bit ALU IMM operations */ + case (EBPF_ALU64 | BPF_ADD | BPF_K): + case (EBPF_ALU64 | BPF_SUB | BPF_K): + case (EBPF_ALU64 | BPF_AND | BPF_K): + case (EBPF_ALU64 | BPF_OR | BPF_K): + case (EBPF_ALU64 | BPF_XOR | BPF_K): + emit_alu_imm(st, op, dr, ins->imm); + break; + case (EBPF_ALU64 | BPF_LSH | BPF_K): + case (EBPF_ALU64 | BPF_RSH | BPF_K): + case (EBPF_ALU64 | EBPF_ARSH | BPF_K): + emit_shift_imm(st, op, dr, ins->imm); + break; + case (EBPF_ALU64 | EBPF_MOV | BPF_K): + emit_mov_imm(st, op, dr, ins->imm); + break; + /* 64 bit ALU REG operations */ + case (EBPF_ALU64 | BPF_ADD | BPF_X): + case (EBPF_ALU64 | BPF_SUB | BPF_X): + case (EBPF_ALU64 | BPF_AND | BPF_X): + case (EBPF_ALU64 | BPF_OR | BPF_X): + case (EBPF_ALU64 | BPF_XOR | BPF_X): + emit_alu_reg(st, op, sr, dr); + break; + case (EBPF_ALU64 | BPF_LSH | BPF_X): + case (EBPF_ALU64 | BPF_RSH | BPF_X): + case (EBPF_ALU64 | EBPF_ARSH | BPF_X): + emit_shift_reg(st, op, sr, dr); + break; + case (EBPF_ALU64 | EBPF_MOV | BPF_X): + emit_mov_reg(st, op, sr, dr); + break; + case (EBPF_ALU64 | BPF_NEG): + emit_neg(st, op, dr); + break; + /* multiply instructions */ + case (BPF_ALU | BPF_MUL | BPF_K): + case (BPF_ALU | BPF_MUL | BPF_X): + case (EBPF_ALU64 | BPF_MUL | BPF_K): + case (EBPF_ALU64 | BPF_MUL | BPF_X): + emit_mul(st, op, sr, dr, ins->imm); + break; + /* divide instructions */ + case (BPF_ALU | BPF_DIV | BPF_K): + case (BPF_ALU | BPF_MOD | BPF_K): + case (BPF_ALU | BPF_DIV | BPF_X): + case (BPF_ALU | BPF_MOD | BPF_X): + case (EBPF_ALU64 | BPF_DIV | BPF_K): + case (EBPF_ALU64 | BPF_MOD | BPF_K): + case (EBPF_ALU64 | BPF_DIV | BPF_X): + case (EBPF_ALU64 | BPF_MOD | BPF_X): + emit_div(st, op, sr, dr, ins->imm); + break; + /* load instructions */ + case (BPF_LDX | BPF_MEM | BPF_B): + case (BPF_LDX | BPF_MEM | BPF_H): + case (BPF_LDX | BPF_MEM | BPF_W): + case (BPF_LDX | BPF_MEM | EBPF_DW): + emit_ld_reg(st, op, sr, dr, ins->off); + break; + /* load 
64 bit immediate value */ + case (BPF_LD | BPF_IMM | EBPF_DW): + emit_ld_imm64(st, dr, ins[0].imm, ins[1].imm); + i++; + break; + /* store instructions */ + case (BPF_STX | BPF_MEM | BPF_B): + case (BPF_STX | BPF_MEM | BPF_H): + case (BPF_STX | BPF_MEM | BPF_W): + case (BPF_STX | BPF_MEM | EBPF_DW): + emit_st_reg(st, op, sr, dr, ins->off); + break; + case (BPF_ST | BPF_MEM | BPF_B): + case (BPF_ST | BPF_MEM | BPF_H): + case (BPF_ST | BPF_MEM | BPF_W): + case (BPF_ST | BPF_MEM | EBPF_DW): + emit_st_imm(st, op, dr, ins->imm, ins->off); + break; + /* atomic add instructions */ + case (BPF_STX | EBPF_XADD | BPF_W): + case (BPF_STX | EBPF_XADD | EBPF_DW): + emit_st_xadd(st, op, sr, dr, ins->off); + break; + /* jump instructions */ + case (BPF_JMP | BPF_JA): + emit_jmp(st, ins->off + 1); + break; + /* jump IMM instructions */ + case (BPF_JMP | BPF_JEQ | BPF_K): + case (BPF_JMP | EBPF_JNE | BPF_K): + case (BPF_JMP | BPF_JGT | BPF_K): + case (BPF_JMP | EBPF_JLT | BPF_K): + case (BPF_JMP | BPF_JGE | BPF_K): + case (BPF_JMP | EBPF_JLE | BPF_K): + case (BPF_JMP | EBPF_JSGT | BPF_K): + case (BPF_JMP | EBPF_JSLT | BPF_K): + case (BPF_JMP | EBPF_JSGE | BPF_K): + case (BPF_JMP | EBPF_JSLE | BPF_K): + case (BPF_JMP | BPF_JSET | BPF_K): + emit_jcc_imm(st, op, dr, ins->imm, ins->off + 1); + break; + /* jump REG instructions */ + case (BPF_JMP | BPF_JEQ | BPF_X): + case (BPF_JMP | EBPF_JNE | BPF_X): + case (BPF_JMP | BPF_JGT | BPF_X): + case (BPF_JMP | EBPF_JLT | BPF_X): + case (BPF_JMP | BPF_JGE | BPF_X): + case (BPF_JMP | EBPF_JLE | BPF_X): + case (BPF_JMP | EBPF_JSGT | BPF_X): + case (BPF_JMP | EBPF_JSLT | BPF_X): + case (BPF_JMP | EBPF_JSGE | BPF_X): + case (BPF_JMP | EBPF_JSLE | BPF_X): + case (BPF_JMP | BPF_JSET | BPF_X): + emit_jcc_reg(st, op, sr, dr, ins->off + 1); + break; + /* call instructions */ + case (BPF_JMP | EBPF_CALL): + emit_call(st, + (uintptr_t)bpf->prm.xsym[ins->imm].func.val); + break; + /* return instruction */ + case (BPF_JMP | EBPF_EXIT): + emit_epilog(st); + break; + default: + RTE_BPF_LOG(ERR, + "%s(%p): invalid opcode %#x at pc: %u;\n", + __func__, bpf, ins->code, i); + return -EINVAL; + } + } + + return 0; +} + +/* + * produce a native ISA version of the given BPF code. + */ +int +bpf_jit_x86(struct rte_bpf *bpf) +{ + int32_t rc; + uint32_t i; + size_t sz; + struct bpf_jit_state st; + + /* init state */ + memset(&st, 0, sizeof(st)); + st.off = malloc(bpf->prm.nb_ins * sizeof(st.off[0])); + if (st.off == NULL) + return -ENOMEM; + + /* fill with fake offsets */ + st.exit.off = INT32_MAX; + for (i = 0; i != bpf->prm.nb_ins; i++) + st.off[i] = INT32_MAX; + + /* + * dry runs, used to calculate total code size and valid jump offsets. 
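+ * each pass re-emits the code using the offsets recorded by the previous pass;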
+ * stop when we get minimal possible size + */ + do { + sz = st.sz; + rc = emit(&st, bpf); + } while (rc == 0 && sz != st.sz); + + if (rc == 0) { + + /* allocate memory needed */ + st.ins = mmap(NULL, st.sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (st.ins == MAP_FAILED) + rc = -ENOMEM; + else + /* generate code */ + rc = emit(&st, bpf); + } + + if (rc == 0 && mprotect(st.ins, st.sz, PROT_READ | PROT_EXEC) != 0) + rc = -ENOMEM; + + if (rc != 0) + munmap(st.ins, st.sz); + else { + bpf->jit.func = (void *)st.ins; + bpf->jit.sz = st.sz; + } + + free(st.off); + return rc; +} diff --git a/src/spdk/dpdk/lib/librte_bpf/bpf_load.c b/src/spdk/dpdk/lib/librte_bpf/bpf_load.c new file mode 100644 index 00000000..2b84fe72 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf_load.c @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_impl.h" + +static struct rte_bpf * +bpf_load(const struct rte_bpf_prm *prm) +{ + uint8_t *buf; + struct rte_bpf *bpf; + size_t sz, bsz, insz, xsz; + + xsz = prm->nb_xsym * sizeof(prm->xsym[0]); + insz = prm->nb_ins * sizeof(prm->ins[0]); + bsz = sizeof(bpf[0]); + sz = insz + xsz + bsz; + + buf = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf == MAP_FAILED) + return NULL; + + bpf = (void *)buf; + bpf->sz = sz; + + memcpy(&bpf->prm, prm, sizeof(bpf->prm)); + + memcpy(buf + bsz, prm->xsym, xsz); + memcpy(buf + bsz + xsz, prm->ins, insz); + + bpf->prm.xsym = (void *)(buf + bsz); + bpf->prm.ins = (void *)(buf + bsz + xsz); + + return bpf; +} + +/* + * Check that user provided external symbol. 
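+ * A name must be present; variables need a defined descriptor type; for functions the argument count, argument types and return value info are checked.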
+ */ +static int +bpf_check_xsym(const struct rte_bpf_xsym *xsym) +{ + uint32_t i; + + if (xsym->name == NULL) + return -EINVAL; + + if (xsym->type == RTE_BPF_XTYPE_VAR) { + if (xsym->var.desc.type == RTE_BPF_ARG_UNDEF) + return -EINVAL; + } else if (xsym->type == RTE_BPF_XTYPE_FUNC) { + + if (xsym->func.nb_args > EBPF_FUNC_MAX_ARGS) + return -EINVAL; + + /* check function arguments */ + for (i = 0; i != xsym->func.nb_args; i++) { + if (xsym->func.args[i].type == RTE_BPF_ARG_UNDEF) + return -EINVAL; + } + + /* check return value info */ + if (xsym->func.ret.type != RTE_BPF_ARG_UNDEF && + xsym->func.ret.size == 0) + return -EINVAL; + } else + return -EINVAL; + + return 0; +} + +__rte_experimental struct rte_bpf * +rte_bpf_load(const struct rte_bpf_prm *prm) +{ + struct rte_bpf *bpf; + int32_t rc; + uint32_t i; + + if (prm == NULL || prm->ins == NULL || + (prm->nb_xsym != 0 && prm->xsym == NULL)) { + rte_errno = EINVAL; + return NULL; + } + + rc = 0; + for (i = 0; i != prm->nb_xsym && rc == 0; i++) + rc = bpf_check_xsym(prm->xsym + i); + + if (rc != 0) { + rte_errno = -rc; + RTE_BPF_LOG(ERR, "%s: %d-th xsym is invalid\n", __func__, i); + return NULL; + } + + bpf = bpf_load(prm); + if (bpf == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + rc = bpf_validate(bpf); + if (rc == 0) { + bpf_jit(bpf); + if (mprotect(bpf, bpf->sz, PROT_READ) != 0) + rc = -ENOMEM; + } + + if (rc != 0) { + rte_bpf_destroy(bpf); + rte_errno = -rc; + return NULL; + } + + return bpf; +} + +__rte_experimental __attribute__ ((weak)) struct rte_bpf * +rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname, + const char *sname) +{ + if (prm == NULL || fname == NULL || sname == NULL) { + rte_errno = EINVAL; + return NULL; + } + + RTE_BPF_LOG(ERR, "%s() is not supported with current config\n" + "rebuild with libelf installed\n", + __func__); + rte_errno = ENOTSUP; + return NULL; +} diff --git a/src/spdk/dpdk/lib/librte_bpf/bpf_load_elf.c b/src/spdk/dpdk/lib/librte_bpf/bpf_load_elf.c new file mode 100644 index 00000000..96d3630f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf_load_elf.c @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_impl.h" + +/* To overcome compatibility issue */ +#ifndef EM_BPF +#define EM_BPF 247 +#endif + +static uint32_t +bpf_find_xsym(const char *sn, enum rte_bpf_xtype type, + const struct rte_bpf_xsym fp[], uint32_t fn) +{ + uint32_t i; + + if (sn == NULL || fp == NULL) + return UINT32_MAX; + + for (i = 0; i != fn; i++) { + if (fp[i].type == type && strcmp(sn, fp[i].name) == 0) + break; + } + + return (i != fn) ? 
i : UINT32_MAX; +} + +/* + * update BPF code at offset *ofs* with a proper address(index) for external + * symbol *sn* + */ +static int +resolve_xsym(const char *sn, size_t ofs, struct ebpf_insn *ins, size_t ins_sz, + const struct rte_bpf_prm *prm) +{ + uint32_t idx, fidx; + enum rte_bpf_xtype type; + + if (ofs % sizeof(ins[0]) != 0 || ofs >= ins_sz) + return -EINVAL; + + idx = ofs / sizeof(ins[0]); + if (ins[idx].code == (BPF_JMP | EBPF_CALL)) + type = RTE_BPF_XTYPE_FUNC; + else if (ins[idx].code == (BPF_LD | BPF_IMM | EBPF_DW) && + ofs < ins_sz - sizeof(ins[idx])) + type = RTE_BPF_XTYPE_VAR; + else + return -EINVAL; + + fidx = bpf_find_xsym(sn, type, prm->xsym, prm->nb_xsym); + if (fidx == UINT32_MAX) + return -ENOENT; + + /* for function we just need an index in our xsym table */ + if (type == RTE_BPF_XTYPE_FUNC) + ins[idx].imm = fidx; + /* for variable we need to store its absolute address */ + else { + ins[idx].imm = (uintptr_t)prm->xsym[fidx].var.val; + ins[idx + 1].imm = + (uint64_t)(uintptr_t)prm->xsym[fidx].var.val >> 32; + } + + return 0; +} + +static int +check_elf_header(const Elf64_Ehdr *eh) +{ + const char *err; + + err = NULL; + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + if (eh->e_ident[EI_DATA] != ELFDATA2LSB) +#else + if (eh->e_ident[EI_DATA] != ELFDATA2MSB) +#endif + err = "not native byte order"; + else if (eh->e_ident[EI_OSABI] != ELFOSABI_NONE) + err = "unexpected OS ABI"; + else if (eh->e_type != ET_REL) + err = "unexpected ELF type"; + else if (eh->e_machine != EM_NONE && eh->e_machine != EM_BPF) + err = "unexpected machine type"; + + if (err != NULL) { + RTE_BPF_LOG(ERR, "%s(): %s\n", __func__, err); + return -EINVAL; + } + + return 0; +} + +/* + * helper function, find executable section by name. + */ +static int +find_elf_code(Elf *elf, const char *section, Elf_Data **psd, size_t *pidx) +{ + Elf_Scn *sc; + const Elf64_Ehdr *eh; + const Elf64_Shdr *sh; + Elf_Data *sd; + const char *sn; + int32_t rc; + + eh = elf64_getehdr(elf); + if (eh == NULL) { + rc = elf_errno(); + RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n", + __func__, elf, section, rc, elf_errmsg(rc)); + return -EINVAL; + } + + if (check_elf_header(eh) != 0) + return -EINVAL; + + /* find given section by name */ + for (sc = elf_nextscn(elf, NULL); sc != NULL; + sc = elf_nextscn(elf, sc)) { + sh = elf64_getshdr(sc); + sn = elf_strptr(elf, eh->e_shstrndx, sh->sh_name); + if (sn != NULL && strcmp(section, sn) == 0 && + sh->sh_type == SHT_PROGBITS && + sh->sh_flags == (SHF_ALLOC | SHF_EXECINSTR)) + break; + } + + sd = elf_getdata(sc, NULL); + if (sd == NULL || sd->d_size == 0 || + sd->d_size % sizeof(struct ebpf_insn) != 0) { + rc = elf_errno(); + RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n", + __func__, elf, section, rc, elf_errmsg(rc)); + return -EINVAL; + } + + *psd = sd; + *pidx = elf_ndxscn(sc); + return 0; +} + +/* + * helper function to process data from relocation table. 
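+ * for every Elf64_Rel entry the symbol name is taken from the symtab and resolve_xsym() patches the instruction at the relocation offset.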
+ */ +static int +process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz, + struct ebpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm *prm) +{ + int32_t rc; + uint32_t i, n; + size_t ofs, sym; + const char *sn; + const Elf64_Ehdr *eh; + Elf_Scn *sc; + const Elf_Data *sd; + Elf64_Sym *sm; + + eh = elf64_getehdr(elf); + + /* get symtable by section index */ + sc = elf_getscn(elf, sym_idx); + sd = elf_getdata(sc, NULL); + if (sd == NULL) + return -EINVAL; + sm = sd->d_buf; + + n = re_sz / sizeof(re[0]); + for (i = 0; i != n; i++) { + + ofs = re[i].r_offset; + + /* retrieve index in the symtable */ + sym = ELF64_R_SYM(re[i].r_info); + if (sym * sizeof(sm[0]) >= sd->d_size) + return -EINVAL; + + sn = elf_strptr(elf, eh->e_shstrndx, sm[sym].st_name); + + rc = resolve_xsym(sn, ofs, ins, ins_sz, prm); + if (rc != 0) { + RTE_BPF_LOG(ERR, + "resolve_xsym(%s, %zu) error code: %d\n", + sn, ofs, rc); + return rc; + } + } + + return 0; +} + +/* + * helper function, find relocation information (if any) + * and update bpf code. + */ +static int +elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx, + const struct rte_bpf_prm *prm) +{ + Elf64_Rel *re; + Elf_Scn *sc; + const Elf64_Shdr *sh; + const Elf_Data *sd; + int32_t rc; + + rc = 0; + + /* walk through all sections */ + for (sc = elf_nextscn(elf, NULL); sc != NULL && rc == 0; + sc = elf_nextscn(elf, sc)) { + + sh = elf64_getshdr(sc); + + /* relocation data for our code section */ + if (sh->sh_type == SHT_REL && sh->sh_info == sidx) { + sd = elf_getdata(sc, NULL); + if (sd == NULL || sd->d_size == 0 || + sd->d_size % sizeof(re[0]) != 0) + return -EINVAL; + rc = process_reloc(elf, sh->sh_link, + sd->d_buf, sd->d_size, ed->d_buf, ed->d_size, + prm); + } + } + + return rc; +} + +static struct rte_bpf * +bpf_load_elf(const struct rte_bpf_prm *prm, int32_t fd, const char *section) +{ + Elf *elf; + Elf_Data *sd; + size_t sidx; + int32_t rc; + struct rte_bpf *bpf; + struct rte_bpf_prm np; + + elf_version(EV_CURRENT); + elf = elf_begin(fd, ELF_C_READ, NULL); + + rc = find_elf_code(elf, section, &sd, &sidx); + if (rc == 0) + rc = elf_reloc_code(elf, sd, sidx, prm); + + if (rc == 0) { + np = prm[0]; + np.ins = sd->d_buf; + np.nb_ins = sd->d_size / sizeof(struct ebpf_insn); + bpf = rte_bpf_load(&np); + } else { + bpf = NULL; + rte_errno = -rc; + } + + elf_end(elf); + return bpf; +} + +__rte_experimental struct rte_bpf * +rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname, + const char *sname) +{ + int32_t fd, rc; + struct rte_bpf *bpf; + + if (prm == NULL || fname == NULL || sname == NULL) { + rte_errno = EINVAL; + return NULL; + } + + fd = open(fname, O_RDONLY); + if (fd < 0) { + rc = errno; + RTE_BPF_LOG(ERR, "%s(%s) error code: %d(%s)\n", + __func__, fname, rc, strerror(rc)); + rte_errno = EINVAL; + return NULL; + } + + bpf = bpf_load_elf(prm, fd, sname); + close(fd); + + if (bpf == NULL) { + RTE_BPF_LOG(ERR, + "%s(fname=\"%s\", sname=\"%s\") failed, " + "error code: %d\n", + __func__, fname, sname, rte_errno); + return NULL; + } + + RTE_BPF_LOG(INFO, "%s(fname=\"%s\", sname=\"%s\") " + "successfully creates %p(jit={.func=%p,.sz=%zu});\n", + __func__, fname, sname, bpf, bpf->jit.func, bpf->jit.sz); + return bpf; +} diff --git a/src/spdk/dpdk/lib/librte_bpf/bpf_pkt.c b/src/spdk/dpdk/lib/librte_bpf/bpf_pkt.c new file mode 100644 index 00000000..ab9daa52 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf_pkt.c @@ -0,0 +1,605 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include 
+#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "bpf_impl.h" + +/* + * information about installed BPF rx/tx callback + */ + +struct bpf_eth_cbi { + /* used by both data & control path */ + uint32_t use; /*usage counter */ + const struct rte_eth_rxtx_callback *cb; /* callback handle */ + struct rte_bpf *bpf; + struct rte_bpf_jit jit; + /* used by control path only */ + LIST_ENTRY(bpf_eth_cbi) link; + uint16_t port; + uint16_t queue; +} __rte_cache_aligned; + +/* + * Odd number means that callback is used by datapath. + * Even number means that callback is not used by datapath. + */ +#define BPF_ETH_CBI_INUSE 1 + +/* + * List to manage RX/TX installed callbacks. + */ +LIST_HEAD(bpf_eth_cbi_list, bpf_eth_cbi); + +enum { + BPF_ETH_RX, + BPF_ETH_TX, + BPF_ETH_NUM, +}; + +/* + * information about all installed BPF rx/tx callbacks + */ +struct bpf_eth_cbh { + rte_spinlock_t lock; + struct bpf_eth_cbi_list list; + uint32_t type; +}; + +static struct bpf_eth_cbh rx_cbh = { + .lock = RTE_SPINLOCK_INITIALIZER, + .list = LIST_HEAD_INITIALIZER(list), + .type = BPF_ETH_RX, +}; + +static struct bpf_eth_cbh tx_cbh = { + .lock = RTE_SPINLOCK_INITIALIZER, + .list = LIST_HEAD_INITIALIZER(list), + .type = BPF_ETH_TX, +}; + +/* + * Marks given callback as used by datapath. + */ +static __rte_always_inline void +bpf_eth_cbi_inuse(struct bpf_eth_cbi *cbi) +{ + cbi->use++; + /* make sure no store/load reordering could happen */ + rte_smp_mb(); +} + +/* + * Marks given callback list as not used by datapath. + */ +static __rte_always_inline void +bpf_eth_cbi_unuse(struct bpf_eth_cbi *cbi) +{ + /* make sure all previous loads are completed */ + rte_smp_rmb(); + cbi->use++; +} + +/* + * Waits till datapath finished using given callback. + */ +static void +bpf_eth_cbi_wait(const struct bpf_eth_cbi *cbi) +{ + uint32_t nuse, puse; + + /* make sure all previous loads and stores are completed */ + rte_smp_mb(); + + puse = cbi->use; + + /* in use, busy wait till current RX/TX iteration is finished */ + if ((puse & BPF_ETH_CBI_INUSE) != 0) { + do { + rte_pause(); + rte_compiler_barrier(); + nuse = cbi->use; + } while (nuse == puse); + } +} + +static void +bpf_eth_cbi_cleanup(struct bpf_eth_cbi *bc) +{ + bc->bpf = NULL; + memset(&bc->jit, 0, sizeof(bc->jit)); +} + +static struct bpf_eth_cbi * +bpf_eth_cbh_find(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbi *cbi; + + LIST_FOREACH(cbi, &cbh->list, link) { + if (cbi->port == port && cbi->queue == queue) + break; + } + return cbi; +} + +static struct bpf_eth_cbi * +bpf_eth_cbh_add(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbi *cbi; + + /* return an existing one */ + cbi = bpf_eth_cbh_find(cbh, port, queue); + if (cbi != NULL) + return cbi; + + cbi = rte_zmalloc(NULL, sizeof(*cbi), RTE_CACHE_LINE_SIZE); + if (cbi != NULL) { + cbi->port = port; + cbi->queue = queue; + LIST_INSERT_HEAD(&cbh->list, cbi, link); + } + return cbi; +} + +/* + * BPF packet processing routinies. 
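+ * apply_filter() compacts the mbuf array: packets with a non-zero BPF result keep their order at the front, the rest are freed (drop mode) or moved behind the accepted ones.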
+ */ + +static inline uint32_t +apply_filter(struct rte_mbuf *mb[], const uint64_t rc[], uint32_t num, + uint32_t drop) +{ + uint32_t i, j, k; + struct rte_mbuf *dr[num]; + + for (i = 0, j = 0, k = 0; i != num; i++) { + + /* filter matches */ + if (rc[i] != 0) + mb[j++] = mb[i]; + /* no match */ + else + dr[k++] = mb[i]; + } + + if (drop != 0) { + /* free filtered out mbufs */ + for (i = 0; i != k; i++) + rte_pktmbuf_free(dr[i]); + } else { + /* copy filtered out mbufs beyond good ones */ + for (i = 0; i != k; i++) + mb[j + i] = dr[i]; + } + + return j; +} + +static inline uint32_t +pkt_filter_vm(const struct rte_bpf *bpf, struct rte_mbuf *mb[], uint32_t num, + uint32_t drop) +{ + uint32_t i; + void *dp[num]; + uint64_t rc[num]; + + for (i = 0; i != num; i++) + dp[i] = rte_pktmbuf_mtod(mb[i], void *); + + rte_bpf_exec_burst(bpf, dp, rc, num); + return apply_filter(mb, rc, num, drop); +} + +static inline uint32_t +pkt_filter_jit(const struct rte_bpf_jit *jit, struct rte_mbuf *mb[], + uint32_t num, uint32_t drop) +{ + uint32_t i, n; + void *dp; + uint64_t rc[num]; + + n = 0; + for (i = 0; i != num; i++) { + dp = rte_pktmbuf_mtod(mb[i], void *); + rc[i] = jit->func(dp); + n += (rc[i] == 0); + } + + if (n != 0) + num = apply_filter(mb, rc, num, drop); + + return num; +} + +static inline uint32_t +pkt_filter_mb_vm(const struct rte_bpf *bpf, struct rte_mbuf *mb[], uint32_t num, + uint32_t drop) +{ + uint64_t rc[num]; + + rte_bpf_exec_burst(bpf, (void **)mb, rc, num); + return apply_filter(mb, rc, num, drop); +} + +static inline uint32_t +pkt_filter_mb_jit(const struct rte_bpf_jit *jit, struct rte_mbuf *mb[], + uint32_t num, uint32_t drop) +{ + uint32_t i, n; + uint64_t rc[num]; + + n = 0; + for (i = 0; i != num; i++) { + rc[i] = jit->func(mb[i]); + n += (rc[i] == 0); + } + + if (n != 0) + num = apply_filter(mb, rc, num, drop); + + return num; +} + +/* + * RX/TX callbacks for raw data bpf. + */ + +static uint16_t +bpf_rx_callback_vm(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_vm(cbi->bpf, pkt, nb_pkts, 1) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_rx_callback_jit(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_jit(&cbi->jit, pkt, nb_pkts, 1) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_tx_callback_vm(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_vm(cbi->bpf, pkt, nb_pkts, 0) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_tx_callback_jit(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_jit(&cbi->jit, pkt, nb_pkts, 0) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +/* + * RX/TX callbacks for mbuf. 
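+ * (unlike the raw data callbacks above, the BPF program is handed the rte_mbuf pointer itself rather than the packet data)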
+ */ + +static uint16_t +bpf_rx_callback_mb_vm(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_mb_vm(cbi->bpf, pkt, nb_pkts, 1) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_rx_callback_mb_jit(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_mb_jit(&cbi->jit, pkt, nb_pkts, 1) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_tx_callback_mb_vm(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_mb_vm(cbi->bpf, pkt, nb_pkts, 0) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static uint16_t +bpf_tx_callback_mb_jit(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, void *user_param) +{ + struct bpf_eth_cbi *cbi; + uint16_t rc; + + cbi = user_param; + bpf_eth_cbi_inuse(cbi); + rc = (cbi->cb != NULL) ? + pkt_filter_mb_jit(&cbi->jit, pkt, nb_pkts, 0) : + nb_pkts; + bpf_eth_cbi_unuse(cbi); + return rc; +} + +static rte_rx_callback_fn +select_rx_callback(enum rte_bpf_arg_type type, uint32_t flags) +{ + if (flags & RTE_BPF_ETH_F_JIT) { + if (type == RTE_BPF_ARG_PTR) + return bpf_rx_callback_jit; + else if (type == RTE_BPF_ARG_PTR_MBUF) + return bpf_rx_callback_mb_jit; + } else if (type == RTE_BPF_ARG_PTR) + return bpf_rx_callback_vm; + else if (type == RTE_BPF_ARG_PTR_MBUF) + return bpf_rx_callback_mb_vm; + + return NULL; +} + +static rte_tx_callback_fn +select_tx_callback(enum rte_bpf_arg_type type, uint32_t flags) +{ + if (flags & RTE_BPF_ETH_F_JIT) { + if (type == RTE_BPF_ARG_PTR) + return bpf_tx_callback_jit; + else if (type == RTE_BPF_ARG_PTR_MBUF) + return bpf_tx_callback_mb_jit; + } else if (type == RTE_BPF_ARG_PTR) + return bpf_tx_callback_vm; + else if (type == RTE_BPF_ARG_PTR_MBUF) + return bpf_tx_callback_mb_vm; + + return NULL; +} + +/* + * helper function to perform BPF unload for given port/queue. + * have to introduce extra complexity (and possible slowdown) here, + * as right now there is no safe generic way to remove RX/TX callback + * while IO is active. + * Still don't free memory allocated for callback handle itself, + * again right now there is no safe way to do that without stopping RX/TX + * on given port/queue first. 
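+ * Instead the callback is marked as empty and bpf_eth_cbi_wait() spins until the datapath stops using it; only then is the BPF object destroyed.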
+ */ +static void +bpf_eth_cbi_unload(struct bpf_eth_cbi *bc) +{ + /* mark this cbi as empty */ + bc->cb = NULL; + rte_smp_mb(); + + /* make sure datapath doesn't use bpf anymore, then destroy bpf */ + bpf_eth_cbi_wait(bc); + rte_bpf_destroy(bc->bpf); + bpf_eth_cbi_cleanup(bc); +} + +static void +bpf_eth_unload(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbi *bc; + + bc = bpf_eth_cbh_find(cbh, port, queue); + if (bc == NULL || bc->cb == NULL) + return; + + if (cbh->type == BPF_ETH_RX) + rte_eth_remove_rx_callback(port, queue, bc->cb); + else + rte_eth_remove_tx_callback(port, queue, bc->cb); + + bpf_eth_cbi_unload(bc); +} + + +__rte_experimental void +rte_bpf_eth_rx_unload(uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbh *cbh; + + cbh = &rx_cbh; + rte_spinlock_lock(&cbh->lock); + bpf_eth_unload(cbh, port, queue); + rte_spinlock_unlock(&cbh->lock); +} + +__rte_experimental void +rte_bpf_eth_tx_unload(uint16_t port, uint16_t queue) +{ + struct bpf_eth_cbh *cbh; + + cbh = &tx_cbh; + rte_spinlock_lock(&cbh->lock); + bpf_eth_unload(cbh, port, queue); + rte_spinlock_unlock(&cbh->lock); +} + +static int +bpf_eth_elf_load(struct bpf_eth_cbh *cbh, uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags) +{ + int32_t rc; + struct bpf_eth_cbi *bc; + struct rte_bpf *bpf; + rte_rx_callback_fn frx; + rte_tx_callback_fn ftx; + struct rte_bpf_jit jit; + + frx = NULL; + ftx = NULL; + + if (prm == NULL || rte_eth_dev_is_valid_port(port) == 0 || + queue >= RTE_MAX_QUEUES_PER_PORT) + return -EINVAL; + + if (cbh->type == BPF_ETH_RX) + frx = select_rx_callback(prm->prog_arg.type, flags); + else + ftx = select_tx_callback(prm->prog_arg.type, flags); + + if (frx == NULL && ftx == NULL) { + RTE_BPF_LOG(ERR, "%s(%u, %u): no callback selected;\n", + __func__, port, queue); + return -EINVAL; + } + + bpf = rte_bpf_elf_load(prm, fname, sname); + if (bpf == NULL) + return -rte_errno; + + rte_bpf_get_jit(bpf, &jit); + + if ((flags & RTE_BPF_ETH_F_JIT) != 0 && jit.func == NULL) { + RTE_BPF_LOG(ERR, "%s(%u, %u): no JIT generated;\n", + __func__, port, queue); + rte_bpf_destroy(bpf); + return -ENOTSUP; + } + + /* setup/update global callback info */ + bc = bpf_eth_cbh_add(cbh, port, queue); + if (bc == NULL) + return -ENOMEM; + + /* remove old one, if any */ + if (bc->cb != NULL) + bpf_eth_unload(cbh, port, queue); + + bc->bpf = bpf; + bc->jit = jit; + + if (cbh->type == BPF_ETH_RX) + bc->cb = rte_eth_add_rx_callback(port, queue, frx, bc); + else + bc->cb = rte_eth_add_tx_callback(port, queue, ftx, bc); + + if (bc->cb == NULL) { + rc = -rte_errno; + rte_bpf_destroy(bpf); + bpf_eth_cbi_cleanup(bc); + } else + rc = 0; + + return rc; +} + +__rte_experimental int +rte_bpf_eth_rx_elf_load(uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags) +{ + int32_t rc; + struct bpf_eth_cbh *cbh; + + cbh = &rx_cbh; + rte_spinlock_lock(&cbh->lock); + rc = bpf_eth_elf_load(cbh, port, queue, prm, fname, sname, flags); + rte_spinlock_unlock(&cbh->lock); + + return rc; +} + +__rte_experimental int +rte_bpf_eth_tx_elf_load(uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags) +{ + int32_t rc; + struct bpf_eth_cbh *cbh; + + cbh = &tx_cbh; + rte_spinlock_lock(&cbh->lock); + rc = bpf_eth_elf_load(cbh, port, queue, prm, fname, sname, flags); + rte_spinlock_unlock(&cbh->lock); + + return rc; +} diff --git 
a/src/spdk/dpdk/lib/librte_bpf/bpf_validate.c b/src/spdk/dpdk/lib/librte_bpf/bpf_validate.c new file mode 100644 index 00000000..83983efc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/bpf_validate.c @@ -0,0 +1,2248 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "bpf_impl.h" + +struct bpf_reg_val { + struct rte_bpf_arg v; + uint64_t mask; + struct { + int64_t min; + int64_t max; + } s; + struct { + uint64_t min; + uint64_t max; + } u; +}; + +struct bpf_eval_state { + struct bpf_reg_val rv[EBPF_REG_NUM]; + struct bpf_reg_val sv[MAX_BPF_STACK_SIZE / sizeof(uint64_t)]; +}; + +/* possible instruction node colour */ +enum { + WHITE, + GREY, + BLACK, + MAX_NODE_COLOUR +}; + +/* possible edge types */ +enum { + UNKNOWN_EDGE, + TREE_EDGE, + BACK_EDGE, + CROSS_EDGE, + MAX_EDGE_TYPE +}; + +#define MAX_EDGES 2 + +struct inst_node { + uint8_t colour; + uint8_t nb_edge:4; + uint8_t cur_edge:4; + uint8_t edge_type[MAX_EDGES]; + uint32_t edge_dest[MAX_EDGES]; + uint32_t prev_node; + struct bpf_eval_state *evst; +}; + +struct bpf_verifier { + const struct rte_bpf_prm *prm; + struct inst_node *in; + uint64_t stack_sz; + uint32_t nb_nodes; + uint32_t nb_jcc_nodes; + uint32_t node_colour[MAX_NODE_COLOUR]; + uint32_t edge_type[MAX_EDGE_TYPE]; + struct bpf_eval_state *evst; + struct inst_node *evin; + struct { + uint32_t num; + uint32_t cur; + struct bpf_eval_state *ent; + } evst_pool; +}; + +struct bpf_ins_check { + struct { + uint16_t dreg; + uint16_t sreg; + } mask; + struct { + uint16_t min; + uint16_t max; + } off; + struct { + uint32_t min; + uint32_t max; + } imm; + const char * (*check)(const struct ebpf_insn *); + const char * (*eval)(struct bpf_verifier *, const struct ebpf_insn *); +}; + +#define ALL_REGS RTE_LEN2MASK(EBPF_REG_NUM, uint16_t) +#define WRT_REGS RTE_LEN2MASK(EBPF_REG_10, uint16_t) +#define ZERO_REG RTE_LEN2MASK(EBPF_REG_1, uint16_t) + +/* + * check and evaluate functions for particular instruction types. 
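+ * each helper returns NULL on success or a short string describing the constraint that was violated.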
+ */ + +static const char * +check_alu_bele(const struct ebpf_insn *ins) +{ + if (ins->imm != 16 && ins->imm != 32 && ins->imm != 64) + return "invalid imm field"; + return NULL; +} + +static const char * +eval_exit(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + RTE_SET_USED(ins); + if (bvf->evst->rv[EBPF_REG_0].v.type == RTE_BPF_ARG_UNDEF) + return "undefined return value"; + return NULL; +} + +/* setup max possible with this mask bounds */ +static void +eval_umax_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + rv->u.max = mask; + rv->u.min = 0; +} + +static void +eval_smax_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + rv->s.max = mask >> 1; + rv->s.min = rv->s.max ^ UINT64_MAX; +} + +static void +eval_max_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_umax_bound(rv, mask); + eval_smax_bound(rv, mask); +} + +static void +eval_fill_max_bound(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_max_bound(rv, mask); + rv->v.type = RTE_BPF_ARG_RAW; + rv->mask = mask; +} + +static void +eval_fill_imm64(struct bpf_reg_val *rv, uint64_t mask, uint64_t val) +{ + rv->mask = mask; + rv->s.min = val; + rv->s.max = val; + rv->u.min = val; + rv->u.max = val; +} + +static void +eval_fill_imm(struct bpf_reg_val *rv, uint64_t mask, int32_t imm) +{ + uint64_t v; + + v = (uint64_t)imm & mask; + + rv->v.type = RTE_BPF_ARG_RAW; + eval_fill_imm64(rv, mask, v); +} + +static const char * +eval_ld_imm64(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t i; + uint64_t val; + struct bpf_reg_val *rd; + + val = (uint32_t)ins[0].imm | (uint64_t)(uint32_t)ins[1].imm << 32; + + rd = bvf->evst->rv + ins->dst_reg; + rd->v.type = RTE_BPF_ARG_RAW; + eval_fill_imm64(rd, UINT64_MAX, val); + + for (i = 0; i != bvf->prm->nb_xsym; i++) { + + /* load of external variable */ + if (bvf->prm->xsym[i].type == RTE_BPF_XTYPE_VAR && + (uintptr_t)bvf->prm->xsym[i].var.val == val) { + rd->v = bvf->prm->xsym[i].var.desc; + eval_fill_imm64(rd, UINT64_MAX, 0); + break; + } + } + + return NULL; +} + +static void +eval_apply_mask(struct bpf_reg_val *rv, uint64_t mask) +{ + struct bpf_reg_val rt; + + rt.u.min = rv->u.min & mask; + rt.u.max = rv->u.max & mask; + if (rt.u.min != rv->u.min || rt.u.max != rv->u.max) { + rv->u.max = RTE_MAX(rt.u.max, mask); + rv->u.min = 0; + } + + eval_smax_bound(&rt, mask); + rv->s.max = RTE_MIN(rt.s.max, rv->s.max); + rv->s.min = RTE_MAX(rt.s.min, rv->s.min); + + rv->mask = mask; +} + +static void +eval_add(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, uint64_t msk) +{ + struct bpf_reg_val rv; + + rv.u.min = (rd->u.min + rs->u.min) & msk; + rv.u.max = (rd->u.min + rs->u.max) & msk; + rv.s.min = (rd->s.min + rs->s.min) & msk; + rv.s.max = (rd->s.max + rs->s.max) & msk; + + /* + * if at least one of the operands is not constant, + * then check for overflow + */ + if ((rd->u.min != rd->u.max || rs->u.min != rs->u.max) && + (rv.u.min < rd->u.min || rv.u.max < rd->u.max)) + eval_umax_bound(&rv, msk); + + if ((rd->s.min != rd->s.max || rs->s.min != rs->s.max) && + (((rs->s.min < 0 && rv.s.min > rd->s.min) || + rv.s.min < rd->s.min) || + ((rs->s.max < 0 && rv.s.max > rd->s.max) || + rv.s.max < rd->s.max))) + eval_smax_bound(&rv, msk); + + rd->s = rv.s; + rd->u = rv.u; +} + +static void +eval_sub(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, uint64_t msk) +{ + struct bpf_reg_val rv; + + rv.u.min = (rd->u.min - rs->u.min) & msk; + rv.u.max = (rd->u.min - rs->u.max) & msk; + rv.s.min = (rd->s.min - rs->s.min) & msk; + rv.s.max = (rd->s.max - rs->s.max) & msk; + + /* 
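+ * (mirror of eval_add: unsigned wrap-around makes the result larger than the minuend, hence the '>' comparisons below)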
+ * if at least one of the operands is not constant, + * then check for overflow + */ + if ((rd->u.min != rd->u.max || rs->u.min != rs->u.max) && + (rv.u.min > rd->u.min || rv.u.max > rd->u.max)) + eval_umax_bound(&rv, msk); + + if ((rd->s.min != rd->s.max || rs->s.min != rs->s.max) && + (((rs->s.min < 0 && rv.s.min < rd->s.min) || + rv.s.min > rd->s.min) || + ((rs->s.max < 0 && rv.s.max < rd->s.max) || + rv.s.max > rd->s.max))) + eval_smax_bound(&rv, msk); + + rd->s = rv.s; + rd->u = rv.u; +} + +static void +eval_lsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + /* check for overflow */ + if (rd->u.max > RTE_LEN2MASK(opsz - rs->u.max, uint64_t)) + eval_umax_bound(rd, msk); + else { + rd->u.max <<= rs->u.max; + rd->u.min <<= rs->u.min; + } + + /* check that dreg values are and would remain always positive */ + if ((uint64_t)rd->s.min >> (opsz - 1) != 0 || rd->s.max >= + RTE_LEN2MASK(opsz - rs->u.max - 1, int64_t)) + eval_smax_bound(rd, msk); + else { + rd->s.max <<= rs->u.max; + rd->s.min <<= rs->u.min; + } +} + +static void +eval_rsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + rd->u.max >>= rs->u.min; + rd->u.min >>= rs->u.max; + + /* check that dreg values are always positive */ + if ((uint64_t)rd->s.min >> (opsz - 1) != 0) + eval_smax_bound(rd, msk); + else { + rd->s.max >>= rs->u.min; + rd->s.min >>= rs->u.max; + } +} + +static void +eval_arsh(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + uint32_t shv; + + /* check if shift value is less then max result bits */ + if (rs->u.max >= opsz) { + eval_max_bound(rd, msk); + return; + } + + rd->u.max = (int64_t)rd->u.max >> rs->u.min; + rd->u.min = (int64_t)rd->u.min >> rs->u.max; + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min <<= opsz; + rd->s.max <<= opsz; + shv = opsz; + } else + shv = 0; + + if (rd->s.min < 0) + rd->s.min = (rd->s.min >> (rs->u.min + shv)) & msk; + else + rd->s.min = (rd->s.min >> (rs->u.max + shv)) & msk; + + if (rd->s.max < 0) + rd->s.max = (rd->s.max >> (rs->u.max + shv)) & msk; + else + rd->s.max = (rd->s.max >> (rs->u.min + shv)) & msk; +} + +static uint64_t +eval_umax_bits(uint64_t v, size_t opsz) +{ + if (v == 0) + return 0; + + v = __builtin_clzll(v); + return RTE_LEN2MASK(opsz - v, uint64_t); +} + +/* estimate max possible value for (v1 & v2) */ +static uint64_t +eval_uand_max(uint64_t v1, uint64_t v2, size_t opsz) +{ + v1 = eval_umax_bits(v1, opsz); + v2 = eval_umax_bits(v2, opsz); + return (v1 & v2); +} + +/* estimate max possible value for (v1 | v2) */ +static uint64_t +eval_uor_max(uint64_t v1, uint64_t v2, size_t opsz) +{ + v1 = eval_umax_bits(v1, opsz); + v2 = eval_umax_bits(v2, opsz); + return (v1 | v2); +} + +static void +eval_and(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min &= rs->u.min; + rd->u.max &= rs->u.max; + } else { + rd->u.max = eval_uand_max(rd->u.max, rs->u.max, opsz); + rd->u.min &= rs->u.min; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min &= 
rs->s.min; + rd->s.max &= rs->s.max; + /* at least one of operand is non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + rd->s.max = eval_uand_max(rd->s.max & (msk >> 1), + rs->s.max & (msk >> 1), opsz); + rd->s.min &= rs->s.min; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_or(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min |= rs->u.min; + rd->u.max |= rs->u.max; + } else { + rd->u.max = eval_uor_max(rd->u.max, rs->u.max, opsz); + rd->u.min |= rs->u.min; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min |= rs->s.min; + rd->s.max |= rs->s.max; + + /* both operands are non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + rd->s.max = eval_uor_max(rd->s.max, rs->s.max, opsz); + rd->s.min |= rs->s.min; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_xor(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min ^= rs->u.min; + rd->u.max ^= rs->u.max; + } else { + rd->u.max = eval_uor_max(rd->u.max, rs->u.max, opsz); + rd->u.min = 0; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min ^= rs->s.min; + rd->s.max ^= rs->s.max; + + /* both operands are non-negative */ + } else if (rd->s.min >= 0 || rs->s.min >= 0) { + rd->s.max = eval_uor_max(rd->s.max, rs->s.max, opsz); + rd->s.min = 0; + } else + eval_smax_bound(rd, msk); +} + +static void +eval_mul(struct bpf_reg_val *rd, const struct bpf_reg_val *rs, size_t opsz, + uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + rd->u.min = (rd->u.min * rs->u.min) & msk; + rd->u.max = (rd->u.max * rs->u.max) & msk; + /* check for overflow */ + } else if (rd->u.max <= msk >> opsz / 2 && rs->u.max <= msk >> opsz) { + rd->u.max *= rs->u.max; + rd->u.min *= rd->u.min; + } else + eval_umax_bound(rd, msk); + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + rd->s.min = (rd->s.min * rs->s.min) & msk; + rd->s.max = (rd->s.max * rs->s.max) & msk; + /* check that both operands are positive and no overflow */ + } else if (rd->s.min >= 0 && rs->s.min >= 0) { + rd->s.max *= rs->s.max; + rd->s.min *= rd->s.min; + } else + eval_smax_bound(rd, msk); +} + +static const char * +eval_divmod(uint32_t op, struct bpf_reg_val *rd, struct bpf_reg_val *rs, + size_t opsz, uint64_t msk) +{ + /* both operands are constants */ + if (rd->u.min == rd->u.max && rs->u.min == rs->u.max) { + if (rs->u.max == 0) + return "division by 0"; + if (op == BPF_DIV) { + rd->u.min /= rs->u.min; + rd->u.max /= rs->u.max; + } else { + rd->u.min %= rs->u.min; + rd->u.max %= rs->u.max; + } + } else { + if (op == BPF_MOD) + rd->u.max = RTE_MIN(rd->u.max, rs->u.max - 1); + else + rd->u.max = rd->u.max; + rd->u.min = 0; + } + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min = (int32_t)rd->s.min; + rd->s.max = (int32_t)rd->s.max; + rs->s.min = (int32_t)rs->s.min; + rs->s.max = (int32_t)rs->s.max; + } + + /* both operands are constants */ + if (rd->s.min == rd->s.max && rs->s.min == rs->s.max) { + if (rs->s.max == 0) + return "division by 0"; + if (op == BPF_DIV) { + rd->s.min /= rs->s.min; + 
rd->s.max /= rs->s.max; + } else { + rd->s.min %= rs->s.min; + rd->s.max %= rs->s.max; + } + } else if (op == BPF_MOD) { + rd->s.min = RTE_MAX(rd->s.max, 0); + rd->s.min = RTE_MIN(rd->s.min, 0); + } else + eval_smax_bound(rd, msk); + + rd->s.max &= msk; + rd->s.min &= msk; + + return NULL; +} + +static void +eval_neg(struct bpf_reg_val *rd, size_t opsz, uint64_t msk) +{ + uint64_t ux, uy; + int64_t sx, sy; + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->u.min = (int32_t)rd->u.min; + rd->u.max = (int32_t)rd->u.max; + } + + ux = -(int64_t)rd->u.min & msk; + uy = -(int64_t)rd->u.max & msk; + + rd->u.max = RTE_MAX(ux, uy); + rd->u.min = RTE_MIN(ux, uy); + + /* if we have 32-bit values - extend them to 64-bit */ + if (opsz == sizeof(uint32_t) * CHAR_BIT) { + rd->s.min = (int32_t)rd->s.min; + rd->s.max = (int32_t)rd->s.max; + } + + sx = -rd->s.min & msk; + sy = -rd->s.max & msk; + + rd->s.max = RTE_MAX(sx, sy); + rd->s.min = RTE_MIN(sx, sy); +} + +/* + * check that destination and source operand are in defined state. + */ +static const char * +eval_defined(const struct bpf_reg_val *dst, const struct bpf_reg_val *src) +{ + if (dst != NULL && dst->v.type == RTE_BPF_ARG_UNDEF) + return "dest reg value is undefined"; + if (src != NULL && src->v.type == RTE_BPF_ARG_UNDEF) + return "src reg value is undefined"; + return NULL; +} + +static const char * +eval_alu(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint64_t msk; + uint32_t op; + size_t opsz; + const char *err; + struct bpf_eval_state *st; + struct bpf_reg_val *rd, rs; + + opsz = (BPF_CLASS(ins->code) == BPF_ALU) ? + sizeof(uint32_t) : sizeof(uint64_t); + opsz = opsz * CHAR_BIT; + msk = RTE_LEN2MASK(opsz, uint64_t); + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + + if (BPF_SRC(ins->code) == BPF_X) { + rs = st->rv[ins->src_reg]; + eval_apply_mask(&rs, msk); + } else + eval_fill_imm(&rs, msk, ins->imm); + + eval_apply_mask(rd, msk); + + op = BPF_OP(ins->code); + + err = eval_defined((op != EBPF_MOV) ? rd : NULL, + (op != BPF_NEG) ? 
&rs : NULL); + if (err != NULL) + return err; + + if (op == BPF_ADD) + eval_add(rd, &rs, msk); + else if (op == BPF_SUB) + eval_sub(rd, &rs, msk); + else if (op == BPF_LSH) + eval_lsh(rd, &rs, opsz, msk); + else if (op == BPF_RSH) + eval_rsh(rd, &rs, opsz, msk); + else if (op == EBPF_ARSH) + eval_arsh(rd, &rs, opsz, msk); + else if (op == BPF_AND) + eval_and(rd, &rs, opsz, msk); + else if (op == BPF_OR) + eval_or(rd, &rs, opsz, msk); + else if (op == BPF_XOR) + eval_xor(rd, &rs, opsz, msk); + else if (op == BPF_MUL) + eval_mul(rd, &rs, opsz, msk); + else if (op == BPF_DIV || op == BPF_MOD) + err = eval_divmod(op, rd, &rs, opsz, msk); + else if (op == BPF_NEG) + eval_neg(rd, opsz, msk); + else if (op == EBPF_MOV) + *rd = rs; + else + eval_max_bound(rd, msk); + + return err; +} + +static const char * +eval_bele(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint64_t msk; + struct bpf_eval_state *st; + struct bpf_reg_val *rd; + const char *err; + + msk = RTE_LEN2MASK(ins->imm, uint64_t); + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + + err = eval_defined(rd, NULL); + if (err != NULL) + return err; + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + if (ins->code == (BPF_ALU | EBPF_END | EBPF_TO_BE)) + eval_max_bound(rd, msk); + else + eval_apply_mask(rd, msk); +#else + if (ins->code == (BPF_ALU | EBPF_END | EBPF_TO_LE)) + eval_max_bound(rd, msk); + else + eval_apply_mask(rd, msk); +#endif + + return NULL; +} + +static const char * +eval_ptr(struct bpf_verifier *bvf, struct bpf_reg_val *rm, uint32_t opsz, + uint32_t align, int16_t off) +{ + struct bpf_reg_val rv; + + /* calculate reg + offset */ + eval_fill_imm(&rv, rm->mask, off); + eval_add(rm, &rv, rm->mask); + + if (RTE_BPF_ARG_PTR_TYPE(rm->v.type) == 0) + return "destination is not a pointer"; + + if (rm->mask != UINT64_MAX) + return "pointer truncation"; + + if (rm->u.max + opsz > rm->v.size || + (uint64_t)rm->s.max + opsz > rm->v.size || + rm->s.min < 0) + return "memory boundary violation"; + + if (rm->u.max % align != 0) + return "unaligned memory access"; + + if (rm->v.type == RTE_BPF_ARG_PTR_STACK) { + + if (rm->u.max != rm->u.min || rm->s.max != rm->s.min || + rm->u.max != (uint64_t)rm->s.max) + return "stack access with variable offset"; + + bvf->stack_sz = RTE_MAX(bvf->stack_sz, rm->v.size - rm->u.max); + + /* pointer to mbuf */ + } else if (rm->v.type == RTE_BPF_ARG_PTR_MBUF) { + + if (rm->u.max != rm->u.min || rm->s.max != rm->s.min || + rm->u.max != (uint64_t)rm->s.max) + return "mbuf access with variable offset"; + } + + return NULL; +} + +static void +eval_max_load(struct bpf_reg_val *rv, uint64_t mask) +{ + eval_umax_bound(rv, mask); + + /* full 64-bit load */ + if (mask == UINT64_MAX) + eval_smax_bound(rv, mask); + + /* zero-extend load */ + rv->s.min = rv->u.min; + rv->s.max = rv->u.max; +} + + +static const char * +eval_load(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t opsz; + uint64_t msk; + const char *err; + struct bpf_eval_state *st; + struct bpf_reg_val *rd, rs; + const struct bpf_reg_val *sv; + + st = bvf->evst; + rd = st->rv + ins->dst_reg; + rs = st->rv[ins->src_reg]; + opsz = bpf_size(BPF_SIZE(ins->code)); + msk = RTE_LEN2MASK(opsz * CHAR_BIT, uint64_t); + + err = eval_ptr(bvf, &rs, opsz, 1, ins->off); + if (err != NULL) + return err; + + if (rs.v.type == RTE_BPF_ARG_PTR_STACK) { + + sv = st->sv + rs.u.max / sizeof(uint64_t); + if (sv->v.type == RTE_BPF_ARG_UNDEF || sv->mask < msk) + return "undefined value on the stack"; + + *rd = *sv; + + /* pointer to mbuf */ + } else if 
(rs.v.type == RTE_BPF_ARG_PTR_MBUF) { + + if (rs.u.max == offsetof(struct rte_mbuf, next)) { + eval_fill_imm(rd, msk, 0); + rd->v = rs.v; + } else if (rs.u.max == offsetof(struct rte_mbuf, buf_addr)) { + eval_fill_imm(rd, msk, 0); + rd->v.type = RTE_BPF_ARG_PTR; + rd->v.size = rs.v.buf_size; + } else if (rs.u.max == offsetof(struct rte_mbuf, data_off)) { + eval_fill_imm(rd, msk, RTE_PKTMBUF_HEADROOM); + rd->v.type = RTE_BPF_ARG_RAW; + } else { + eval_max_load(rd, msk); + rd->v.type = RTE_BPF_ARG_RAW; + } + + /* pointer to raw data */ + } else { + eval_max_load(rd, msk); + rd->v.type = RTE_BPF_ARG_RAW; + } + + return NULL; +} + +static const char * +eval_mbuf_store(const struct bpf_reg_val *rv, uint32_t opsz) +{ + uint32_t i; + + static const struct { + size_t off; + size_t sz; + } mbuf_ro_fileds[] = { + { .off = offsetof(struct rte_mbuf, buf_addr), }, + { .off = offsetof(struct rte_mbuf, refcnt), }, + { .off = offsetof(struct rte_mbuf, nb_segs), }, + { .off = offsetof(struct rte_mbuf, buf_len), }, + { .off = offsetof(struct rte_mbuf, pool), }, + { .off = offsetof(struct rte_mbuf, next), }, + { .off = offsetof(struct rte_mbuf, priv_size), }, + }; + + for (i = 0; i != RTE_DIM(mbuf_ro_fileds) && + (mbuf_ro_fileds[i].off + mbuf_ro_fileds[i].sz <= + rv->u.max || rv->u.max + opsz <= mbuf_ro_fileds[i].off); + i++) + ; + + if (i != RTE_DIM(mbuf_ro_fileds)) + return "store to the read-only mbuf field"; + + return NULL; + +} + +static const char * +eval_store(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t opsz; + uint64_t msk; + const char *err; + struct bpf_eval_state *st; + struct bpf_reg_val rd, rs, *sv; + + opsz = bpf_size(BPF_SIZE(ins->code)); + msk = RTE_LEN2MASK(opsz * CHAR_BIT, uint64_t); + + st = bvf->evst; + rd = st->rv[ins->dst_reg]; + + if (BPF_CLASS(ins->code) == BPF_STX) { + rs = st->rv[ins->src_reg]; + eval_apply_mask(&rs, msk); + } else + eval_fill_imm(&rs, msk, ins->imm); + + err = eval_defined(NULL, &rs); + if (err != NULL) + return err; + + err = eval_ptr(bvf, &rd, opsz, 1, ins->off); + if (err != NULL) + return err; + + if (rd.v.type == RTE_BPF_ARG_PTR_STACK) { + + sv = st->sv + rd.u.max / sizeof(uint64_t); + if (BPF_CLASS(ins->code) == BPF_STX && + BPF_MODE(ins->code) == EBPF_XADD) + eval_max_bound(sv, msk); + else + *sv = rs; + + /* pointer to mbuf */ + } else if (rd.v.type == RTE_BPF_ARG_PTR_MBUF) { + err = eval_mbuf_store(&rd, opsz); + if (err != NULL) + return err; + } + + return NULL; +} + +static const char * +eval_func_arg(struct bpf_verifier *bvf, const struct rte_bpf_arg *arg, + struct bpf_reg_val *rv) +{ + uint32_t i, n; + struct bpf_eval_state *st; + const char *err; + + st = bvf->evst; + + if (rv->v.type == RTE_BPF_ARG_UNDEF) + return "Undefined argument type"; + + if (arg->type != rv->v.type && + arg->type != RTE_BPF_ARG_RAW && + (arg->type != RTE_BPF_ARG_PTR || + RTE_BPF_ARG_PTR_TYPE(rv->v.type) == 0)) + return "Invalid argument type"; + + err = NULL; + + /* argument is a pointer */ + if (RTE_BPF_ARG_PTR_TYPE(arg->type) != 0) { + + err = eval_ptr(bvf, rv, arg->size, 1, 0); + + /* + * pointer to the variable on the stack is passed + * as an argument, mark stack space it occupies as initialized. 
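+ * (the called function may write through that pointer, so the covered slots are filled with RAW max-bound values)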
+ */ + if (err == NULL && rv->v.type == RTE_BPF_ARG_PTR_STACK) { + + i = rv->u.max / sizeof(uint64_t); + n = i + arg->size / sizeof(uint64_t); + while (i != n) { + eval_fill_max_bound(st->sv + i, UINT64_MAX); + i++; + }; + } + } + + return err; +} + +static const char * +eval_call(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint64_t msk; + uint32_t i, idx; + struct bpf_reg_val *rv; + const struct rte_bpf_xsym *xsym; + const char *err; + + idx = ins->imm; + + if (idx >= bvf->prm->nb_xsym || + bvf->prm->xsym[idx].type != RTE_BPF_XTYPE_FUNC) + return "invalid external function index"; + + /* for now don't support function calls on 32 bit platform */ + if (sizeof(uint64_t) != sizeof(uintptr_t)) + return "function calls are supported only for 64 bit apps"; + + xsym = bvf->prm->xsym + idx; + + /* evaluate function arguments */ + err = NULL; + for (i = 0; i != xsym->func.nb_args && err == NULL; i++) { + err = eval_func_arg(bvf, xsym->func.args + i, + bvf->evst->rv + EBPF_REG_1 + i); + } + + /* R1-R5 argument/scratch registers */ + for (i = EBPF_REG_1; i != EBPF_REG_6; i++) + bvf->evst->rv[i].v.type = RTE_BPF_ARG_UNDEF; + + /* update return value */ + + rv = bvf->evst->rv + EBPF_REG_0; + rv->v = xsym->func.ret; + msk = (rv->v.type == RTE_BPF_ARG_RAW) ? + RTE_LEN2MASK(rv->v.size * CHAR_BIT, uint64_t) : UINTPTR_MAX; + eval_max_bound(rv, msk); + rv->mask = msk; + + return err; +} + +static void +eval_jeq_jne(struct bpf_reg_val *trd, struct bpf_reg_val *trs) +{ + /* sreg is constant */ + if (trs->u.min == trs->u.max) { + trd->u = trs->u; + /* dreg is constant */ + } else if (trd->u.min == trd->u.max) { + trs->u = trd->u; + } else { + trd->u.max = RTE_MIN(trd->u.max, trs->u.max); + trd->u.min = RTE_MAX(trd->u.min, trs->u.min); + trs->u = trd->u; + } + + /* sreg is constant */ + if (trs->s.min == trs->s.max) { + trd->s = trs->s; + /* dreg is constant */ + } else if (trd->s.min == trd->s.max) { + trs->s = trd->s; + } else { + trd->s.max = RTE_MIN(trd->s.max, trs->s.max); + trd->s.min = RTE_MAX(trd->s.min, trs->s.min); + trs->s = trd->s; + } +} + +static void +eval_jgt_jle(struct bpf_reg_val *trd, struct bpf_reg_val *trs, + struct bpf_reg_val *frd, struct bpf_reg_val *frs) +{ + frd->u.max = RTE_MIN(frd->u.max, frs->u.min); + trd->u.min = RTE_MAX(trd->u.min, trs->u.min + 1); +} + +static void +eval_jlt_jge(struct bpf_reg_val *trd, struct bpf_reg_val *trs, + struct bpf_reg_val *frd, struct bpf_reg_val *frs) +{ + frd->u.min = RTE_MAX(frd->u.min, frs->u.min); + trd->u.max = RTE_MIN(trd->u.max, trs->u.max - 1); +} + +static void +eval_jsgt_jsle(struct bpf_reg_val *trd, struct bpf_reg_val *trs, + struct bpf_reg_val *frd, struct bpf_reg_val *frs) +{ + frd->s.max = RTE_MIN(frd->s.max, frs->s.min); + trd->s.min = RTE_MAX(trd->s.min, trs->s.min + 1); +} + +static void +eval_jslt_jsge(struct bpf_reg_val *trd, struct bpf_reg_val *trs, + struct bpf_reg_val *frd, struct bpf_reg_val *frs) +{ + frd->s.min = RTE_MAX(frd->s.min, frs->s.min); + trd->s.max = RTE_MIN(trd->s.max, trs->s.max - 1); +} + +static const char * +eval_jcc(struct bpf_verifier *bvf, const struct ebpf_insn *ins) +{ + uint32_t op; + const char *err; + struct bpf_eval_state *fst, *tst; + struct bpf_reg_val *frd, *frs, *trd, *trs; + struct bpf_reg_val rvf, rvt; + + tst = bvf->evst; + fst = bvf->evin->evst; + + frd = fst->rv + ins->dst_reg; + trd = tst->rv + ins->dst_reg; + + if (BPF_SRC(ins->code) == BPF_X) { + frs = fst->rv + ins->src_reg; + trs = tst->rv + ins->src_reg; + } else { + frs = &rvf; + trs = &rvt; + eval_fill_imm(frs, 
UINT64_MAX, ins->imm); + eval_fill_imm(trs, UINT64_MAX, ins->imm); + } + + err = eval_defined(trd, trs); + if (err != NULL) + return err; + + op = BPF_OP(ins->code); + + if (op == BPF_JEQ) + eval_jeq_jne(trd, trs); + else if (op == EBPF_JNE) + eval_jeq_jne(frd, frs); + else if (op == BPF_JGT) + eval_jgt_jle(trd, trs, frd, frs); + else if (op == EBPF_JLE) + eval_jgt_jle(frd, frs, trd, trs); + else if (op == EBPF_JLT) + eval_jlt_jge(trd, trs, frd, frs); + else if (op == BPF_JGE) + eval_jlt_jge(frd, frs, trd, trs); + else if (op == EBPF_JSGT) + eval_jsgt_jsle(trd, trs, frd, frs); + else if (op == EBPF_JSLE) + eval_jsgt_jsle(frd, frs, trd, trs); + else if (op == EBPF_JLT) + eval_jslt_jsge(trd, trs, frd, frs); + else if (op == EBPF_JSGE) + eval_jslt_jsge(frd, frs, trd, trs); + + return NULL; +} + +/* + * validate parameters for each instruction type. + */ +static const struct bpf_ins_check ins_chk[UINT8_MAX] = { + /* ALU IMM 32-bit instructions */ + [(BPF_ALU | BPF_ADD | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_SUB | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_AND | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_OR | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_LSH | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_RSH | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_XOR | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_MUL | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | EBPF_MOV | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_DIV | BPF_K)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 1, .max = UINT32_MAX}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_MOD | BPF_K)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 1, .max = UINT32_MAX}, + .eval = eval_alu, + }, + /* ALU IMM 64-bit instructions */ + [(EBPF_ALU64 | BPF_ADD | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_SUB | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_AND | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = 
eval_alu, + }, + [(EBPF_ALU64 | BPF_OR | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_LSH | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_RSH | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | EBPF_ARSH | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_XOR | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_MUL | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | EBPF_MOV | BPF_K)] = { + .mask = {.dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX,}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_DIV | BPF_K)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 1, .max = UINT32_MAX}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_MOD | BPF_K)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 1, .max = UINT32_MAX}, + .eval = eval_alu, + }, + /* ALU REG 32-bit instructions */ + [(BPF_ALU | BPF_ADD | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_SUB | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_AND | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_OR | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_LSH | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_RSH | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_XOR | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_MUL | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_DIV | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_MOD | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | EBPF_MOV | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + 
.off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | BPF_NEG)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(BPF_ALU | EBPF_END | EBPF_TO_BE)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 16, .max = 64}, + .check = check_alu_bele, + .eval = eval_bele, + }, + [(BPF_ALU | EBPF_END | EBPF_TO_LE)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 16, .max = 64}, + .check = check_alu_bele, + .eval = eval_bele, + }, + /* ALU REG 64-bit instructions */ + [(EBPF_ALU64 | BPF_ADD | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_SUB | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_AND | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_OR | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_LSH | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_RSH | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | EBPF_ARSH | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_XOR | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_MUL | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_DIV | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_MOD | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | EBPF_MOV | BPF_X)] = { + .mask = { .dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + [(EBPF_ALU64 | BPF_NEG)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_alu, + }, + /* load instructions */ + [(BPF_LDX | BPF_MEM | BPF_B)] = { + .mask = {. dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_load, + }, + [(BPF_LDX | BPF_MEM | BPF_H)] = { + .mask = {. dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_load, + }, + [(BPF_LDX | BPF_MEM | BPF_W)] = { + .mask = {. 
dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_load, + }, + [(BPF_LDX | BPF_MEM | EBPF_DW)] = { + .mask = {. dreg = WRT_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_load, + }, + /* load 64 bit immediate value */ + [(BPF_LD | BPF_IMM | EBPF_DW)] = { + .mask = { .dreg = WRT_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_ld_imm64, + }, + /* store REG instructions */ + [(BPF_STX | BPF_MEM | BPF_B)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_store, + }, + [(BPF_STX | BPF_MEM | BPF_H)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_store, + }, + [(BPF_STX | BPF_MEM | BPF_W)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_store, + }, + [(BPF_STX | BPF_MEM | EBPF_DW)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_store, + }, + /* atomic add instructions */ + [(BPF_STX | EBPF_XADD | BPF_W)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_store, + }, + [(BPF_STX | EBPF_XADD | EBPF_DW)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_store, + }, + /* store IMM instructions */ + [(BPF_ST | BPF_MEM | BPF_B)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_store, + }, + [(BPF_ST | BPF_MEM | BPF_H)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_store, + }, + [(BPF_ST | BPF_MEM | BPF_W)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_store, + }, + [(BPF_ST | BPF_MEM | EBPF_DW)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_store, + }, + /* jump instruction */ + [(BPF_JMP | BPF_JA)] = { + .mask = { .dreg = ZERO_REG, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + }, + /* jcc IMM instructions */ + [(BPF_JMP | BPF_JEQ | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JNE | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | BPF_JGT | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JLT | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | BPF_JGE | BPF_K)] = { + .mask 
= { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JLE | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JSGT | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JSLT | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JSGE | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JSLE | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + [(BPF_JMP | BPF_JSET | BPF_K)] = { + .mask = { .dreg = ALL_REGS, .sreg = ZERO_REG}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_jcc, + }, + /* jcc REG instructions */ + [(BPF_JMP | BPF_JEQ | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JNE | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | BPF_JGT | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JLT | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | BPF_JGE | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JLE | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JSGT | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JSLT | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + }, + [(BPF_JMP | EBPF_JSGE | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | EBPF_JSLE | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + [(BPF_JMP | BPF_JSET | BPF_X)] = { + .mask = { .dreg = ALL_REGS, .sreg = ALL_REGS}, + .off = { .min = 0, .max = UINT16_MAX}, + .imm = { .min = 0, .max = 0}, + .eval = eval_jcc, + }, + /* call instruction */ + [(BPF_JMP | EBPF_CALL)] = { + .mask = { .dreg = ZERO_REG, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = UINT32_MAX}, + .eval = eval_call, + }, + /* 
ret instruction */ + [(BPF_JMP | EBPF_EXIT)] = { + .mask = { .dreg = ZERO_REG, .sreg = ZERO_REG}, + .off = { .min = 0, .max = 0}, + .imm = { .min = 0, .max = 0}, + .eval = eval_exit, + }, +}; + +/* + * make sure that instruction syntax is valid, + * and its fields don't violate particular instruction type restrictions. + */ +static const char * +check_syntax(const struct ebpf_insn *ins) +{ + + uint8_t op; + uint16_t off; + uint32_t imm; + + op = ins->code; + + if (ins_chk[op].mask.dreg == 0) + return "invalid opcode"; + + if ((ins_chk[op].mask.dreg & 1 << ins->dst_reg) == 0) + return "invalid dst-reg field"; + + if ((ins_chk[op].mask.sreg & 1 << ins->src_reg) == 0) + return "invalid src-reg field"; + + off = ins->off; + if (ins_chk[op].off.min > off || ins_chk[op].off.max < off) + return "invalid off field"; + + imm = ins->imm; + if (ins_chk[op].imm.min > imm || ins_chk[op].imm.max < imm) + return "invalid imm field"; + + if (ins_chk[op].check != NULL) + return ins_chk[op].check(ins); + + return NULL; +} + +/* + * helper function, return instruction index for the given node. + */ +static uint32_t +get_node_idx(const struct bpf_verifier *bvf, const struct inst_node *node) +{ + return node - bvf->in; +} + +/* + * helper function, used to walk through constructed CFG. + */ +static struct inst_node * +get_next_node(struct bpf_verifier *bvf, struct inst_node *node) +{ + uint32_t ce, ne, dst; + + ne = node->nb_edge; + ce = node->cur_edge; + if (ce == ne) + return NULL; + + node->cur_edge++; + dst = node->edge_dest[ce]; + return bvf->in + dst; +} + +static void +set_node_colour(struct bpf_verifier *bvf, struct inst_node *node, + uint32_t new) +{ + uint32_t prev; + + prev = node->colour; + node->colour = new; + + bvf->node_colour[prev]--; + bvf->node_colour[new]++; +} + +/* + * helper function, add new edge between two nodes. + */ +static int +add_edge(struct bpf_verifier *bvf, struct inst_node *node, uint32_t nidx) +{ + uint32_t ne; + + if (nidx > bvf->prm->nb_ins) { + RTE_BPF_LOG(ERR, "%s: program boundary violation at pc: %u, " + "next pc: %u\n", + __func__, get_node_idx(bvf, node), nidx); + return -EINVAL; + } + + ne = node->nb_edge; + if (ne >= RTE_DIM(node->edge_dest)) { + RTE_BPF_LOG(ERR, "%s: internal error at pc: %u\n", + __func__, get_node_idx(bvf, node)); + return -EINVAL; + } + + node->edge_dest[ne] = nidx; + node->nb_edge = ne + 1; + return 0; +} + +/* + * helper function, determine type of edge between two nodes. + */ +static void +set_edge_type(struct bpf_verifier *bvf, struct inst_node *node, + const struct inst_node *next) +{ + uint32_t ce, clr, type; + + ce = node->cur_edge - 1; + clr = next->colour; + + type = UNKNOWN_EDGE; + + if (clr == WHITE) + type = TREE_EDGE; + else if (clr == GREY) + type = BACK_EDGE; + else if (clr == BLACK) + /* + * in fact it could be either direct or cross edge, + * but for now, we don't need to distinguish between them. + */ + type = CROSS_EDGE; + + node->edge_type[ce] = type; + bvf->edge_type[type]++; +} + +static struct inst_node * +get_prev_node(struct bpf_verifier *bvf, struct inst_node *node) +{ + return bvf->in + node->prev_node; +} + +/* + * Depth-First Search (DFS) through previously constructed + * Control Flow Graph (CFG). + * Information collected along this path is used later + * to determine whether there are any loops and/or unreachable instructions.
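+ * (A BACK_EDGE recorded during the walk indicates a loop, and any node
+ * still WHITE afterwards is unreachable; validate() rejects the program
+ * in either case.)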
+ */ +static void +dfs(struct bpf_verifier *bvf) +{ + struct inst_node *next, *node; + + node = bvf->in; + while (node != NULL) { + + if (node->colour == WHITE) + set_node_colour(bvf, node, GREY); + + if (node->colour == GREY) { + + /* find next unprocessed child node */ + do { + next = get_next_node(bvf, node); + if (next == NULL) + break; + set_edge_type(bvf, node, next); + } while (next->colour != WHITE); + + if (next != NULL) { + /* proceed with next child */ + next->prev_node = get_node_idx(bvf, node); + node = next; + } else { + /* + * finished with current node and all its kids, + * proceed with parent + */ + set_node_colour(bvf, node, BLACK); + node->cur_edge = 0; + node = get_prev_node(bvf, node); + } + } else + node = NULL; + } +} + +/* + * report unreachable instructions. + */ +static void +log_unreachable(const struct bpf_verifier *bvf) +{ + uint32_t i; + struct inst_node *node; + const struct ebpf_insn *ins; + + for (i = 0; i != bvf->prm->nb_ins; i++) { + + node = bvf->in + i; + ins = bvf->prm->ins + i; + + if (node->colour == WHITE && + ins->code != (BPF_LD | BPF_IMM | EBPF_DW)) + RTE_BPF_LOG(ERR, "unreachable code at pc: %u;\n", i); + } +} + +/* + * report loops detected. + */ +static void +log_loop(const struct bpf_verifier *bvf) +{ + uint32_t i, j; + struct inst_node *node; + + for (i = 0; i != bvf->prm->nb_ins; i++) { + + node = bvf->in + i; + if (node->colour != BLACK) + continue; + + for (j = 0; j != node->nb_edge; j++) { + if (node->edge_type[j] == BACK_EDGE) + RTE_BPF_LOG(ERR, + "loop at pc:%u --> pc:%u;\n", + i, node->edge_dest[j]); + } + } +} + +/* + * First pass goes through all instructions in the set, checks that each + * instruction is a valid one (correct syntax, valid field values, etc.) + * and constructs control flow graph (CFG). + * Then depth-first search is performed over the constructed graph. + * Programs with unreachable instructions and/or loops will be rejected. + */ +static int +validate(struct bpf_verifier *bvf) +{ + int32_t rc; + uint32_t i; + struct inst_node *node; + const struct ebpf_insn *ins; + const char *err; + + rc = 0; + for (i = 0; i < bvf->prm->nb_ins; i++) { + + ins = bvf->prm->ins + i; + node = bvf->in + i; + + err = check_syntax(ins); + if (err != 0) { + RTE_BPF_LOG(ERR, "%s: %s at pc: %u\n", + __func__, err, i); + rc |= -EINVAL; + } + + /* + * construct CFG, jcc nodes have two outgoing edges, + * 'exit' nodes - none, all other nodes have exactly one + * outgoing edge.
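+ * (The 64-bit immediate load occupies two instruction slots, so its
+ * single outgoing edge points to pc + 2.)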
+ */ + switch (ins->code) { + case (BPF_JMP | EBPF_EXIT): + break; + case (BPF_JMP | BPF_JEQ | BPF_K): + case (BPF_JMP | EBPF_JNE | BPF_K): + case (BPF_JMP | BPF_JGT | BPF_K): + case (BPF_JMP | EBPF_JLT | BPF_K): + case (BPF_JMP | BPF_JGE | BPF_K): + case (BPF_JMP | EBPF_JLE | BPF_K): + case (BPF_JMP | EBPF_JSGT | BPF_K): + case (BPF_JMP | EBPF_JSLT | BPF_K): + case (BPF_JMP | EBPF_JSGE | BPF_K): + case (BPF_JMP | EBPF_JSLE | BPF_K): + case (BPF_JMP | BPF_JSET | BPF_K): + case (BPF_JMP | BPF_JEQ | BPF_X): + case (BPF_JMP | EBPF_JNE | BPF_X): + case (BPF_JMP | BPF_JGT | BPF_X): + case (BPF_JMP | EBPF_JLT | BPF_X): + case (BPF_JMP | BPF_JGE | BPF_X): + case (BPF_JMP | EBPF_JLE | BPF_X): + case (BPF_JMP | EBPF_JSGT | BPF_X): + case (BPF_JMP | EBPF_JSLT | BPF_X): + case (BPF_JMP | EBPF_JSGE | BPF_X): + case (BPF_JMP | EBPF_JSLE | BPF_X): + case (BPF_JMP | BPF_JSET | BPF_X): + rc |= add_edge(bvf, node, i + ins->off + 1); + rc |= add_edge(bvf, node, i + 1); + bvf->nb_jcc_nodes++; + break; + case (BPF_JMP | BPF_JA): + rc |= add_edge(bvf, node, i + ins->off + 1); + break; + /* load 64 bit immediate value */ + case (BPF_LD | BPF_IMM | EBPF_DW): + rc |= add_edge(bvf, node, i + 2); + i++; + break; + default: + rc |= add_edge(bvf, node, i + 1); + break; + } + + bvf->nb_nodes++; + bvf->node_colour[WHITE]++; + } + + if (rc != 0) + return rc; + + dfs(bvf); + + RTE_BPF_LOG(DEBUG, "%s(%p) stats:\n" + "nb_nodes=%u;\n" + "nb_jcc_nodes=%u;\n" + "node_color={[WHITE]=%u, [GREY]=%u,, [BLACK]=%u};\n" + "edge_type={[UNKNOWN]=%u, [TREE]=%u, [BACK]=%u, [CROSS]=%u};\n", + __func__, bvf, + bvf->nb_nodes, + bvf->nb_jcc_nodes, + bvf->node_colour[WHITE], bvf->node_colour[GREY], + bvf->node_colour[BLACK], + bvf->edge_type[UNKNOWN_EDGE], bvf->edge_type[TREE_EDGE], + bvf->edge_type[BACK_EDGE], bvf->edge_type[CROSS_EDGE]); + + if (bvf->node_colour[BLACK] != bvf->nb_nodes) { + RTE_BPF_LOG(ERR, "%s(%p) unreachable instructions;\n", + __func__, bvf); + log_unreachable(bvf); + return -EINVAL; + } + + if (bvf->node_colour[GREY] != 0 || bvf->node_colour[WHITE] != 0 || + bvf->edge_type[UNKNOWN_EDGE] != 0) { + RTE_BPF_LOG(ERR, "%s(%p) DFS internal error;\n", + __func__, bvf); + return -EINVAL; + } + + if (bvf->edge_type[BACK_EDGE] != 0) { + RTE_BPF_LOG(ERR, "%s(%p) loops detected;\n", + __func__, bvf); + log_loop(bvf); + return -EINVAL; + } + + return 0; +} + +/* + * helper functions get/free eval states. + */ +static struct bpf_eval_state * +pull_eval_state(struct bpf_verifier *bvf) +{ + uint32_t n; + + n = bvf->evst_pool.cur; + if (n == bvf->evst_pool.num) + return NULL; + + bvf->evst_pool.cur = n + 1; + return bvf->evst_pool.ent + n; +} + +static void +push_eval_state(struct bpf_verifier *bvf) +{ + bvf->evst_pool.cur--; +} + +static void +evst_pool_fini(struct bpf_verifier *bvf) +{ + bvf->evst = NULL; + free(bvf->evst_pool.ent); + memset(&bvf->evst_pool, 0, sizeof(bvf->evst_pool)); +} + +static int +evst_pool_init(struct bpf_verifier *bvf) +{ + uint32_t n; + + n = bvf->nb_jcc_nodes + 1; + + bvf->evst_pool.ent = calloc(n, sizeof(bvf->evst_pool.ent[0])); + if (bvf->evst_pool.ent == NULL) + return -ENOMEM; + + bvf->evst_pool.num = n; + bvf->evst_pool.cur = 0; + + bvf->evst = pull_eval_state(bvf); + return 0; +} + +/* + * Save current eval state. 
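+ * This is done for every node with more than one outgoing edge (i.e.
+ * conditional jumps), so that the not-yet-taken branch can later be
+ * evaluated starting from the same register/stack state.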
+ */ +static int +save_eval_state(struct bpf_verifier *bvf, struct inst_node *node) +{ + struct bpf_eval_state *st; + + /* get new eval_state for this node */ + st = pull_eval_state(bvf); + if (st == NULL) { + RTE_BPF_LOG(ERR, + "%s: internal error (out of space) at pc: %u\n", + __func__, get_node_idx(bvf, node)); + return -ENOMEM; + } + + /* make a copy of current state */ + memcpy(st, bvf->evst, sizeof(*st)); + + /* swap current state with new one */ + node->evst = bvf->evst; + bvf->evst = st; + + RTE_BPF_LOG(DEBUG, "%s(bvf=%p,node=%u) old/new states: %p/%p;\n", + __func__, bvf, get_node_idx(bvf, node), node->evst, bvf->evst); + + return 0; +} + +/* + * Restore previous eval state and mark current eval state as free. + */ +static void +restore_eval_state(struct bpf_verifier *bvf, struct inst_node *node) +{ + RTE_BPF_LOG(DEBUG, "%s(bvf=%p,node=%u) old/new states: %p/%p;\n", + __func__, bvf, get_node_idx(bvf, node), bvf->evst, node->evst); + + bvf->evst = node->evst; + node->evst = NULL; + push_eval_state(bvf); +} + +static void +log_eval_state(const struct bpf_verifier *bvf, const struct ebpf_insn *ins, + uint32_t pc, int32_t loglvl) +{ + const struct bpf_eval_state *st; + const struct bpf_reg_val *rv; + + rte_log(loglvl, rte_bpf_logtype, "%s(pc=%u):\n", __func__, pc); + + st = bvf->evst; + rv = st->rv + ins->dst_reg; + + rte_log(loglvl, rte_bpf_logtype, + "r%u={\n" + "\tv={type=%u, size=%zu},\n" + "\tmask=0x%" PRIx64 ",\n" + "\tu={min=0x%" PRIx64 ", max=0x%" PRIx64 "},\n" + "\ts={min=%" PRId64 ", max=%" PRId64 "},\n" + "};\n", + ins->dst_reg, + rv->v.type, rv->v.size, + rv->mask, + rv->u.min, rv->u.max, + rv->s.min, rv->s.max); +} + +/* + * Do second pass through CFG and try to evaluate instructions + * via each possible path. + * Right now evaluation functionality is quite limited. + * Still need to add extra checks for: + * - use/return uninitialized registers. + * - use uninitialized data from the stack. + * - memory boundaries violation. + */ +static int +evaluate(struct bpf_verifier *bvf) +{ + int32_t rc; + uint32_t idx, op; + const char *err; + const struct ebpf_insn *ins; + struct inst_node *next, *node; + + /* initial state of frame pointer */ + static const struct bpf_reg_val rvfp = { + .v = { + .type = RTE_BPF_ARG_PTR_STACK, + .size = MAX_BPF_STACK_SIZE, + }, + .mask = UINT64_MAX, + .u = {.min = MAX_BPF_STACK_SIZE, .max = MAX_BPF_STACK_SIZE}, + .s = {.min = MAX_BPF_STACK_SIZE, .max = MAX_BPF_STACK_SIZE}, + }; + + bvf->evst->rv[EBPF_REG_1].v = bvf->prm->prog_arg; + bvf->evst->rv[EBPF_REG_1].mask = UINT64_MAX; + if (bvf->prm->prog_arg.type == RTE_BPF_ARG_RAW) + eval_max_bound(bvf->evst->rv + EBPF_REG_1, UINT64_MAX); + + bvf->evst->rv[EBPF_REG_10] = rvfp; + + ins = bvf->prm->ins; + node = bvf->in; + next = node; + rc = 0; + + while (node != NULL && rc == 0) { + + /* + * current node evaluation, make sure we evaluate + * each node only once. 
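+ * ('next' is non-NULL only when the node has just been reached from its
+ * parent; when control returns to it from a child, 'next' is NULL and the
+ * instruction is not evaluated again on the way back up.)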
+ */ + if (next != NULL) { + + bvf->evin = node; + idx = get_node_idx(bvf, node); + op = ins[idx].code; + + /* for jcc node make a copy of evaluatoion state */ + if (node->nb_edge > 1) + rc |= save_eval_state(bvf, node); + + if (ins_chk[op].eval != NULL && rc == 0) { + err = ins_chk[op].eval(bvf, ins + idx); + if (err != NULL) { + RTE_BPF_LOG(ERR, "%s: %s at pc: %u\n", + __func__, err, idx); + rc = -EINVAL; + } + } + + log_eval_state(bvf, ins + idx, idx, RTE_LOG_DEBUG); + bvf->evin = NULL; + } + + /* proceed through CFG */ + next = get_next_node(bvf, node); + if (next != NULL) { + + /* proceed with next child */ + if (node->cur_edge == node->nb_edge && + node->evst != NULL) + restore_eval_state(bvf, node); + + next->prev_node = get_node_idx(bvf, node); + node = next; + } else { + /* + * finished with current node and all it's kids, + * proceed with parent + */ + node->cur_edge = 0; + node = get_prev_node(bvf, node); + + /* finished */ + if (node == bvf->in) + node = NULL; + } + } + + return rc; +} + +int +bpf_validate(struct rte_bpf *bpf) +{ + int32_t rc; + struct bpf_verifier bvf; + + /* check input argument type, don't allow mbuf ptr on 32-bit */ + if (bpf->prm.prog_arg.type != RTE_BPF_ARG_RAW && + bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR && + (sizeof(uint64_t) != sizeof(uintptr_t) || + bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR_MBUF)) { + RTE_BPF_LOG(ERR, "%s: unsupported argument type\n", __func__); + return -ENOTSUP; + } + + memset(&bvf, 0, sizeof(bvf)); + bvf.prm = &bpf->prm; + bvf.in = calloc(bpf->prm.nb_ins, sizeof(bvf.in[0])); + if (bvf.in == NULL) + return -ENOMEM; + + rc = validate(&bvf); + + if (rc == 0) { + rc = evst_pool_init(&bvf); + if (rc == 0) + rc = evaluate(&bvf); + evst_pool_fini(&bvf); + } + + free(bvf.in); + + /* copy collected info */ + if (rc == 0) + bpf->stack_sz = bvf.stack_sz; + + return rc; +} diff --git a/src/spdk/dpdk/lib/librte_bpf/meson.build b/src/spdk/dpdk/lib/librte_bpf/meson.build new file mode 100644 index 00000000..bc0cd78f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/meson.build @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +allow_experimental_apis = true +sources = files('bpf.c', + 'bpf_exec.c', + 'bpf_load.c', + 'bpf_pkt.c', + 'bpf_validate.c') + +if arch_subdir == 'x86' and cc.sizeof('void *') == 8 + sources += files('bpf_jit_x86.c') +endif + +install_headers = files('bpf_def.h', + 'rte_bpf.h', + 'rte_bpf_ethdev.h') + +deps += ['mbuf', 'net', 'ethdev'] + +dep = cc.find_library('elf', required: false) +if dep.found() == true and cc.has_header('libelf.h', dependencies: dep) + sources += files('bpf_load_elf.c') + ext_deps += dep +endif diff --git a/src/spdk/dpdk/lib/librte_bpf/rte_bpf.h b/src/spdk/dpdk/lib/librte_bpf/rte_bpf.h new file mode 100644 index 00000000..ad62ef2c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/rte_bpf.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _RTE_BPF_H_ +#define _RTE_BPF_H_ + +/** + * @file rte_bpf.h + * @b EXPERIMENTAL: this API may change without prior notice + * + * RTE BPF support. + * librte_bpf provides a framework to load and execute eBPF bytecode + * inside user-space dpdk based applications. + * It supports basic set of features from eBPF spec + * (https://www.kernel.org/doc/Documentation/networking/filter.txt). + */ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Possible types for function/BPF program arguments. 
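+ * Note that RTE_BPF_ARG_PTR also acts as a flag bit: any value with that
+ * bit set (RTE_BPF_ARG_PTR_MBUF, RTE_BPF_ARG_PTR_STACK) is treated as a
+ * pointer by RTE_BPF_ARG_PTR_TYPE().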
+ */ +enum rte_bpf_arg_type { + RTE_BPF_ARG_UNDEF, /**< undefined */ + RTE_BPF_ARG_RAW, /**< scalar value */ + RTE_BPF_ARG_PTR = 0x10, /**< pointer to data buffer */ + RTE_BPF_ARG_PTR_MBUF, /**< pointer to rte_mbuf */ + RTE_BPF_ARG_PTR_STACK, +}; + +/** + * function argument information + */ +struct rte_bpf_arg { + enum rte_bpf_arg_type type; + /** + * for ptr type - max size of data buffer it points to + * for raw type - the size (in bytes) of the value + */ + size_t size; + size_t buf_size; + /**< for mbuf ptr type, max size of rte_mbuf data buffer */ +}; + +/** + * determine is argument a pointer + */ +#define RTE_BPF_ARG_PTR_TYPE(x) ((x) & RTE_BPF_ARG_PTR) + +/** + * Possible types for external symbols. + */ +enum rte_bpf_xtype { + RTE_BPF_XTYPE_FUNC, /**< function */ + RTE_BPF_XTYPE_VAR, /**< variable */ + RTE_BPF_XTYPE_NUM +}; + +/** + * Definition for external symbols available in the BPF program. + */ +struct rte_bpf_xsym { + const char *name; /**< name */ + enum rte_bpf_xtype type; /**< type */ + union { + struct { + uint64_t (*val)(uint64_t, uint64_t, uint64_t, + uint64_t, uint64_t); + uint32_t nb_args; + struct rte_bpf_arg args[EBPF_FUNC_MAX_ARGS]; + /**< Function arguments descriptions. */ + struct rte_bpf_arg ret; /**< function return value. */ + } func; + struct { + void *val; /**< actual memory location */ + struct rte_bpf_arg desc; /**< type, size, etc. */ + } var; /**< external variable */ + }; +}; + +/** + * Input parameters for loading eBPF code. + */ +struct rte_bpf_prm { + const struct ebpf_insn *ins; /**< array of eBPF instructions */ + uint32_t nb_ins; /**< number of instructions in ins */ + const struct rte_bpf_xsym *xsym; + /**< array of external symbols that eBPF code is allowed to reference */ + uint32_t nb_xsym; /**< number of elements in xsym */ + struct rte_bpf_arg prog_arg; /**< eBPF program input arg description */ +}; + +/** + * Information about compiled into native ISA eBPF code. + */ +struct rte_bpf_jit { + uint64_t (*func)(void *); /**< JIT-ed native code */ + size_t sz; /**< size of JIT-ed code */ +}; + +struct rte_bpf; + +/** + * De-allocate all memory used by this eBPF execution context. + * + * @param bpf + * BPF handle to destroy. + */ +void __rte_experimental +rte_bpf_destroy(struct rte_bpf *bpf); + +/** + * Create a new eBPF execution context and load given BPF code into it. + * + * @param prm + * Parameters used to create and initialise the BPF exeution context. + * @return + * BPF handle that is used in future BPF operations, + * or NULL on error, with error code set in rte_errno. + * Possible rte_errno errors include: + * - EINVAL - invalid parameter passed to function + * - ENOMEM - can't reserve enough memory + */ +struct rte_bpf * __rte_experimental +rte_bpf_load(const struct rte_bpf_prm *prm); + +/** + * Create a new eBPF execution context and load BPF code from given ELF + * file into it. + * + * @param prm + * Parameters used to create and initialise the BPF exeution context. + * @param fname + * Pathname for a ELF file. + * @param sname + * Name of the executable section within the file to load. + * @return + * BPF handle that is used in future BPF operations, + * or NULL on error, with error code set in rte_errno. + * Possible rte_errno errors include: + * - EINVAL - invalid parameter passed to function + * - ENOMEM - can't reserve enough memory + */ +struct rte_bpf * __rte_experimental +rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname, + const char *sname); +/** + * Execute given BPF bytecode. 
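+ *
+ * A minimal usage sketch; the program parameters 'prm' and the input
+ * context 'pkt' are hypothetical and assumed to be prepared by the caller:
+ *
+ *   struct rte_bpf *bpf = rte_bpf_load(&prm);
+ *   if (bpf != NULL) {
+ *       uint64_t rc = rte_bpf_exec(bpf, pkt);
+ *       rte_bpf_destroy(bpf);
+ *   }
+ *
+ * (rc is whatever value the eBPF program returned.)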
+ * + * @param bpf + * handle for the BPF code to execute. + * @param ctx + * pointer to input context. + * @return + * BPF execution return value. + */ +uint64_t __rte_experimental +rte_bpf_exec(const struct rte_bpf *bpf, void *ctx); + +/** + * Execute given BPF bytecode over a set of input contexts. + * + * @param bpf + * handle for the BPF code to execute. + * @param ctx + * array of pointers to the input contexts. + * @param rc + * array of return values (one per input). + * @param num + * number of elements in ctx[] (and rc[]). + * @return + * number of successfully processed inputs. + */ +uint32_t __rte_experimental +rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[], + uint32_t num); + +/** + * Provide information about natively compield code for given BPF handle. + * + * @param bpf + * handle for the BPF code. + * @param jit + * pointer to the rte_bpf_jit structure to be filled with related data. + * @return + * - -EINVAL if the parameters are invalid. + * - Zero if operation completed successfully. + */ +int __rte_experimental +rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BPF_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bpf/rte_bpf_ethdev.h b/src/spdk/dpdk/lib/librte_bpf/rte_bpf_ethdev.h new file mode 100644 index 00000000..31731e7a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/rte_bpf_ethdev.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _RTE_BPF_ETHDEV_H_ +#define _RTE_BPF_ETHDEV_H_ + +/** + * @file rte_bpf_ethdev.h + * @b EXPERIMENTAL: this API may change without prior notice + * + * API to install BPF filter as RX/TX callbacks for eth devices. + * Note that right now: + * - it is not MT safe, i.e. it is not allowed to do load/unload for the + * same port/queue from different threads in parallel. + * - though it allows to do load/unload at runtime + * (while RX/TX is ongoing on given port/queue). + * - allows only one BPF program per port/queue, + * i.e. new load will replace previously loaded for that port/queue BPF program. + * Filter behaviour - if BPF program returns zero value for a given packet, + * then it will be dropped inside callback and no further processing + * on RX - it will be dropped inside callback and no further processing + * for that packet will happen. + * on TX - packet will remain unsent, and it is responsibility of the user + * to handle such situation (drop, try to send again, etc.). + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + RTE_BPF_ETH_F_NONE = 0, + RTE_BPF_ETH_F_JIT = 0x1, /*< use compiled into native ISA code */ +}; + +/** + * Unload previously loaded BPF program (if any) from given RX port/queue + * and remove appropriate RX port/queue callback. + * + * @param port + * The identifier of the ethernet port + * @param queue + * The identifier of the RX queue on the given port + */ +void __rte_experimental +rte_bpf_eth_rx_unload(uint16_t port, uint16_t queue); + +/** + * Unload previously loaded BPF program (if any) from given TX port/queue + * and remove appropriate TX port/queue callback. + * + * @param port + * The identifier of the ethernet port + * @param queue + * The identifier of the TX queue on the given port + */ +void __rte_experimental +rte_bpf_eth_tx_unload(uint16_t port, uint16_t queue); + +/** + * Load BPF program from the ELF file and install callback to execute it + * on given RX port/queue. 
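+ * Packets for which the installed program returns zero are dropped inside
+ * the RX callback (see the filter behaviour note at the top of this file).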
+ * + * @param port + * The identifier of the ethernet port + * @param queue + * The identifier of the RX queue on the given port + * @param fname + * Pathname for a ELF file. + * @param sname + * Name of the executable section within the file to load. + * @param prm + * Parameters used to create and initialise the BPF exeution context. + * @param flags + * Flags that define expected expected behavior of the loaded filter + * (i.e. jited/non-jited version to use). + * @return + * Zero on successful completion or negative error code otherwise. + */ +int __rte_experimental +rte_bpf_eth_rx_elf_load(uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags); + +/** + * Load BPF program from the ELF file and install callback to execute it + * on given TX port/queue. + * + * @param port + * The identifier of the ethernet port + * @param queue + * The identifier of the TX queue on the given port + * @param fname + * Pathname for a ELF file. + * @param sname + * Name of the executable section within the file to load. + * @param prm + * Parameters used to create and initialise the BPF exeution context. + * @param flags + * Flags that define expected expected behavior of the loaded filter + * (i.e. jited/non-jited version to use). + * @return + * Zero on successful completion or negative error code otherwise. + */ +int __rte_experimental +rte_bpf_eth_tx_elf_load(uint16_t port, uint16_t queue, + const struct rte_bpf_prm *prm, const char *fname, const char *sname, + uint32_t flags); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BPF_ETHDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_bpf/rte_bpf_version.map b/src/spdk/dpdk/lib/librte_bpf/rte_bpf_version.map new file mode 100644 index 00000000..a203e088 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_bpf/rte_bpf_version.map @@ -0,0 +1,16 @@ +EXPERIMENTAL { + global: + + rte_bpf_destroy; + rte_bpf_elf_load; + rte_bpf_eth_rx_elf_load; + rte_bpf_eth_rx_unload; + rte_bpf_eth_tx_elf_load; + rte_bpf_eth_tx_unload; + rte_bpf_exec; + rte_bpf_exec_burst; + rte_bpf_get_jit; + rte_bpf_load; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_cfgfile/Makefile b/src/spdk/dpdk/lib/librte_cfgfile/Makefile new file mode 100644 index 00000000..d9512565 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cfgfile/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_cfgfile.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -I$(SRCDIR)/../librte_eal/common/include + +EXPORT_MAP := rte_cfgfile_version.map + +LIBABIVER := 2 + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_CFGFILE) += rte_cfgfile.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_CFGFILE)-include += rte_cfgfile.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_cfgfile/meson.build b/src/spdk/dpdk/lib/librte_cfgfile/meson.build new file mode 100644 index 00000000..d97b38bc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cfgfile/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 2 +sources = files('rte_cfgfile.c') +headers = files('rte_cfgfile.h') diff --git a/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.c b/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.c new file mode 100644 index 00000000..7d8c941e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.c @@ -0,0 +1,555 @@ +/* 
SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include "rte_cfgfile.h" + +struct rte_cfgfile_section { + char name[CFG_NAME_LEN]; + int num_entries; + int allocated_entries; + struct rte_cfgfile_entry *entries; +}; + +struct rte_cfgfile { + int flags; + int num_sections; + int allocated_sections; + struct rte_cfgfile_section *sections; +}; + +/** when we resize a file structure, how many extra entries + * for new sections do we add in */ +#define CFG_ALLOC_SECTION_BATCH 8 +/** when we resize a section structure, how many extra entries + * for new entries do we add in */ +#define CFG_ALLOC_ENTRY_BATCH 16 + +/** + * Default cfgfile load parameters. + */ +static const struct rte_cfgfile_parameters default_cfgfile_params = { + .comment_character = CFG_DEFAULT_COMMENT_CHARACTER, +}; + +/** + * Defines the list of acceptable comment characters supported by this + * library. + */ +static const char valid_comment_chars[] = { + '!', + '#', + '%', + ';', + '@' +}; + +static unsigned +_strip(char *str, unsigned len) +{ + int newlen = len; + if (len == 0) + return 0; + + if (isspace(str[len-1])) { + /* strip trailing whitespace */ + while (newlen > 0 && isspace(str[newlen - 1])) + str[--newlen] = '\0'; + } + + if (isspace(str[0])) { + /* strip leading whitespace */ + int i, start = 1; + while (isspace(str[start]) && start < newlen) + start++ + ; /* do nothing */ + newlen -= start; + for (i = 0; i < newlen; i++) + str[i] = str[i+start]; + str[i] = '\0'; + } + return newlen; +} + +static struct rte_cfgfile_section * +_get_section(struct rte_cfgfile *cfg, const char *sectionname) +{ + int i; + + for (i = 0; i < cfg->num_sections; i++) { + if (strncmp(cfg->sections[i].name, sectionname, + sizeof(cfg->sections[0].name)) == 0) + return &cfg->sections[i]; + } + return NULL; +} + +static int +_add_entry(struct rte_cfgfile_section *section, const char *entryname, + const char *entryvalue) +{ + /* resize entry structure if we don't have room for more entries */ + if (section->num_entries == section->allocated_entries) { + struct rte_cfgfile_entry *n_entries = realloc( + section->entries, + sizeof(struct rte_cfgfile_entry) * + ((section->allocated_entries) + + CFG_ALLOC_ENTRY_BATCH)); + + if (n_entries == NULL) + return -ENOMEM; + + section->entries = n_entries; + section->allocated_entries += CFG_ALLOC_ENTRY_BATCH; + } + /* fill up entry fields with key name and value */ + struct rte_cfgfile_entry *curr_entry = + §ion->entries[section->num_entries]; + + snprintf(curr_entry->name, sizeof(curr_entry->name), "%s", entryname); + snprintf(curr_entry->value, + sizeof(curr_entry->value), "%s", entryvalue); + section->num_entries++; + + return 0; +} + +static int +rte_cfgfile_check_params(const struct rte_cfgfile_parameters *params) +{ + unsigned int valid_comment; + unsigned int i; + + if (!params) { + printf("Error - missing cfgfile parameters\n"); + return -EINVAL; + } + + valid_comment = 0; + for (i = 0; i < RTE_DIM(valid_comment_chars); i++) { + if (params->comment_character == valid_comment_chars[i]) { + valid_comment = 1; + break; + } + } + + if (valid_comment == 0) { + printf("Error - invalid comment characters %c\n", + params->comment_character); + return -ENOTSUP; + } + + return 0; +} + +struct rte_cfgfile * +rte_cfgfile_load(const char *filename, int flags) +{ + return rte_cfgfile_load_with_params(filename, flags, + &default_cfgfile_params); +} + +struct rte_cfgfile * +rte_cfgfile_load_with_params(const 
char *filename, int flags, + const struct rte_cfgfile_parameters *params) +{ + char buffer[CFG_NAME_LEN + CFG_VALUE_LEN + 4] = {0}; + int lineno = 0; + struct rte_cfgfile *cfg = NULL; + + if (rte_cfgfile_check_params(params)) + return NULL; + + FILE *f = fopen(filename, "r"); + if (f == NULL) + return NULL; + + cfg = rte_cfgfile_create(flags); + + while (fgets(buffer, sizeof(buffer), f) != NULL) { + char *pos = NULL; + size_t len = strnlen(buffer, sizeof(buffer)); + lineno++; + if ((len >= sizeof(buffer) - 1) && (buffer[len-1] != '\n')) { + printf("Error line %d - no \\n found on string. " + "Check if line too long\n", lineno); + goto error1; + } + /* skip parsing if comment character found */ + pos = memchr(buffer, params->comment_character, len); + if (pos != NULL && (*(pos-1) != '\\')) { + *pos = '\0'; + len = pos - buffer; + } + + len = _strip(buffer, len); + /* skip lines without useful content */ + if (buffer[0] != '[' && memchr(buffer, '=', len) == NULL) + continue; + + if (buffer[0] == '[') { + /* section heading line */ + char *end = memchr(buffer, ']', len); + if (end == NULL) { + printf("Error line %d - no terminating ']'" + "character found\n", lineno); + goto error1; + } + *end = '\0'; + _strip(&buffer[1], end - &buffer[1]); + + rte_cfgfile_add_section(cfg, &buffer[1]); + } else { + /* key and value line */ + char *split[2] = {NULL}; + + split[0] = buffer; + split[1] = memchr(buffer, '=', len); + if (split[1] == NULL) { + printf("Error line %d - no '='" + "character found\n", lineno); + goto error1; + } + *split[1] = '\0'; + split[1]++; + + _strip(split[0], strlen(split[0])); + _strip(split[1], strlen(split[1])); + char *end = memchr(split[1], '\\', strlen(split[1])); + + while (end != NULL) { + if (*(end+1) == params->comment_character) { + *end = '\0'; + strcat(split[1], end+1); + } else + end++; + end = memchr(end, '\\', strlen(end)); + } + + if (!(flags & CFG_FLAG_EMPTY_VALUES) && + (*split[1] == '\0')) { + printf("Error at line %d - cannot use empty " + "values\n", lineno); + goto error1; + } + + if (cfg->num_sections == 0) + goto error1; + + _add_entry(&cfg->sections[cfg->num_sections - 1], + split[0], split[1]); + } + } + fclose(f); + return cfg; +error1: + rte_cfgfile_close(cfg); + fclose(f); + return NULL; +} + +struct rte_cfgfile * +rte_cfgfile_create(int flags) +{ + int i; + struct rte_cfgfile *cfg = NULL; + + cfg = malloc(sizeof(*cfg)); + + if (cfg == NULL) + return NULL; + + cfg->flags = flags; + cfg->num_sections = 0; + + /* allocate first batch of sections and entries */ + cfg->sections = malloc(sizeof(struct rte_cfgfile_section) * + CFG_ALLOC_SECTION_BATCH); + + if (cfg->sections == NULL) + goto error1; + + cfg->allocated_sections = CFG_ALLOC_SECTION_BATCH; + + for (i = 0; i < CFG_ALLOC_SECTION_BATCH; i++) { + cfg->sections[i].entries = malloc(sizeof( + struct rte_cfgfile_entry) * CFG_ALLOC_ENTRY_BATCH); + + if (cfg->sections[i].entries == NULL) + goto error1; + + cfg->sections[i].num_entries = 0; + cfg->sections[i].allocated_entries = CFG_ALLOC_ENTRY_BATCH; + } + + if (flags & CFG_FLAG_GLOBAL_SECTION) + rte_cfgfile_add_section(cfg, "GLOBAL"); + + return cfg; +error1: + if (cfg->sections != NULL) { + for (i = 0; i < cfg->allocated_sections; i++) { + if (cfg->sections[i].entries != NULL) { + free(cfg->sections[i].entries); + cfg->sections[i].entries = NULL; + } + } + free(cfg->sections); + cfg->sections = NULL; + } + free(cfg); + return NULL; +} + +int +rte_cfgfile_add_section(struct rte_cfgfile *cfg, const char *sectionname) +{ + int i; + + if (cfg == NULL) + 
return -EINVAL; + + if (sectionname == NULL) + return -EINVAL; + + /* resize overall struct if we don't have room for more sections */ + if (cfg->num_sections == cfg->allocated_sections) { + + struct rte_cfgfile_section *n_sections = + realloc(cfg->sections, + sizeof(struct rte_cfgfile_section) * + ((cfg->allocated_sections) + + CFG_ALLOC_SECTION_BATCH)); + + if (n_sections == NULL) + return -ENOMEM; + + for (i = 0; i < CFG_ALLOC_SECTION_BATCH; i++) { + n_sections[i + cfg->allocated_sections].num_entries = 0; + n_sections[i + + cfg->allocated_sections].allocated_entries = 0; + n_sections[i + cfg->allocated_sections].entries = NULL; + } + cfg->sections = n_sections; + cfg->allocated_sections += CFG_ALLOC_SECTION_BATCH; + } + + snprintf(cfg->sections[cfg->num_sections].name, + sizeof(cfg->sections[0].name), "%s", sectionname); + cfg->sections[cfg->num_sections].num_entries = 0; + cfg->num_sections++; + + return 0; +} + +int rte_cfgfile_add_entry(struct rte_cfgfile *cfg, + const char *sectionname, const char *entryname, + const char *entryvalue) +{ + int ret; + + if ((cfg == NULL) || (sectionname == NULL) || (entryname == NULL) + || (entryvalue == NULL)) + return -EINVAL; + + if (rte_cfgfile_has_entry(cfg, sectionname, entryname) != 0) + return -EEXIST; + + /* search for section pointer by sectionname */ + struct rte_cfgfile_section *curr_section = _get_section(cfg, + sectionname); + if (curr_section == NULL) + return -EINVAL; + + ret = _add_entry(curr_section, entryname, entryvalue); + + return ret; +} + +int rte_cfgfile_set_entry(struct rte_cfgfile *cfg, const char *sectionname, + const char *entryname, const char *entryvalue) +{ + int i; + + if ((cfg == NULL) || (sectionname == NULL) || (entryname == NULL)) + return -EINVAL; + + /* search for section pointer by sectionname */ + struct rte_cfgfile_section *curr_section = _get_section(cfg, + sectionname); + if (curr_section == NULL) + return -EINVAL; + + if (entryvalue == NULL) + entryvalue = ""; + + for (i = 0; i < curr_section->num_entries; i++) + if (!strcmp(curr_section->entries[i].name, entryname)) { + snprintf(curr_section->entries[i].value, + sizeof(curr_section->entries[i].value), + "%s", entryvalue); + return 0; + } + printf("Error - entry name doesn't exist\n"); + return -EINVAL; +} + +int rte_cfgfile_save(struct rte_cfgfile *cfg, const char *filename) +{ + int i, j; + + if ((cfg == NULL) || (filename == NULL)) + return -EINVAL; + + FILE *f = fopen(filename, "w"); + + if (f == NULL) + return -EINVAL; + + for (i = 0; i < cfg->num_sections; i++) { + fprintf(f, "[%s]\n", cfg->sections[i].name); + + for (j = 0; j < cfg->sections[i].num_entries; j++) { + fprintf(f, "%s=%s\n", + cfg->sections[i].entries[j].name, + cfg->sections[i].entries[j].value); + } + } + return fclose(f); +} + +int rte_cfgfile_close(struct rte_cfgfile *cfg) +{ + int i; + + if (cfg == NULL) + return -1; + + if (cfg->sections != NULL) { + for (i = 0; i < cfg->allocated_sections; i++) { + if (cfg->sections[i].entries != NULL) { + free(cfg->sections[i].entries); + cfg->sections[i].entries = NULL; + } + } + free(cfg->sections); + cfg->sections = NULL; + } + free(cfg); + cfg = NULL; + + return 0; +} + +int +rte_cfgfile_num_sections(struct rte_cfgfile *cfg, const char *sectionname, +size_t length) +{ + int i; + int num_sections = 0; + for (i = 0; i < cfg->num_sections; i++) { + if (strncmp(cfg->sections[i].name, sectionname, length) == 0) + num_sections++; + } + return num_sections; +} + +int +rte_cfgfile_sections(struct rte_cfgfile *cfg, char *sections[], + int 
max_sections) +{ + int i; + + for (i = 0; i < cfg->num_sections && i < max_sections; i++) + snprintf(sections[i], CFG_NAME_LEN, "%s", + cfg->sections[i].name); + + return i; +} + +int +rte_cfgfile_has_section(struct rte_cfgfile *cfg, const char *sectionname) +{ + return _get_section(cfg, sectionname) != NULL; +} + +int +rte_cfgfile_section_num_entries(struct rte_cfgfile *cfg, + const char *sectionname) +{ + const struct rte_cfgfile_section *s = _get_section(cfg, sectionname); + if (s == NULL) + return -1; + return s->num_entries; +} + +int +rte_cfgfile_section_num_entries_by_index(struct rte_cfgfile *cfg, + char *sectionname, int index) +{ + if (index < 0 || index >= cfg->num_sections) + return -1; + + const struct rte_cfgfile_section *sect = &(cfg->sections[index]); + + snprintf(sectionname, CFG_NAME_LEN, "%s", sect->name); + return sect->num_entries; +} +int +rte_cfgfile_section_entries(struct rte_cfgfile *cfg, const char *sectionname, + struct rte_cfgfile_entry *entries, int max_entries) +{ + int i; + const struct rte_cfgfile_section *sect = _get_section(cfg, sectionname); + if (sect == NULL) + return -1; + for (i = 0; i < max_entries && i < sect->num_entries; i++) + entries[i] = sect->entries[i]; + return i; +} + +int +rte_cfgfile_section_entries_by_index(struct rte_cfgfile *cfg, int index, + char *sectionname, + struct rte_cfgfile_entry *entries, int max_entries) +{ + int i; + const struct rte_cfgfile_section *sect; + + if (index < 0 || index >= cfg->num_sections) + return -1; + sect = &cfg->sections[index]; + snprintf(sectionname, CFG_NAME_LEN, "%s", sect->name); + for (i = 0; i < max_entries && i < sect->num_entries; i++) + entries[i] = sect->entries[i]; + return i; +} + +const char * +rte_cfgfile_get_entry(struct rte_cfgfile *cfg, const char *sectionname, + const char *entryname) +{ + int i; + const struct rte_cfgfile_section *sect = _get_section(cfg, sectionname); + if (sect == NULL) + return NULL; + for (i = 0; i < sect->num_entries; i++) + if (strncmp(sect->entries[i].name, entryname, CFG_NAME_LEN) + == 0) + return sect->entries[i].value; + return NULL; +} + +int +rte_cfgfile_has_entry(struct rte_cfgfile *cfg, const char *sectionname, + const char *entryname) +{ + return rte_cfgfile_get_entry(cfg, sectionname, entryname) != NULL; +} diff --git a/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.h b/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.h new file mode 100644 index 00000000..b2030fa6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile.h @@ -0,0 +1,355 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_CFGFILE_H__ +#define __INCLUDE_RTE_CFGFILE_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** +* @file +* RTE Configuration File +* +* This library allows reading application defined parameters from standard +* format configuration file. 
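+*
+* The accepted format is INI-style. A minimal illustrative example (the
+* section and key names below are made up for this sketch):
+*
+* @code
+* ; comment lines start with the comment character (';' by default)
+* [port0]
+* promisc=1
+* mtu=1500
+* @endcode
+*
+* Sections are introduced by a bracketed name; entries are key=value pairs.
+* Values may be empty only when CFG_FLAG_EMPTY_VALUES is used.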
+* +***/ + +#ifndef CFG_NAME_LEN +#define CFG_NAME_LEN 64 +#endif + +#ifndef CFG_VALUE_LEN +#define CFG_VALUE_LEN 256 +#endif + +/** Configuration file */ +struct rte_cfgfile; + +/** Configuration file entry */ +struct rte_cfgfile_entry { + char name[CFG_NAME_LEN]; /**< Name */ + char value[CFG_VALUE_LEN]; /**< Value */ +}; + +/** Configuration file operation optional arguments */ +struct rte_cfgfile_parameters { + /** Config file comment character; one of '!', '#', '%', ';', '@' */ + char comment_character; +}; + +/**@{ cfgfile load operation flags */ +enum { + /** + * Indicates that the file supports key value entries before the first + * defined section. These entries can be accessed in the "GLOBAL" + * section. + */ + CFG_FLAG_GLOBAL_SECTION = 1, + + /** + * Indicates that file supports key value entries where the value can + * be zero length (e.g., "key="). + */ + CFG_FLAG_EMPTY_VALUES = 2, +}; +/**@} */ + +/** Defines the default comment character used for parsing config files. */ +#define CFG_DEFAULT_COMMENT_CHARACTER ';' + +/** +* Open config file +* +* @param filename +* Config file name +* @param flags +* Config file flags +* @return +* Handle to configuration file on success, NULL otherwise +*/ +struct rte_cfgfile *rte_cfgfile_load(const char *filename, int flags); + +/** + * Open config file with specified optional parameters. + * + * @param filename + * Config file name + * @param flags + * Config file flags + * @param params + * Additional configuration attributes. Must be configured with desired + * values prior to invoking this API. + * @return + * Handle to configuration file on success, NULL otherwise + */ +struct rte_cfgfile *rte_cfgfile_load_with_params(const char *filename, + int flags, const struct rte_cfgfile_parameters *params); + +/** + * Create new cfgfile instance with empty sections and entries + * + * @param flags + * - CFG_FLAG_GLOBAL_SECTION + * Indicates that the file supports key value entries before the first + * defined section. These entries can be accessed in the "GLOBAL" + * section. + * - CFG_FLAG_EMPTY_VALUES + * Indicates that file supports key value entries where the value can + * be zero length (e.g., "key="). + * @return + * Handle to cfgfile instance on success, NULL otherwise + */ +struct rte_cfgfile *rte_cfgfile_create(int flags); + +/** + * Add section in cfgfile instance. + * + * @param cfg + * Pointer to the cfgfile structure. + * @param sectionname + * Section name which will be add to cfgfile. + * @return + * 0 on success, -ENOMEM if can't add section + */ +int +rte_cfgfile_add_section(struct rte_cfgfile *cfg, const char *sectionname); + +/** + * Add entry to specified section in cfgfile instance. + * + * @param cfg + * Pointer to the cfgfile structure. + * @param sectionname + * Given section name to add an entry. + * @param entryname + * Entry name to add. + * @param entryvalue + * Entry value to add. + * @return + * 0 on success, -EEXIST if entry already exist, -EINVAL if bad argument + */ +int rte_cfgfile_add_entry(struct rte_cfgfile *cfg, + const char *sectionname, const char *entryname, + const char *entryvalue); + +/** + * Update value of specified entry name in given section in config file + * + * @param cfg + * Config file + * @param sectionname + * Section name + * @param entryname + * Entry name to look for the value change + * @param entryvalue + * New entry value. 
Can be also an empty string if CFG_FLAG_EMPTY_VALUES = 1 + * @return + * 0 on success, -EINVAL if bad argument + */ +int rte_cfgfile_set_entry(struct rte_cfgfile *cfg, const char *sectionname, + const char *entryname, const char *entryvalue); + +/** + * Save object cfgfile to file on disc + * + * @param cfg + * Config file structure + * @param filename + * File name to save data + * @return + * 0 on success, errno otherwise + */ +int rte_cfgfile_save(struct rte_cfgfile *cfg, const char *filename); + +/** +* Get number of sections in config file +* +* @param cfg +* Config file +* @param sec_name +* Section name +* @param length +* Maximum section name length +* @return +* Number of sections +*/ +int rte_cfgfile_num_sections(struct rte_cfgfile *cfg, const char *sec_name, + size_t length); + +/** +* Get name of all config file sections. +* +* Fills in the array sections with the name of all the sections in the file +* (up to the number of max_sections sections). +* +* @param cfg +* Config file +* @param sections +* Array containing section names after successful invocation. Each element +* of this array should be preallocated by the user with at least +* CFG_NAME_LEN characters. +* @param max_sections +* Maximum number of section names to be stored in sections array +* @return +* Number of populated sections names +*/ +int rte_cfgfile_sections(struct rte_cfgfile *cfg, char *sections[], + int max_sections); + +/** +* Check if given section exists in config file +* +* @param cfg +* Config file +* @param sectionname +* Section name +* @return +* TRUE (value different than 0) if section exists, FALSE (value 0) otherwise +*/ +int rte_cfgfile_has_section(struct rte_cfgfile *cfg, const char *sectionname); + +/** +* Get number of entries in given config file section +* +* If multiple sections have the given name this function operates on the +* first one. +* +* @param cfg +* Config file +* @param sectionname +* Section name +* @return +* Number of entries in section on success, -1 otherwise +*/ +int rte_cfgfile_section_num_entries(struct rte_cfgfile *cfg, + const char *sectionname); + +/** +* Get number of entries in given config file section +* +* The index of a section is the same as the index of its name in the +* result of rte_cfgfile_sections. This API can be used when there are +* multiple sections with the same name. +* +* @param cfg +* Config file +* @param sectionname +* Section name +* @param index +* Section index +* @return +* Number of entries in section on success, -1 otherwise +*/ +int rte_cfgfile_section_num_entries_by_index(struct rte_cfgfile *cfg, + char *sectionname, + int index); + +/** +* Get section entries as key-value pairs +* +* If multiple sections have the given name this function operates on the +* first one. +* +* @param cfg +* Config file +* @param sectionname +* Section name +* @param entries +* Pre-allocated array of at least max_entries entries where the section +* entries are stored as key-value pair after successful invocation +* @param max_entries +* Maximum number of section entries to be stored in entries array +* @return +* Number of entries populated on success, -1 otherwise +*/ +int rte_cfgfile_section_entries(struct rte_cfgfile *cfg, + const char *sectionname, + struct rte_cfgfile_entry *entries, + int max_entries); + +/** +* Get section entries as key-value pairs +* +* The index of a section is the same as the index of its name in the +* result of rte_cfgfile_sections. This API can be used when there are +* multiple sections with the same name. 
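+*
+* A minimal usage sketch (illustrative only; cfg is assumed to be an
+* already-loaded handle and the entry array size of 64 is an arbitrary
+* assumption):
+*
+* @code
+* char name[CFG_NAME_LEN];
+* struct rte_cfgfile_entry entries[64];
+* int j, n;
+*
+* n = rte_cfgfile_section_entries_by_index(cfg, 0, name, entries, 64);
+* for (j = 0; j < n; j++)
+*     printf("[%s] %s=%s\n", name, entries[j].name, entries[j].value);
+* @endcode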
+* +* @param cfg +* Config file +* @param index +* Section index +* @param sectionname +* Pre-allocated string of at least CFG_NAME_LEN characters where the +* section name is stored after successful invocation. +* @param entries +* Pre-allocated array of at least max_entries entries where the section +* entries are stored as key-value pair after successful invocation +* @param max_entries +* Maximum number of section entries to be stored in entries array +* @return +* Number of entries populated on success, -1 otherwise +*/ +int rte_cfgfile_section_entries_by_index(struct rte_cfgfile *cfg, + int index, + char *sectionname, + struct rte_cfgfile_entry *entries, + int max_entries); + +/** +* Get value of the named entry in named config file section +* +* If multiple sections have the given name this function operates on the +* first one. +* +* @param cfg +* Config file +* @param sectionname +* Section name +* @param entryname +* Entry name +* @return +* Entry value on success, NULL otherwise +*/ +const char *rte_cfgfile_get_entry(struct rte_cfgfile *cfg, + const char *sectionname, + const char *entryname); + +/** +* Check if given entry exists in named config file section +* +* If multiple sections have the given name this function operates on the +* first one. +* +* @param cfg +* Config file +* @param sectionname +* Section name +* @param entryname +* Entry name +* @return +* TRUE (value different than 0) if entry exists, FALSE (value 0) otherwise +*/ +int rte_cfgfile_has_entry(struct rte_cfgfile *cfg, const char *sectionname, + const char *entryname); + +/** +* Close config file +* +* @param cfg +* Config file +* @return +* 0 on success, -1 otherwise +*/ +int rte_cfgfile_close(struct rte_cfgfile *cfg); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile_version.map b/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile_version.map new file mode 100644 index 00000000..cc4a11f6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cfgfile/rte_cfgfile_version.map @@ -0,0 +1,40 @@ +DPDK_2.0 { + global: + + rte_cfgfile_close; + rte_cfgfile_get_entry; + rte_cfgfile_has_entry; + rte_cfgfile_has_section; + rte_cfgfile_load; + rte_cfgfile_num_sections; + rte_cfgfile_section_entries; + rte_cfgfile_section_num_entries; + rte_cfgfile_sections; + + local: *; +}; + +DPDK_16.04 { + global: + + rte_cfgfile_section_entries_by_index; + +} DPDK_2.0; + +DPDK_17.05 { + global: + + rte_cfgfile_load_with_params; + +} DPDK_16.04; + +DPDK_17.11 { + global: + + rte_cfgfile_add_entry; + rte_cfgfile_add_section; + rte_cfgfile_create; + rte_cfgfile_save; + rte_cfgfile_set_entry; + +} DPDK_17.05; diff --git a/src/spdk/dpdk/lib/librte_cmdline/Makefile b/src/spdk/dpdk/lib/librte_cmdline/Makefile new file mode 100644 index 00000000..ddae1cfd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/Makefile @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_cmdline.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 + +EXPORT_MAP := rte_cmdline_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) := cmdline.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_cirbuf.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_etheraddr.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_ipaddr.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_num.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += 
cmdline_parse_string.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_rdline.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_vt100.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_socket.c +SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_portlist.c + +CFLAGS += -D_GNU_SOURCE +LDLIBS += -lrte_eal + +# install includes +INCS := cmdline.h cmdline_parse.h cmdline_parse_num.h cmdline_parse_ipaddr.h +INCS += cmdline_parse_etheraddr.h cmdline_parse_string.h cmdline_rdline.h +INCS += cmdline_vt100.h cmdline_socket.h cmdline_cirbuf.h cmdline_parse_portlist.h +SYMLINK-$(CONFIG_RTE_LIBRTE_CMDLINE)-include := $(INCS) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline.c new file mode 100644 index 00000000..591b78b0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline.c @@ -0,0 +1,254 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cmdline_parse.h" +#include "cmdline_rdline.h" +#include "cmdline.h" + +static void +cmdline_valid_buffer(struct rdline *rdl, const char *buf, + __attribute__((unused)) unsigned int size) +{ + struct cmdline *cl = rdl->opaque; + int ret; + ret = cmdline_parse(cl, buf); + if (ret == CMDLINE_PARSE_AMBIGUOUS) + cmdline_printf(cl, "Ambiguous command\n"); + else if (ret == CMDLINE_PARSE_NOMATCH) + cmdline_printf(cl, "Command not found\n"); + else if (ret == CMDLINE_PARSE_BAD_ARGS) + cmdline_printf(cl, "Bad arguments\n"); +} + +static int +cmdline_complete_buffer(struct rdline *rdl, const char *buf, + char *dstbuf, unsigned int dstsize, + int *state) +{ + struct cmdline *cl = rdl->opaque; + return cmdline_complete(cl, buf, state, dstbuf, dstsize); +} + +int +cmdline_write_char(struct rdline *rdl, char c) +{ + int ret = -1; + struct cmdline *cl; + + if (!rdl) + return -1; + + cl = rdl->opaque; + + if (cl->s_out >= 0) + ret = write(cl->s_out, &c, 1); + + return ret; +} + + +void +cmdline_set_prompt(struct cmdline *cl, const char *prompt) +{ + if (!cl || !prompt) + return; + snprintf(cl->prompt, sizeof(cl->prompt), "%s", prompt); +} + +struct cmdline * +cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out) +{ + struct cmdline *cl; + int ret; + + if (!ctx || !prompt) + return NULL; + + cl = malloc(sizeof(struct cmdline)); + if (cl == NULL) + return NULL; + memset(cl, 0, sizeof(struct cmdline)); + cl->s_in = s_in; + cl->s_out = s_out; + cl->ctx = ctx; + + ret = rdline_init(&cl->rdl, cmdline_write_char, cmdline_valid_buffer, + cmdline_complete_buffer); + if (ret != 0) { + free(cl); + return NULL; + } + + cl->rdl.opaque = cl; + cmdline_set_prompt(cl, prompt); + rdline_newline(&cl->rdl, cl->prompt); + + return cl; +} + +void +cmdline_free(struct cmdline *cl) +{ + dprintf("called\n"); + + if (!cl) + return; + + if (cl->s_in > 2) + close(cl->s_in); + if (cl->s_out != cl->s_in && cl->s_out > 2) + close(cl->s_out); + free(cl); +} + +void +cmdline_printf(const struct cmdline *cl, const char *fmt, ...) 
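+/*
+ * Formatted write to the command line output descriptor. With _GNU_SOURCE
+ * this is a thin wrapper around vdprintf(); otherwise the message is
+ * formatted into a temporary BUFSIZ-sized buffer and pushed out with
+ * write().
+ */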
+{ + va_list ap; + + if (!cl || !fmt) + return; + +#ifdef _GNU_SOURCE + if (cl->s_out < 0) + return; + va_start(ap, fmt); + vdprintf(cl->s_out, fmt, ap); + va_end(ap); +#else + int ret; + char *buf; + + if (cl->s_out < 0) + return; + + buf = malloc(BUFSIZ); + if (buf == NULL) + return; + va_start(ap, fmt); + ret = vsnprintf(buf, BUFSIZ, fmt, ap); + va_end(ap); + if (ret < 0) { + free(buf); + return; + } + if (ret >= BUFSIZ) + ret = BUFSIZ - 1; + ret = write(cl->s_out, buf, ret); + (void)ret; + free(buf); +#endif +} + +int +cmdline_in(struct cmdline *cl, const char *buf, int size) +{ + const char *history, *buffer; + size_t histlen, buflen; + int ret = 0; + int i, same; + + if (!cl || !buf) + return -1; + + for (i=0; irdl, buf[i]); + + if (ret == RDLINE_RES_VALIDATED) { + buffer = rdline_get_buffer(&cl->rdl); + history = rdline_get_history_item(&cl->rdl, 0); + if (history) { + histlen = strnlen(history, RDLINE_BUF_SIZE); + same = !memcmp(buffer, history, histlen) && + buffer[histlen] == '\n'; + } + else + same = 0; + buflen = strnlen(buffer, RDLINE_BUF_SIZE); + if (buflen > 1 && !same) + rdline_add_history(&cl->rdl, buffer); + rdline_newline(&cl->rdl, cl->prompt); + } + else if (ret == RDLINE_RES_EOF) + return -1; + else if (ret == RDLINE_RES_EXITED) + return -1; + } + return i; +} + +void +cmdline_quit(struct cmdline *cl) +{ + if (!cl) + return; + rdline_quit(&cl->rdl); +} + +int +cmdline_poll(struct cmdline *cl) +{ + struct pollfd pfd; + int status; + ssize_t read_status; + char c; + + if (!cl) + return -EINVAL; + else if (cl->rdl.status == RDLINE_EXITED) + return RDLINE_EXITED; + + pfd.fd = cl->s_in; + pfd.events = POLLIN; + pfd.revents = 0; + + status = poll(&pfd, 1, 0); + if (status < 0) + return status; + else if (status > 0) { + c = -1; + read_status = read(cl->s_in, &c, 1); + if (read_status < 0) + return read_status; + + status = cmdline_in(cl, &c, 1); + if (status < 0 && cl->rdl.status != RDLINE_EXITED) + return status; + } + + return cl->rdl.status; +} + +void +cmdline_interact(struct cmdline *cl) +{ + char c; + + if (!cl) + return; + + c = -1; + while (1) { + if (read(cl->s_in, &c, 1) <= 0) + break; + if (cmdline_in(cl, &c, 1) < 0) + break; + } +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline.h new file mode 100644 index 00000000..27d2effd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#ifndef _CMDLINE_H_ +#define _CMDLINE_H_ + +#include +#include +#include + +/** + * @file + * + * Command line API + */ + +#ifdef __cplusplus +extern "C" { +#endif + +struct cmdline { + int s_in; + int s_out; + cmdline_parse_ctx_t *ctx; + struct rdline rdl; + char prompt[RDLINE_PROMPT_SIZE]; + struct termios oldterm; +}; + +struct cmdline *cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out); +void cmdline_set_prompt(struct cmdline *cl, const char *prompt); +void cmdline_free(struct cmdline *cl); +void cmdline_printf(const struct cmdline *cl, const char *fmt, ...) + __attribute__((format(printf,2,3))); +int cmdline_in(struct cmdline *cl, const char *buf, int size); +int cmdline_write_char(struct rdline *rdl, char c); + +/** + * This function is nonblocking equivalent of ``cmdline_interact()``. It polls + * *cl* for one character and interpret it. If return value is *RDLINE_EXITED* + * it mean that ``cmdline_quit()`` was invoked. 
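+ *
+ * A minimal polling-loop sketch (illustrative; ``do_other_work()`` is a
+ * placeholder for application code):
+ *
+ * @code
+ * int status;
+ * do {
+ *     do_other_work();
+ *     status = cmdline_poll(cl);
+ * } while (status >= 0 && status != RDLINE_EXITED);
+ * @endcode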
+ * + * @param cl + * The command line object. + * + * @return + * On success return object status - one of *enum rdline_status*. + * On error return negative value. + */ +int cmdline_poll(struct cmdline *cl); + +void cmdline_interact(struct cmdline *cl); +void cmdline_quit(struct cmdline *cl); + +#ifdef __cplusplus +} +#endif + +#endif /* _CMDLINE_SOCKET_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.c new file mode 100644 index 00000000..829a8af5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.c @@ -0,0 +1,412 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include + +#include "cmdline_cirbuf.h" + + +int +cirbuf_init(struct cirbuf *cbuf, char *buf, unsigned int start, unsigned int maxlen) +{ + if (!cbuf || !buf) + return -EINVAL; + cbuf->maxlen = maxlen; + cbuf->len = 0; + cbuf->start = start; + cbuf->end = start; + cbuf->buf = buf; + return 0; +} + +/* multiple add */ + +int +cirbuf_add_buf_head(struct cirbuf *cbuf, const char *c, unsigned int n) +{ + unsigned int e; + + if (!cbuf || !c || !n || n > CIRBUF_GET_FREELEN(cbuf)) + return -EINVAL; + + e = CIRBUF_IS_EMPTY(cbuf) ? 1 : 0; + + if (n < cbuf->start + e) { + dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->start - n + e, n); + memcpy(cbuf->buf + cbuf->start - n + e, c, n); + } + else { + dprintf("s[%d] -> d[%d] (%d)\n", + n - (cbuf->start + e), 0, + cbuf->start + e); + dprintf("s[%d] -> d[%d] (%d)\n", cbuf->maxlen - n + + (cbuf->start + e), 0, n - (cbuf->start + e)); + memcpy(cbuf->buf, c + n - (cbuf->start + e) , cbuf->start + e); + memcpy(cbuf->buf + cbuf->maxlen - n + (cbuf->start + e), c, + n - (cbuf->start + e)); + } + cbuf->len += n; + cbuf->start += (cbuf->maxlen - n + e); + cbuf->start %= cbuf->maxlen; + return n; +} + +/* multiple add */ + +int +cirbuf_add_buf_tail(struct cirbuf *cbuf, const char *c, unsigned int n) +{ + unsigned int e; + + if (!cbuf || !c || !n || n > CIRBUF_GET_FREELEN(cbuf)) + return -EINVAL; + + e = CIRBUF_IS_EMPTY(cbuf) ? 
1 : 0; + + if (n < cbuf->maxlen - cbuf->end - 1 + e) { + dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->end + !e, n); + memcpy(cbuf->buf + cbuf->end + !e, c, n); + } + else { + dprintf("s[%d] -> d[%d] (%d)\n", cbuf->end + !e, 0, + cbuf->maxlen - cbuf->end - 1 + e); + dprintf("s[%d] -> d[%d] (%d)\n", cbuf->maxlen - cbuf->end - 1 + + e, 0, n - cbuf->maxlen + cbuf->end + 1 - e); + memcpy(cbuf->buf + cbuf->end + !e, c, cbuf->maxlen - + cbuf->end - 1 + e); + memcpy(cbuf->buf, c + cbuf->maxlen - cbuf->end - 1 + e, + n - cbuf->maxlen + cbuf->end + 1 - e); + } + cbuf->len += n; + cbuf->end += n - e; + cbuf->end %= cbuf->maxlen; + return n; +} + +/* add at head */ + +static inline void +__cirbuf_add_head(struct cirbuf * cbuf, char c) +{ + if (!CIRBUF_IS_EMPTY(cbuf)) { + cbuf->start += (cbuf->maxlen - 1); + cbuf->start %= cbuf->maxlen; + } + cbuf->buf[cbuf->start] = c; + cbuf->len ++; +} + +int +cirbuf_add_head_safe(struct cirbuf * cbuf, char c) +{ + if (cbuf && !CIRBUF_IS_FULL(cbuf)) { + __cirbuf_add_head(cbuf, c); + return 0; + } + return -EINVAL; +} + +void +cirbuf_add_head(struct cirbuf * cbuf, char c) +{ + __cirbuf_add_head(cbuf, c); +} + +/* add at tail */ + +static inline void +__cirbuf_add_tail(struct cirbuf * cbuf, char c) +{ + if (!CIRBUF_IS_EMPTY(cbuf)) { + cbuf->end ++; + cbuf->end %= cbuf->maxlen; + } + cbuf->buf[cbuf->end] = c; + cbuf->len ++; +} + +int +cirbuf_add_tail_safe(struct cirbuf * cbuf, char c) +{ + if (cbuf && !CIRBUF_IS_FULL(cbuf)) { + __cirbuf_add_tail(cbuf, c); + return 0; + } + return -EINVAL; +} + +void +cirbuf_add_tail(struct cirbuf * cbuf, char c) +{ + __cirbuf_add_tail(cbuf, c); +} + + +static inline void +__cirbuf_shift_left(struct cirbuf *cbuf) +{ + unsigned int i; + char tmp = cbuf->buf[cbuf->start]; + + for (i=0 ; ilen ; i++) { + cbuf->buf[(cbuf->start+i)%cbuf->maxlen] = + cbuf->buf[(cbuf->start+i+1)%cbuf->maxlen]; + } + cbuf->buf[(cbuf->start-1+cbuf->maxlen)%cbuf->maxlen] = tmp; + cbuf->start += (cbuf->maxlen - 1); + cbuf->start %= cbuf->maxlen; + cbuf->end += (cbuf->maxlen - 1); + cbuf->end %= cbuf->maxlen; +} + +static inline void +__cirbuf_shift_right(struct cirbuf *cbuf) +{ + unsigned int i; + char tmp = cbuf->buf[cbuf->end]; + + for (i=0 ; ilen ; i++) { + cbuf->buf[(cbuf->end+cbuf->maxlen-i)%cbuf->maxlen] = + cbuf->buf[(cbuf->end+cbuf->maxlen-i-1)%cbuf->maxlen]; + } + cbuf->buf[(cbuf->end+1)%cbuf->maxlen] = tmp; + cbuf->start += 1; + cbuf->start %= cbuf->maxlen; + cbuf->end += 1; + cbuf->end %= cbuf->maxlen; +} + +/* XXX we could do a better algorithm here... */ +int +cirbuf_align_left(struct cirbuf * cbuf) +{ + if (!cbuf) + return -EINVAL; + + if (cbuf->start < cbuf->maxlen/2) { + while (cbuf->start != 0) { + __cirbuf_shift_left(cbuf); + } + } + else { + while (cbuf->start != 0) { + __cirbuf_shift_right(cbuf); + } + } + + return 0; +} + +/* XXX we could do a better algorithm here... 
*/ +int +cirbuf_align_right(struct cirbuf * cbuf) +{ + if (!cbuf) + return -EINVAL; + + if (cbuf->start >= cbuf->maxlen/2) { + while (cbuf->end != cbuf->maxlen-1) { + __cirbuf_shift_left(cbuf); + } + } + else { + while (cbuf->start != cbuf->maxlen-1) { + __cirbuf_shift_right(cbuf); + } + } + + return 0; +} + +/* buffer del */ + +int +cirbuf_del_buf_head(struct cirbuf *cbuf, unsigned int size) +{ + if (!cbuf || !size || size > CIRBUF_GET_LEN(cbuf)) + return -EINVAL; + + cbuf->len -= size; + if (CIRBUF_IS_EMPTY(cbuf)) { + cbuf->start += size - 1; + cbuf->start %= cbuf->maxlen; + } + else { + cbuf->start += size; + cbuf->start %= cbuf->maxlen; + } + return 0; +} + +/* buffer del */ + +int +cirbuf_del_buf_tail(struct cirbuf *cbuf, unsigned int size) +{ + if (!cbuf || !size || size > CIRBUF_GET_LEN(cbuf)) + return -EINVAL; + + cbuf->len -= size; + if (CIRBUF_IS_EMPTY(cbuf)) { + cbuf->end += (cbuf->maxlen - size + 1); + cbuf->end %= cbuf->maxlen; + } + else { + cbuf->end += (cbuf->maxlen - size); + cbuf->end %= cbuf->maxlen; + } + return 0; +} + +/* del at head */ + +static inline void +__cirbuf_del_head(struct cirbuf * cbuf) +{ + cbuf->len --; + if (!CIRBUF_IS_EMPTY(cbuf)) { + cbuf->start ++; + cbuf->start %= cbuf->maxlen; + } +} + +int +cirbuf_del_head_safe(struct cirbuf * cbuf) +{ + if (cbuf && !CIRBUF_IS_EMPTY(cbuf)) { + __cirbuf_del_head(cbuf); + return 0; + } + return -EINVAL; +} + +void +cirbuf_del_head(struct cirbuf * cbuf) +{ + __cirbuf_del_head(cbuf); +} + +/* del at tail */ + +static inline void +__cirbuf_del_tail(struct cirbuf * cbuf) +{ + cbuf->len --; + if (!CIRBUF_IS_EMPTY(cbuf)) { + cbuf->end += (cbuf->maxlen - 1); + cbuf->end %= cbuf->maxlen; + } +} + +int +cirbuf_del_tail_safe(struct cirbuf * cbuf) +{ + if (cbuf && !CIRBUF_IS_EMPTY(cbuf)) { + __cirbuf_del_tail(cbuf); + return 0; + } + return -EINVAL; +} + +void +cirbuf_del_tail(struct cirbuf * cbuf) +{ + __cirbuf_del_tail(cbuf); +} + +/* convert to buffer */ + +int +cirbuf_get_buf_head(struct cirbuf *cbuf, char *c, unsigned int size) +{ + unsigned int n; + + if (!cbuf || !c) + return -EINVAL; + + n = (size < CIRBUF_GET_LEN(cbuf)) ? size : CIRBUF_GET_LEN(cbuf); + + if (!n) + return 0; + + if (cbuf->start <= cbuf->end) { + dprintf("s[%d] -> d[%d] (%d)\n", cbuf->start, 0, n); + memcpy(c, cbuf->buf + cbuf->start , n); + } + else { + /* check if we need to go from end to the beginning */ + if (n <= cbuf->maxlen - cbuf->start) { + dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->start, n); + memcpy(c, cbuf->buf + cbuf->start , n); + } + else { + dprintf("s[%d] -> d[%d] (%d)\n", cbuf->start, 0, + cbuf->maxlen - cbuf->start); + dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->maxlen - cbuf->start, + n - cbuf->maxlen + cbuf->start); + memcpy(c, cbuf->buf + cbuf->start , cbuf->maxlen - cbuf->start); + memcpy(c + cbuf->maxlen - cbuf->start, cbuf->buf, + n - cbuf->maxlen + cbuf->start); + } + } + return n; +} + +/* convert to buffer */ + +int +cirbuf_get_buf_tail(struct cirbuf *cbuf, char *c, unsigned int size) +{ + unsigned int n; + + if (!cbuf || !c) + return -EINVAL; + + n = (size < CIRBUF_GET_LEN(cbuf)) ? 
size : CIRBUF_GET_LEN(cbuf); + + if (!n) + return 0; + + if (cbuf->start <= cbuf->end) { + dprintf("s[%d] -> d[%d] (%d)\n", cbuf->end - n + 1, 0, n); + memcpy(c, cbuf->buf + cbuf->end - n + 1, n); + } + else { + /* check if we need to go from end to the beginning */ + if (n <= cbuf->end + 1) { + dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->end - n + 1, n); + memcpy(c, cbuf->buf + cbuf->end - n + 1, n); + } + else { + dprintf("s[%d] -> d[%d] (%d)\n", 0, + cbuf->maxlen - cbuf->start, cbuf->end + 1); + dprintf("s[%d] -> d[%d] (%d)\n", + cbuf->maxlen - n + cbuf->end + 1, 0, n - cbuf->end - 1); + memcpy(c + cbuf->maxlen - cbuf->start, + cbuf->buf, cbuf->end + 1); + memcpy(c, cbuf->buf + cbuf->maxlen - n + cbuf->end +1, + n - cbuf->end - 1); + } + } + return n; +} + +/* get head or get tail */ + +char +cirbuf_get_head(struct cirbuf * cbuf) +{ + return cbuf->buf[cbuf->start]; +} + +/* get head or get tail */ + +char +cirbuf_get_tail(struct cirbuf * cbuf) +{ + return cbuf->buf[cbuf->end]; +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.h new file mode 100644 index 00000000..c23b211a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_cirbuf.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#ifndef _CIRBUF_H_ +#define _CIRBUF_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * This structure is the header of a cirbuf type. + */ +struct cirbuf { + unsigned int maxlen; /**< total len of the fifo (number of elements) */ + unsigned int start; /**< indice of the first elt */ + unsigned int end; /**< indice of the last elt */ + unsigned int len; /**< current len of fifo */ + char *buf; +}; + +#ifdef RTE_LIBRTE_CMDLINE_DEBUG +#define dprintf_(fmt, ...) printf("line %3.3d - " fmt "%.0s", __LINE__, __VA_ARGS__) +#define dprintf(...) dprintf_(__VA_ARGS__, "dummy") +#else +#define dprintf(...) (void)0 +#endif + + +/** + * Init the circular buffer + */ +int cirbuf_init(struct cirbuf *cbuf, char *buf, unsigned int start, unsigned int maxlen); + + +/** + * Return 1 if the circular buffer is full + */ +#define CIRBUF_IS_FULL(cirbuf) ((cirbuf)->maxlen == (cirbuf)->len) + +/** + * Return 1 if the circular buffer is empty + */ +#define CIRBUF_IS_EMPTY(cirbuf) ((cirbuf)->len == 0) + +/** + * return current size of the circular buffer (number of used elements) + */ +#define CIRBUF_GET_LEN(cirbuf) ((cirbuf)->len) + +/** + * return size of the circular buffer (used + free elements) + */ +#define CIRBUF_GET_MAXLEN(cirbuf) ((cirbuf)->maxlen) + +/** + * return the number of free elts + */ +#define CIRBUF_GET_FREELEN(cirbuf) ((cirbuf)->maxlen - (cirbuf)->len) + +/** + * Iterator for a circular buffer + * c: struct cirbuf pointer + * i: an integer type internally used in the macro + * e: char that takes the value for each iteration + */ +#define CIRBUF_FOREACH(c, i, e) \ + for ( i=0, e=(c)->buf[(c)->start] ; \ + i<((c)->len) ; \ + i ++, e=(c)->buf[((c)->start+i)%((c)->maxlen)]) + + +/** + * Add a character at head of the circular buffer. Return 0 on success, or + * a negative value on error. + */ +int cirbuf_add_head_safe(struct cirbuf *cbuf, char c); + +/** + * Add a character at head of the circular buffer. You _must_ check that you + * have enough free space in the buffer before calling this func. 
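+ *
+ * A minimal sketch of the expected guard, mirroring the check that
+ * cirbuf_add_head_safe() performs (minus its NULL test):
+ *
+ * @code
+ * if (!CIRBUF_IS_FULL(cbuf))
+ *     cirbuf_add_head(cbuf, c);
+ * @endcode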
+ */ +void cirbuf_add_head(struct cirbuf *cbuf, char c); + +/** + * Add a character at tail of the circular buffer. Return 0 on success, or + * a negative value on error. + */ +int cirbuf_add_tail_safe(struct cirbuf *cbuf, char c); + +/** + * Add a character at tail of the circular buffer. You _must_ check that you + * have enough free space in the buffer before calling this func. + */ +void cirbuf_add_tail(struct cirbuf *cbuf, char c); + +/** + * Remove a char at the head of the circular buffer. Return 0 on + * success, or a negative value on error. + */ +int cirbuf_del_head_safe(struct cirbuf *cbuf); + +/** + * Remove a char at the head of the circular buffer. You _must_ check + * that buffer is not empty before calling the function. + */ +void cirbuf_del_head(struct cirbuf *cbuf); + +/** + * Remove a char at the tail of the circular buffer. Return 0 on + * success, or a negative value on error. + */ +int cirbuf_del_tail_safe(struct cirbuf *cbuf); + +/** + * Remove a char at the tail of the circular buffer. You _must_ check + * that buffer is not empty before calling the function. + */ +void cirbuf_del_tail(struct cirbuf *cbuf); + +/** + * Return the head of the circular buffer. You _must_ check that + * buffer is not empty before calling the function. + */ +char cirbuf_get_head(struct cirbuf *cbuf); + +/** + * Return the tail of the circular buffer. You _must_ check that + * buffer is not empty before calling the function. + */ +char cirbuf_get_tail(struct cirbuf *cbuf); + +/** + * Add a buffer at head of the circular buffer. 'c' is a pointer to a + * buffer, and n is the number of char to add. Return the number of + * copied bytes on success, or a negative value on error. + */ +int cirbuf_add_buf_head(struct cirbuf *cbuf, const char *c, unsigned int n); + +/** + * Add a buffer at tail of the circular buffer. 'c' is a pointer to a + * buffer, and n is the number of char to add. Return the number of + * copied bytes on success, or a negative value on error. + */ +int cirbuf_add_buf_tail(struct cirbuf *cbuf, const char *c, unsigned int n); + +/** + * Remove chars at the head of the circular buffer. Return 0 on + * success, or a negative value on error. + */ +int cirbuf_del_buf_head(struct cirbuf *cbuf, unsigned int size); + +/** + * Remove chars at the tail of the circular buffer. Return 0 on + * success, or a negative value on error. + */ +int cirbuf_del_buf_tail(struct cirbuf *cbuf, unsigned int size); + +/** + * Copy a maximum of 'size' characters from the head of the circular + * buffer to a flat one pointed by 'c'. Return the number of copied + * chars. + */ +int cirbuf_get_buf_head(struct cirbuf *cbuf, char *c, unsigned int size); + +/** + * Copy a maximum of 'size' characters from the tail of the circular + * buffer to a flat one pointed by 'c'. Return the number of copied + * chars. + */ +int cirbuf_get_buf_tail(struct cirbuf *cbuf, char *c, unsigned int size); + + +/** + * Set the start of the data to the index 0 of the internal buffer. + */ +int cirbuf_align_left(struct cirbuf *cbuf); + +/** + * Set the end of the data to the last index of the internal buffer. 
+ */ +int cirbuf_align_right(struct cirbuf *cbuf); + +#ifdef __cplusplus +} +#endif + +#endif /* _CIRBUF_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.c new file mode 100644 index 00000000..9666e90c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.c @@ -0,0 +1,532 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "cmdline_rdline.h" +#include "cmdline_parse.h" +#include "cmdline.h" + +#ifdef RTE_LIBRTE_CMDLINE_DEBUG +#define debug_printf printf +#else +#define debug_printf(args...) do {} while(0) +#endif + +#define CMDLINE_BUFFER_SIZE 64 + +/* isblank() needs _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE, so use our + * own. */ +static int +isblank2(char c) +{ + if (c == ' ' || + c == '\t' ) + return 1; + return 0; +} + +static int +isendofline(char c) +{ + if (c == '\n' || + c == '\r' ) + return 1; + return 0; +} + +static int +iscomment(char c) +{ + if (c == '#') + return 1; + return 0; +} + +int +cmdline_isendoftoken(char c) +{ + if (!c || iscomment(c) || isblank2(c) || isendofline(c)) + return 1; + return 0; +} + +int +cmdline_isendofcommand(char c) +{ + if (!c || iscomment(c) || isendofline(c)) + return 1; + return 0; +} + +static unsigned int +nb_common_chars(const char * s1, const char * s2) +{ + unsigned int i=0; + + while (*s1==*s2 && *s1) { + s1++; + s2++; + i++; + } + return i; +} + +/** Retrieve either static or dynamic token at a given index. */ +static cmdline_parse_token_hdr_t * +get_token(cmdline_parse_inst_t *inst, unsigned int index) +{ + cmdline_parse_token_hdr_t *token_p; + + /* check presence of static tokens first */ + if (inst->tokens[0] || !inst->f) + return inst->tokens[index]; + /* generate dynamic token */ + token_p = NULL; + inst->f(&token_p, NULL, &inst->tokens[index]); + return token_p; +} + +/** + * try to match the buffer with an instruction (only the first + * nb_match_token tokens if != 0). Return 0 if we match all the + * tokens, else the number of matched tokens, else -1. 
+ */ +static int +match_inst(cmdline_parse_inst_t *inst, const char *buf, + unsigned int nb_match_token, void *resbuf, unsigned resbuf_size) +{ + cmdline_parse_token_hdr_t *token_p = NULL; + unsigned int i=0; + int n = 0; + struct cmdline_token_hdr token_hdr; + + if (resbuf != NULL) + memset(resbuf, 0, resbuf_size); + /* check if we match all tokens of inst */ + while (!nb_match_token || i < nb_match_token) { + token_p = get_token(inst, i); + if (!token_p) + break; + memcpy(&token_hdr, token_p, sizeof(token_hdr)); + + debug_printf("TK\n"); + /* skip spaces */ + while (isblank2(*buf)) { + buf++; + } + + /* end of buf */ + if ( isendofline(*buf) || iscomment(*buf) ) + break; + + if (resbuf == NULL) { + n = token_hdr.ops->parse(token_p, buf, NULL, 0); + } else { + unsigned rb_sz; + + if (token_hdr.offset > resbuf_size) { + printf("Parse error(%s:%d): Token offset(%u) " + "exceeds maximum size(%u)\n", + __FILE__, __LINE__, + token_hdr.offset, resbuf_size); + return -ENOBUFS; + } + rb_sz = resbuf_size - token_hdr.offset; + + n = token_hdr.ops->parse(token_p, buf, (char *)resbuf + + token_hdr.offset, rb_sz); + } + + if (n < 0) + break; + + debug_printf("TK parsed (len=%d)\n", n); + i++; + buf += n; + } + + /* does not match */ + if (i==0) + return -1; + + /* in case we want to match a specific num of token */ + if (nb_match_token) { + if (i == nb_match_token) { + return 0; + } + return i; + } + + /* we don't match all the tokens */ + if (token_p) { + return i; + } + + /* are there are some tokens more */ + while (isblank2(*buf)) { + buf++; + } + + /* end of buf */ + if ( isendofline(*buf) || iscomment(*buf) ) + return 0; + + /* garbage after inst */ + return i; +} + + +int +cmdline_parse(struct cmdline *cl, const char * buf) +{ + unsigned int inst_num=0; + cmdline_parse_inst_t *inst; + const char *curbuf; + union { + char buf[CMDLINE_PARSE_RESULT_BUFSIZE]; + long double align; /* strong alignment constraint for buf */ + } result, tmp_result; + void (*f)(void *, struct cmdline *, void *) = NULL; + void *data = NULL; + int comment = 0; + int linelen = 0; + int parse_it = 0; + int err = CMDLINE_PARSE_NOMATCH; + int tok; + cmdline_parse_ctx_t *ctx; + char *result_buf = result.buf; + + if (!cl || !buf) + return CMDLINE_PARSE_BAD_ARGS; + + ctx = cl->ctx; + + /* + * - look if the buffer contains at least one line + * - look if line contains only spaces or comments + * - count line length + */ + curbuf = buf; + while (! isendofline(*curbuf)) { + if ( *curbuf == '\0' ) { + debug_printf("Incomplete buf (len=%d)\n", linelen); + return 0; + } + if ( iscomment(*curbuf) ) { + comment = 1; + } + if ( ! isblank2(*curbuf) && ! comment) { + parse_it = 1; + } + curbuf++; + linelen++; + } + + /* skip all endofline chars */ + while (isendofline(buf[linelen])) { + linelen++; + } + + /* empty line */ + if ( parse_it == 0 ) { + debug_printf("Empty line (len=%d)\n", linelen); + return linelen; + } + + debug_printf("Parse line : len=%d, <%.*s>\n", + linelen, linelen > 64 ? 64 : linelen, buf); + + /* parse it !! 
*/ + inst = ctx[inst_num]; + while (inst) { + debug_printf("INST %d\n", inst_num); + + /* fully parsed */ + tok = match_inst(inst, buf, 0, result_buf, + CMDLINE_PARSE_RESULT_BUFSIZE); + + if (tok > 0) /* we matched at least one token */ + err = CMDLINE_PARSE_BAD_ARGS; + + else if (!tok) { + debug_printf("INST fully parsed\n"); + /* skip spaces */ + while (isblank2(*curbuf)) { + curbuf++; + } + + /* if end of buf -> there is no garbage after inst */ + if (isendofline(*curbuf) || iscomment(*curbuf)) { + if (!f) { + memcpy(&f, &inst->f, sizeof(f)); + memcpy(&data, &inst->data, sizeof(data)); + result_buf = tmp_result.buf; + } + else { + /* more than 1 inst matches */ + err = CMDLINE_PARSE_AMBIGUOUS; + f=NULL; + debug_printf("Ambiguous cmd\n"); + break; + } + } + } + + inst_num ++; + inst = ctx[inst_num]; + } + + /* call func */ + if (f) { + f(result.buf, cl, data); + } + + /* no match */ + else { + debug_printf("No match err=%d\n", err); + return err; + } + + return linelen; +} + +int +cmdline_complete(struct cmdline *cl, const char *buf, int *state, + char *dst, unsigned int size) +{ + const char *partial_tok = buf; + unsigned int inst_num = 0; + cmdline_parse_inst_t *inst; + cmdline_parse_token_hdr_t *token_p; + struct cmdline_token_hdr token_hdr; + char tmpbuf[CMDLINE_BUFFER_SIZE], comp_buf[CMDLINE_BUFFER_SIZE]; + unsigned int partial_tok_len; + int comp_len = -1; + int tmp_len = -1; + int nb_token = 0; + unsigned int i, n; + int l; + unsigned int nb_completable; + unsigned int nb_non_completable; + int local_state = 0; + const char *help_str; + cmdline_parse_ctx_t *ctx; + + if (!cl || !buf || !state || !dst) + return -1; + + ctx = cl->ctx; + + debug_printf("%s called\n", __func__); + memset(&token_hdr, 0, sizeof(token_hdr)); + + /* count the number of complete token to parse */ + for (i=0 ; buf[i] ; i++) { + if (!isblank2(buf[i]) && isblank2(buf[i+1])) + nb_token++; + if (isblank2(buf[i]) && !isblank2(buf[i+1])) + partial_tok = buf+i+1; + } + partial_tok_len = strnlen(partial_tok, RDLINE_BUF_SIZE); + + /* first call -> do a first pass */ + if (*state <= 0) { + debug_printf("try complete <%s>\n", buf); + debug_printf("there is %d complete tokens, <%s> is incomplete\n", + nb_token, partial_tok); + + nb_completable = 0; + nb_non_completable = 0; + + inst = ctx[inst_num]; + while (inst) { + /* parse the first tokens of the inst */ + if (nb_token && + match_inst(inst, buf, nb_token, NULL, 0)) + goto next; + + debug_printf("instruction match\n"); + token_p = get_token(inst, nb_token); + if (token_p) + memcpy(&token_hdr, token_p, sizeof(token_hdr)); + + /* non completable */ + if (!token_p || + !token_hdr.ops->complete_get_nb || + !token_hdr.ops->complete_get_elt || + (n = token_hdr.ops->complete_get_nb(token_p)) == 0) { + nb_non_completable++; + goto next; + } + + debug_printf("%d choices for this token\n", n); + for (i=0 ; icomplete_get_elt(token_p, i, + tmpbuf, + sizeof(tmpbuf)) < 0) + continue; + + /* we have at least room for one char */ + tmp_len = strnlen(tmpbuf, sizeof(tmpbuf)); + if (tmp_len < CMDLINE_BUFFER_SIZE - 1) { + tmpbuf[tmp_len] = ' '; + tmpbuf[tmp_len+1] = 0; + } + + debug_printf(" choice <%s>\n", tmpbuf); + + /* does the completion match the + * beginning of the word ? 
*/ + if (!strncmp(partial_tok, tmpbuf, + partial_tok_len)) { + if (comp_len == -1) { + snprintf(comp_buf, sizeof(comp_buf), + "%s", tmpbuf + partial_tok_len); + comp_len = + strnlen(tmpbuf + partial_tok_len, + sizeof(tmpbuf) - partial_tok_len); + + } + else { + comp_len = + nb_common_chars(comp_buf, + tmpbuf+partial_tok_len); + comp_buf[comp_len] = 0; + } + nb_completable++; + } + } + next: + debug_printf("next\n"); + inst_num ++; + inst = ctx[inst_num]; + } + + debug_printf("total choices %d for this completion\n", + nb_completable); + + /* no possible completion */ + if (nb_completable == 0 && nb_non_completable == 0) + return 0; + + /* if multichoice is not required */ + if (*state == 0 && partial_tok_len > 0) { + /* one or several choices starting with the + same chars */ + if (comp_len > 0) { + if ((unsigned)(comp_len + 1) > size) + return 0; + + strlcpy(dst, comp_buf, size); + dst[comp_len] = 0; + return 2; + } + } + } + + /* init state correctly */ + if (*state == -1) + *state = 0; + + debug_printf("Multiple choice STATE=%d\n", *state); + + inst_num = 0; + inst = ctx[inst_num]; + while (inst) { + /* we need to redo it */ + inst = ctx[inst_num]; + + if (nb_token && + match_inst(inst, buf, nb_token, NULL, 0)) + goto next2; + + token_p = get_token(inst, nb_token); + if (token_p) + memcpy(&token_hdr, token_p, sizeof(token_hdr)); + + /* one choice for this token */ + if (!token_p || + !token_hdr.ops->complete_get_nb || + !token_hdr.ops->complete_get_elt || + (n = token_hdr.ops->complete_get_nb(token_p)) == 0) { + if (local_state < *state) { + local_state++; + goto next2; + } + (*state)++; + if (token_p && token_hdr.ops->get_help) { + token_hdr.ops->get_help(token_p, tmpbuf, + sizeof(tmpbuf)); + help_str = inst->help_str; + if (help_str) + snprintf(dst, size, "[%s]: %s", tmpbuf, + help_str); + else + snprintf(dst, size, "[%s]: No help", + tmpbuf); + } + else { + snprintf(dst, size, "[RETURN]"); + } + return 1; + } + + /* several choices */ + for (i=0 ; icomplete_get_elt(token_p, i, tmpbuf, + sizeof(tmpbuf)) < 0) + continue; + /* we have at least room for one char */ + tmp_len = strnlen(tmpbuf, sizeof(tmpbuf)); + if (tmp_len < CMDLINE_BUFFER_SIZE - 1) { + tmpbuf[tmp_len] = ' '; + tmpbuf[tmp_len + 1] = 0; + } + + debug_printf(" choice <%s>\n", tmpbuf); + + /* does the completion match the beginning of + * the word ? */ + if (!strncmp(partial_tok, tmpbuf, + partial_tok_len)) { + if (local_state < *state) { + local_state++; + continue; + } + (*state)++; + l=strlcpy(dst, tmpbuf, size); + if (l>=0 && token_hdr.ops->get_help) { + token_hdr.ops->get_help(token_p, tmpbuf, + sizeof(tmpbuf)); + help_str = inst->help_str; + if (help_str) + snprintf(dst+l, size-l, "[%s]: %s", + tmpbuf, help_str); + else + snprintf(dst+l, size-l, + "[%s]: No help", tmpbuf); + } + + return 1; + } + } + next2: + inst_num ++; + inst = ctx[inst_num]; + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.h new file mode 100644 index 00000000..e4d802ff --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. 
+ */ + +#ifndef _CMDLINE_PARSE_H_ +#define _CMDLINE_PARSE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef offsetof +#define offsetof(type, field) ((size_t) &( ((type *)0)->field) ) +#endif + +/* return status for parsing */ +#define CMDLINE_PARSE_SUCCESS 0 +#define CMDLINE_PARSE_AMBIGUOUS -1 +#define CMDLINE_PARSE_NOMATCH -2 +#define CMDLINE_PARSE_BAD_ARGS -3 + +/* return status for completion */ +#define CMDLINE_PARSE_COMPLETE_FINISHED 0 +#define CMDLINE_PARSE_COMPLETE_AGAIN 1 +#define CMDLINE_PARSE_COMPLETED_BUFFER 2 + +/* maximum buffer size for parsed result */ +#define CMDLINE_PARSE_RESULT_BUFSIZE 8192 + +/** + * Stores a pointer to the ops struct, and the offset: the place to + * write the parsed result in the destination structure. + */ +struct cmdline_token_hdr { + struct cmdline_token_ops *ops; + unsigned int offset; +}; +typedef struct cmdline_token_hdr cmdline_parse_token_hdr_t; + +/** + * A token is defined by this structure. + * + * parse() takes the token as first argument, then the source buffer + * starting at the token we want to parse. The 3rd arg is a pointer + * where we store the parsed data (as binary). It returns the number of + * parsed chars on success and a negative value on error. + * + * complete_get_nb() returns the number of possible values for this + * token if completion is possible. If it is NULL or if it returns 0, + * no completion is possible. + * + * complete_get_elt() copy in dstbuf (the size is specified in the + * parameter) the i-th possible completion for this token. returns 0 + * on success or and a negative value on error. + * + * get_help() fills the dstbuf with the help for the token. It returns + * -1 on error and 0 on success. + */ +struct cmdline_token_ops { + /** parse(token ptr, buf, res pts, buf len) */ + int (*parse)(cmdline_parse_token_hdr_t *, const char *, void *, + unsigned int); + /** return the num of possible choices for this token */ + int (*complete_get_nb)(cmdline_parse_token_hdr_t *); + /** return the elt x for this token (token, idx, dstbuf, size) */ + int (*complete_get_elt)(cmdline_parse_token_hdr_t *, int, char *, + unsigned int); + /** get help for this token (token, dstbuf, size) */ + int (*get_help)(cmdline_parse_token_hdr_t *, char *, unsigned int); +}; + +struct cmdline; +/** + * Store a instruction, which is a pointer to a callback function and + * its parameter that is called when the instruction is parsed, a help + * string, and a list of token composing this instruction. + * + * When no tokens are defined (tokens[0] == NULL), they are retrieved + * dynamically by calling f() as follows: + * + * @code + * + * f((struct cmdline_token_hdr **)&token_p, + * NULL, + * (struct cmdline_token_hdr **)&inst->tokens[num]); + * + * @endcode + * + * The address of the resulting token is expected at the location pointed by + * the first argument. Can be set to NULL to end the list. + * + * The cmdline argument (struct cmdline *) is always NULL. + * + * The last argument points to the inst->tokens[] entry to retrieve, which + * is not necessarily inside allocated memory and should neither be read nor + * written. Its sole purpose is to deduce the token entry index of interest + * as described in the example below. + * + * Note about constraints: + * + * - Only the address of these tokens is dynamic, their storage should be + * static like normal tokens. + * - Dynamic token lists that need to maintain an internal context (e.g. in + * order to determine the next token) must store it statically also. 
This + * context must be reinitialized when the first token is requested, that + * is, when &inst->tokens[0] is provided as the third argument. + * - Dynamic token lists must be NULL-terminated to generate usable + * commands. + * + * @code + * + * // Assuming first and third arguments are respectively named "token_p" + * // and "token": + * + * int index = token - inst->tokens; + * + * if (!index) { + * [...] // Clean up internal context if any. + * } + * [...] // Then set up dyn_token according to index. + * + * if (no_more_tokens) + * *token_p = NULL; + * else + * *token_p = &dyn_token; + * + * @endcode + */ +struct cmdline_inst { + /* f(parsed_struct, data) */ + void (*f)(void *, struct cmdline *, void *); + void *data; + const char *help_str; + cmdline_parse_token_hdr_t *tokens[]; +}; +typedef struct cmdline_inst cmdline_parse_inst_t; + +/** + * A context is identified by its name, and contains a list of + * instruction + * + */ +typedef cmdline_parse_inst_t *cmdline_parse_ctx_t; + +/** + * Try to parse a buffer according to the specified context. The + * argument buf must ends with "\n\0". The function returns + * CMDLINE_PARSE_AMBIGUOUS, CMDLINE_PARSE_NOMATCH or + * CMDLINE_PARSE_BAD_ARGS on error. Else it calls the associated + * function (defined in the context) and returns 0 + * (CMDLINE_PARSE_SUCCESS). + */ +int cmdline_parse(struct cmdline *cl, const char *buf); + +/** + * complete() must be called with *state==0 (try to complete) or + * with *state==-1 (just display choices), then called without + * modifying *state until it returns CMDLINE_PARSE_COMPLETED_BUFFER or + * CMDLINE_PARSE_COMPLETED_BUFFER. + * + * It returns < 0 on error. + * + * Else it returns: + * - CMDLINE_PARSE_COMPLETED_BUFFER on completion (one possible + * choice). In this case, the chars are appended in dst buffer. + * - CMDLINE_PARSE_COMPLETE_AGAIN if there is several possible + * choices. In this case, you must call the function again, + * keeping the value of state intact. + * - CMDLINE_PARSE_COMPLETED_BUFFER when the iteration is + * finished. The dst is not valid for this last call. + * + * The returned dst buf ends with \0. + */ +int cmdline_complete(struct cmdline *cl, const char *buf, int *state, + char *dst, unsigned int size); + + +/* return true if(!c || iscomment(c) || isblank(c) || + * isendofline(c)) */ +int cmdline_isendoftoken(char c); + +/* return true if(!c || iscomment(c) || isendofline(c)) */ +int cmdline_isendofcommand(char c); + +#ifdef __cplusplus +} +#endif + +#endif /* _CMDLINE_PARSE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_etheraddr.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_etheraddr.c new file mode 100644 index 00000000..24e04755 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_etheraddr.c @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cmdline_parse.h" +#include "cmdline_parse_etheraddr.h" + +struct cmdline_token_ops cmdline_token_etheraddr_ops = { + .parse = cmdline_parse_etheraddr, + .complete_get_nb = NULL, + .complete_get_elt = NULL, + .get_help = cmdline_get_help_etheraddr, +}; + +/* the format can be either XX:XX:XX:XX:XX:XX or XXXX:XXXX:XXXX */ +#define ETHER_ADDRSTRLENLONG 18 +#define ETHER_ADDRSTRLENSHORT 15 + +#ifdef __linux__ +#define ea_oct ether_addr_octet +#else +#define ea_oct octet +#endif + + +static struct ether_addr * +my_ether_aton(const char *a) +{ + int i; + char *end; + unsigned long o[ETHER_ADDR_LEN]; + static struct ether_addr ether_addr; + + i = 0; + do { + errno = 0; + o[i] = strtoul(a, &end, 16); + if (errno != 0 || end == a || (end[0] != ':' && end[0] != 0)) + return NULL; + a = end + 1; + } while (++i != sizeof (o) / sizeof (o[0]) && end[0] != 0); + + /* Junk at the end of line */ + if (end[0] != 0) + return NULL; + + /* Support the format XX:XX:XX:XX:XX:XX */ + if (i == ETHER_ADDR_LEN) { + while (i-- != 0) { + if (o[i] > UINT8_MAX) + return NULL; + ether_addr.ea_oct[i] = (uint8_t)o[i]; + } + /* Support the format XXXX:XXXX:XXXX */ + } else if (i == ETHER_ADDR_LEN / 2) { + while (i-- != 0) { + if (o[i] > UINT16_MAX) + return NULL; + ether_addr.ea_oct[i * 2] = (uint8_t)(o[i] >> 8); + ether_addr.ea_oct[i * 2 + 1] = (uint8_t)(o[i] & 0xff); + } + /* unknown format */ + } else + return NULL; + + return (struct ether_addr *)ðer_addr; +} + +int +cmdline_parse_etheraddr(__attribute__((unused)) cmdline_parse_token_hdr_t *tk, + const char *buf, void *res, unsigned ressize) +{ + unsigned int token_len = 0; + char ether_str[ETHER_ADDRSTRLENLONG+1]; + struct ether_addr *tmp; + + if (res && ressize < sizeof(struct ether_addr)) + return -1; + + if (!buf || ! *buf) + return -1; + + while (!cmdline_isendoftoken(buf[token_len])) + token_len++; + + /* if token doesn't match possible string lengths... */ + if ((token_len != ETHER_ADDRSTRLENLONG - 1) && + (token_len != ETHER_ADDRSTRLENSHORT - 1)) + return -1; + + strlcpy(ether_str, buf, token_len + 1); + + tmp = my_ether_aton(ether_str); + if (tmp == NULL) + return -1; + if (res) + memcpy(res, tmp, sizeof(struct ether_addr)); + return token_len; +} + +int +cmdline_get_help_etheraddr(__attribute__((unused)) cmdline_parse_token_hdr_t *tk, + char *dstbuf, unsigned int size) +{ + int ret; + + ret = snprintf(dstbuf, size, "Ethernet address"); + if (ret < 0) + return -1; + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_etheraddr.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_etheraddr.h new file mode 100644 index 00000000..d7b95572 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_etheraddr.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. 
+ */ + +#ifndef _PARSE_ETHERADDR_H_ +#define _PARSE_ETHERADDR_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct cmdline_token_etheraddr { + struct cmdline_token_hdr hdr; +}; +typedef struct cmdline_token_etheraddr cmdline_parse_token_etheraddr_t; + +extern struct cmdline_token_ops cmdline_token_etheraddr_ops; + +int cmdline_parse_etheraddr(cmdline_parse_token_hdr_t *tk, const char *srcbuf, + void *res, unsigned ressize); +int cmdline_get_help_etheraddr(cmdline_parse_token_hdr_t *tk, char *dstbuf, + unsigned int size); + +#define TOKEN_ETHERADDR_INITIALIZER(structure, field) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_etheraddr_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ +} + +#ifdef __cplusplus +} +#endif + + +#endif /* _PARSE_ETHERADDR_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.c new file mode 100644 index 00000000..6647f569 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.c @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef __linux__ +#ifndef __FreeBSD__ +#include +#else +#include +#endif +#endif + +#include + +#include "cmdline_parse.h" +#include "cmdline_parse_ipaddr.h" + +struct cmdline_token_ops cmdline_token_ipaddr_ops = { + .parse = cmdline_parse_ipaddr, + .complete_get_nb = NULL, + .complete_get_elt = NULL, + .get_help = cmdline_get_help_ipaddr, +}; + +#define PREFIXMAX 128 +#define V4PREFIXMAX 32 + +int +cmdline_parse_ipaddr(cmdline_parse_token_hdr_t *tk, const char *buf, void *res, + unsigned ressize) +{ + struct cmdline_token_ipaddr *tk2; + unsigned int token_len = 0; + char ip_str[INET6_ADDRSTRLEN+4+1]; /* '+4' is for prefixlen (if any) */ + cmdline_ipaddr_t ipaddr; + char *prefix, *prefix_end; + long prefixlen = 0; + + if (res && ressize < sizeof(cmdline_ipaddr_t)) + return -1; + + if (!buf || !tk || ! *buf) + return -1; + + tk2 = (struct cmdline_token_ipaddr *)tk; + + while (!cmdline_isendoftoken(buf[token_len])) + token_len++; + + /* if token is too big... 
*/ + if (token_len >= INET6_ADDRSTRLEN+4) + return -1; + + strlcpy(ip_str, buf, token_len + 1); + + /* convert the network prefix */ + if (tk2->ipaddr_data.flags & CMDLINE_IPADDR_NETWORK) { + prefix = strrchr(ip_str, '/'); + if (prefix == NULL) + return -1; + *prefix = '\0'; + prefix ++; + errno = 0; + prefixlen = strtol(prefix, &prefix_end, 10); + if (errno || (*prefix_end != '\0') + || prefixlen < 0 || prefixlen > PREFIXMAX) + return -1; + ipaddr.prefixlen = prefixlen; + } + else { + ipaddr.prefixlen = 0; + } + + /* convert the IP addr */ + if ((tk2->ipaddr_data.flags & CMDLINE_IPADDR_V4) && + inet_pton(AF_INET, ip_str, &ipaddr.addr.ipv4) == 1 && + prefixlen <= V4PREFIXMAX) { + ipaddr.family = AF_INET; + if (res) + memcpy(res, &ipaddr, sizeof(ipaddr)); + return token_len; + } + if ((tk2->ipaddr_data.flags & CMDLINE_IPADDR_V6) && + inet_pton(AF_INET6, ip_str, &ipaddr.addr.ipv6) == 1) { + ipaddr.family = AF_INET6; + if (res) + memcpy(res, &ipaddr, sizeof(ipaddr)); + return token_len; + } + return -1; + +} + +int cmdline_get_help_ipaddr(cmdline_parse_token_hdr_t *tk, char *dstbuf, + unsigned int size) +{ + struct cmdline_token_ipaddr *tk2; + + if (!tk || !dstbuf) + return -1; + + tk2 = (struct cmdline_token_ipaddr *)tk; + + switch (tk2->ipaddr_data.flags) { + case CMDLINE_IPADDR_V4: + snprintf(dstbuf, size, "IPv4"); + break; + case CMDLINE_IPADDR_V6: + snprintf(dstbuf, size, "IPv6"); + break; + case CMDLINE_IPADDR_V4|CMDLINE_IPADDR_V6: + snprintf(dstbuf, size, "IPv4/IPv6"); + break; + case CMDLINE_IPADDR_NETWORK|CMDLINE_IPADDR_V4: + snprintf(dstbuf, size, "IPv4 network"); + break; + case CMDLINE_IPADDR_NETWORK|CMDLINE_IPADDR_V6: + snprintf(dstbuf, size, "IPv6 network"); + break; + case CMDLINE_IPADDR_NETWORK|CMDLINE_IPADDR_V4|CMDLINE_IPADDR_V6: + snprintf(dstbuf, size, "IPv4/IPv6 network"); + break; + default: + snprintf(dstbuf, size, "IPaddr (bad flags)"); + break; + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.h new file mode 100644 index 00000000..0ba81647 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_ipaddr.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. 
+ */ + +#ifndef _PARSE_IPADDR_H_ +#define _PARSE_IPADDR_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define CMDLINE_IPADDR_V4 0x01 +#define CMDLINE_IPADDR_V6 0x02 +#define CMDLINE_IPADDR_NETWORK 0x04 + +struct cmdline_ipaddr { + uint8_t family; + union { + struct in_addr ipv4; + struct in6_addr ipv6; + } addr; + unsigned int prefixlen; /* in case of network only */ +}; +typedef struct cmdline_ipaddr cmdline_ipaddr_t; + +struct cmdline_token_ipaddr_data { + uint8_t flags; +}; + +struct cmdline_token_ipaddr { + struct cmdline_token_hdr hdr; + struct cmdline_token_ipaddr_data ipaddr_data; +}; +typedef struct cmdline_token_ipaddr cmdline_parse_token_ipaddr_t; + +extern struct cmdline_token_ops cmdline_token_ipaddr_ops; + +int cmdline_parse_ipaddr(cmdline_parse_token_hdr_t *tk, const char *srcbuf, + void *res, unsigned ressize); +int cmdline_get_help_ipaddr(cmdline_parse_token_hdr_t *tk, char *dstbuf, + unsigned int size); + +#define TOKEN_IPADDR_INITIALIZER(structure, field) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_ipaddr_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ + /* ipaddr_data */ \ + { \ + CMDLINE_IPADDR_V4 | /* flags */ \ + CMDLINE_IPADDR_V6, \ + }, \ +} + +#define TOKEN_IPV4_INITIALIZER(structure, field) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_ipaddr_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ + /* ipaddr_data */ \ + { \ + CMDLINE_IPADDR_V4, /* flags */ \ + }, \ +} + +#define TOKEN_IPV6_INITIALIZER(structure, field) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_ipaddr_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ + /* ipaddr_data */ \ + { \ + CMDLINE_IPADDR_V6, /* flags */ \ + }, \ +} + +#define TOKEN_IPNET_INITIALIZER(structure, field) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_ipaddr_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ + /* ipaddr_data */ \ + { \ + CMDLINE_IPADDR_V4 | /* flags */ \ + CMDLINE_IPADDR_V6 | \ + CMDLINE_IPADDR_NETWORK, \ + }, \ +} + +#define TOKEN_IPV4NET_INITIALIZER(structure, field) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_ipaddr_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ + /* ipaddr_data */ \ + { \ + CMDLINE_IPADDR_V4 | /* flags */ \ + CMDLINE_IPADDR_NETWORK, \ + }, \ +} + +#define TOKEN_IPV6NET_INITIALIZER(structure, field) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_ipaddr_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ + /* ipaddr_data */ \ + { \ + CMDLINE_IPADDR_V4 | /* flags */ \ + CMDLINE_IPADDR_NETWORK, \ + }, \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _PARSE_IPADDR_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_num.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_num.c new file mode 100644 index 00000000..55c7a814 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_num.c @@ -0,0 +1,348 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cmdline_parse.h" +#include "cmdline_parse_num.h" + +#ifdef RTE_LIBRTE_CMDLINE_DEBUG +#define debug_printf(args...) printf(args) +#else +#define debug_printf(args...) 
do {} while(0) +#endif + +struct cmdline_token_ops cmdline_token_num_ops = { + .parse = cmdline_parse_num, + .complete_get_nb = NULL, + .complete_get_elt = NULL, + .get_help = cmdline_get_help_num, +}; + + +enum num_parse_state_t { + START, + DEC_NEG, + BIN, + HEX, + + ERROR, + + FIRST_OK, /* not used */ + ZERO_OK, + HEX_OK, + OCTAL_OK, + BIN_OK, + DEC_NEG_OK, + DEC_POS_OK, +}; + +/* Keep it sync with enum in .h */ +static const char * num_help[] = { + "UINT8", "UINT16", "UINT32", "UINT64", + "INT8", "INT16", "INT32", "INT64", +}; + +static inline int +add_to_res(unsigned int c, uint64_t *res, unsigned int base) +{ + /* overflow */ + if ( (UINT64_MAX - c) / base < *res ) { + return -1; + } + + *res = (uint64_t) (*res * base + c); + return 0; +} + +static int +check_res_size(struct cmdline_token_num_data *nd, unsigned ressize) +{ + switch (nd->type) { + case INT8: + case UINT8: + if (ressize < sizeof(int8_t)) + return -1; + break; + case INT16: + case UINT16: + if (ressize < sizeof(int16_t)) + return -1; + break; + case INT32: + case UINT32: + if (ressize < sizeof(int32_t)) + return -1; + break; + case INT64: + case UINT64: + if (ressize < sizeof(int64_t)) + return -1; + break; + default: + return -1; + } + return 0; +} + +/* parse an int */ +int +cmdline_parse_num(cmdline_parse_token_hdr_t *tk, const char *srcbuf, void *res, + unsigned ressize) +{ + struct cmdline_token_num_data nd; + enum num_parse_state_t st = START; + const char * buf; + char c; + uint64_t res1 = 0; + + if (!tk) + return -1; + + if (!srcbuf || !*srcbuf) + return -1; + + buf = srcbuf; + c = *buf; + + memcpy(&nd, &((struct cmdline_token_num *)tk)->num_data, sizeof(nd)); + + /* check that we have enough room in res */ + if (res) { + if (check_res_size(&nd, ressize) < 0) + return -1; + } + + while ( st != ERROR && c && ! 
cmdline_isendoftoken(c) ) { + debug_printf("%c %x -> ", c, c); + switch (st) { + case START: + if (c == '-') { + st = DEC_NEG; + } + else if (c == '0') { + st = ZERO_OK; + } + else if (c >= '1' && c <= '9') { + if (add_to_res(c - '0', &res1, 10) < 0) + st = ERROR; + else + st = DEC_POS_OK; + } + else { + st = ERROR; + } + break; + + case ZERO_OK: + if (c == 'x') { + st = HEX; + } + else if (c == 'b') { + st = BIN; + } + else if (c >= '0' && c <= '7') { + if (add_to_res(c - '0', &res1, 10) < 0) + st = ERROR; + else + st = OCTAL_OK; + } + else { + st = ERROR; + } + break; + + case DEC_NEG: + if (c >= '0' && c <= '9') { + if (add_to_res(c - '0', &res1, 10) < 0) + st = ERROR; + else + st = DEC_NEG_OK; + } + else { + st = ERROR; + } + break; + + case DEC_NEG_OK: + if (c >= '0' && c <= '9') { + if (add_to_res(c - '0', &res1, 10) < 0) + st = ERROR; + } + else { + st = ERROR; + } + break; + + case DEC_POS_OK: + if (c >= '0' && c <= '9') { + if (add_to_res(c - '0', &res1, 10) < 0) + st = ERROR; + } + else { + st = ERROR; + } + break; + + case HEX: + st = HEX_OK; + /* fall-through no break */ + case HEX_OK: + if (c >= '0' && c <= '9') { + if (add_to_res(c - '0', &res1, 16) < 0) + st = ERROR; + } + else if (c >= 'a' && c <= 'f') { + if (add_to_res(c - 'a' + 10, &res1, 16) < 0) + st = ERROR; + } + else if (c >= 'A' && c <= 'F') { + if (add_to_res(c - 'A' + 10, &res1, 16) < 0) + st = ERROR; + } + else { + st = ERROR; + } + break; + + + case OCTAL_OK: + if (c >= '0' && c <= '7') { + if (add_to_res(c - '0', &res1, 8) < 0) + st = ERROR; + } + else { + st = ERROR; + } + break; + + case BIN: + st = BIN_OK; + /* fall-through */ + case BIN_OK: + if (c >= '0' && c <= '1') { + if (add_to_res(c - '0', &res1, 2) < 0) + st = ERROR; + } + else { + st = ERROR; + } + break; + default: + debug_printf("not impl "); + + } + + debug_printf("(%"PRIu64")\n", res1); + + buf ++; + c = *buf; + + /* token too long */ + if (buf-srcbuf > 127) + return -1; + } + + switch (st) { + case ZERO_OK: + case DEC_POS_OK: + case HEX_OK: + case OCTAL_OK: + case BIN_OK: + if ( nd.type == INT8 && res1 <= INT8_MAX ) { + if (res) *(int8_t *)res = (int8_t) res1; + return buf-srcbuf; + } + else if ( nd.type == INT16 && res1 <= INT16_MAX ) { + if (res) *(int16_t *)res = (int16_t) res1; + return buf-srcbuf; + } + else if ( nd.type == INT32 && res1 <= INT32_MAX ) { + if (res) *(int32_t *)res = (int32_t) res1; + return buf-srcbuf; + } + else if ( nd.type == INT64 && res1 <= INT64_MAX ) { + if (res) *(int64_t *)res = (int64_t) res1; + return buf-srcbuf; + } + else if ( nd.type == UINT8 && res1 <= UINT8_MAX ) { + if (res) *(uint8_t *)res = (uint8_t) res1; + return buf-srcbuf; + } + else if (nd.type == UINT16 && res1 <= UINT16_MAX ) { + if (res) *(uint16_t *)res = (uint16_t) res1; + return buf-srcbuf; + } + else if ( nd.type == UINT32 && res1 <= UINT32_MAX ) { + if (res) *(uint32_t *)res = (uint32_t) res1; + return buf-srcbuf; + } + else if ( nd.type == UINT64 ) { + if (res) *(uint64_t *)res = res1; + return buf-srcbuf; + } + else { + return -1; + } + break; + + case DEC_NEG_OK: + if ( nd.type == INT8 && res1 <= INT8_MAX + 1 ) { + if (res) *(int8_t *)res = (int8_t) (-res1); + return buf-srcbuf; + } + else if ( nd.type == INT16 && res1 <= (uint16_t)INT16_MAX + 1 ) { + if (res) *(int16_t *)res = (int16_t) (-res1); + return buf-srcbuf; + } + else if ( nd.type == INT32 && res1 <= (uint32_t)INT32_MAX + 1 ) { + if (res) *(int32_t *)res = (int32_t) (-res1); + return buf-srcbuf; + } + else if ( nd.type == INT64 && res1 <= (uint64_t)INT64_MAX + 1 ) { + if (res) 
*(int64_t *)res = (int64_t) (-res1); + return buf-srcbuf; + } + else { + return -1; + } + break; + default: + debug_printf("error\n"); + return -1; + } +} + + +/* parse an int */ +int +cmdline_get_help_num(cmdline_parse_token_hdr_t *tk, char *dstbuf, unsigned int size) +{ + struct cmdline_token_num_data nd; + int ret; + + if (!tk) + return -1; + + memcpy(&nd, &((struct cmdline_token_num *)tk)->num_data, sizeof(nd)); + + /* should not happen.... don't so this test */ + /* if (nd.type >= (sizeof(num_help)/sizeof(const char *))) */ + /* return -1; */ + + ret = snprintf(dstbuf, size, "%s", num_help[nd.type]); + if (ret < 0) + return -1; + dstbuf[size-1] = '\0'; + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_num.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_num.h new file mode 100644 index 00000000..58b28cad --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_num.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#ifndef _PARSE_NUM_H_ +#define _PARSE_NUM_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum cmdline_numtype { + UINT8 = 0, + UINT16, + UINT32, + UINT64, + INT8, + INT16, + INT32, + INT64 +}; + +struct cmdline_token_num_data { + enum cmdline_numtype type; +}; + +struct cmdline_token_num { + struct cmdline_token_hdr hdr; + struct cmdline_token_num_data num_data; +}; +typedef struct cmdline_token_num cmdline_parse_token_num_t; + +extern struct cmdline_token_ops cmdline_token_num_ops; + +int cmdline_parse_num(cmdline_parse_token_hdr_t *tk, + const char *srcbuf, void *res, unsigned ressize); +int cmdline_get_help_num(cmdline_parse_token_hdr_t *tk, + char *dstbuf, unsigned int size); + +#define TOKEN_NUM_INITIALIZER(structure, field, numtype) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_num_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ + /* num_data */ \ + { \ + numtype, /* type */ \ + }, \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _PARSE_NUM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_portlist.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_portlist.c new file mode 100644 index 00000000..ad43b522 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_portlist.c @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2010, Keith Wiles + * All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "cmdline_parse.h" +#include "cmdline_parse_portlist.h" + +struct cmdline_token_ops cmdline_token_portlist_ops = { + .parse = cmdline_parse_portlist, + .complete_get_nb = NULL, + .complete_get_elt = NULL, + .get_help = cmdline_get_help_portlist, +}; + +static void +parse_set_list(cmdline_portlist_t *pl, size_t low, size_t high) +{ + do { + pl->map |= (1 << low++); + } while (low <= high); +} + +static int +parse_ports(cmdline_portlist_t *pl, const char *str) +{ + size_t ps, pe; + const char *first, *last; + char *end; + + for (first = str, last = first; + first != NULL && last != NULL; + first = last + 1) { + + last = strchr(first, ','); + + errno = 0; + ps = strtoul(first, &end, 10); + if (errno != 0 || end == first || + (end[0] != '-' && end[0] != 0 && end != last)) + return -1; + + /* Support for N-M portlist format */ + if (end[0] == '-') { + errno = 0; + first = end + 1; + pe = strtoul(first, &end, 10); + if (errno != 0 || end == first || + (end[0] != 0 && end != last)) + return -1; + } else { + pe = ps; + } + + if (ps > pe || pe >= sizeof (pl->map) * 8) + return -1; + + parse_set_list(pl, ps, pe); + } + + return 0; +} + +int +cmdline_parse_portlist(__attribute__((unused)) cmdline_parse_token_hdr_t *tk, + const char *buf, void *res, unsigned ressize) +{ + unsigned int token_len = 0; + char portlist_str[PORTLIST_TOKEN_SIZE+1]; + cmdline_portlist_t *pl; + + if (!buf || ! *buf) + return -1; + + if (res && ressize < sizeof(cmdline_portlist_t)) + return -1; + + pl = res; + + while (!cmdline_isendoftoken(buf[token_len]) && + (token_len < PORTLIST_TOKEN_SIZE)) + token_len++; + + if (token_len >= PORTLIST_TOKEN_SIZE) + return -1; + + strlcpy(portlist_str, buf, token_len + 1); + + if (pl) { + pl->map = 0; + if (strcmp("all", portlist_str) == 0) + pl->map = UINT32_MAX; + else if (parse_ports(pl, portlist_str) != 0) + return -1; + } + + return token_len; +} + +int +cmdline_get_help_portlist(__attribute__((unused)) cmdline_parse_token_hdr_t *tk, + char *dstbuf, unsigned int size) +{ + int ret; + ret = snprintf(dstbuf, size, "range of ports as 3,4-6,8-19,20"); + if (ret < 0) + return -1; + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_portlist.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_portlist.h new file mode 100644 index 00000000..bdf8d77f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_portlist.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2010, Keith Wiles + * All rights reserved. 
+ */ + +#ifndef _PARSE_PORTLIST_H_ +#define _PARSE_PORTLIST_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* size of a parsed string */ +#define PORTLIST_TOKEN_SIZE 128 +#define PORTLIST_MAX_TOKENS 32 + +typedef struct cmdline_portlist { + uint32_t map; +} cmdline_portlist_t; + +struct cmdline_token_portlist { + struct cmdline_token_hdr hdr; +}; +typedef struct cmdline_token_portlist cmdline_parse_token_portlist_t; + +extern struct cmdline_token_ops cmdline_token_portlist_ops; + +int cmdline_parse_portlist(cmdline_parse_token_hdr_t *tk, + const char *srcbuf, void *res, unsigned ressize); +int cmdline_get_help_portlist(cmdline_parse_token_hdr_t *tk, + char *dstbuf, unsigned int size); + +#define TOKEN_PORTLIST_INITIALIZER(structure, field) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_portlist_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _PARSE_PORTLIST_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_string.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_string.c new file mode 100644 index 00000000..9cf41d0f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_string.c @@ -0,0 +1,220 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cmdline_parse.h" +#include "cmdline_parse_string.h" + +struct cmdline_token_ops cmdline_token_string_ops = { + .parse = cmdline_parse_string, + .complete_get_nb = cmdline_complete_get_nb_string, + .complete_get_elt = cmdline_complete_get_elt_string, + .get_help = cmdline_get_help_string, +}; + +#define CHOICESTRING_HELP "Mul-choice STRING" +#define ANYSTRING_HELP "Any STRING" +#define ANYSTRINGS_HELP "Any STRINGS" +#define FIXEDSTRING_HELP "Fixed STRING" + +static unsigned int +get_token_len(const char *s) +{ + char c; + unsigned int i=0; + + c = s[i]; + while (c!='#' && c!='\0') { + i++; + c = s[i]; + } + return i; +} + +static const char * +get_next_token(const char *s) +{ + unsigned int i; + i = get_token_len(s); + if (s[i] == '#') + return s+i+1; + return NULL; +} + +int +cmdline_parse_string(cmdline_parse_token_hdr_t *tk, const char *buf, void *res, + unsigned ressize) +{ + struct cmdline_token_string *tk2; + struct cmdline_token_string_data *sd; + unsigned int token_len; + const char *str; + + if (res && ressize < STR_TOKEN_SIZE) + return -1; + + if (!tk || !buf || ! *buf) + return -1; + + tk2 = (struct cmdline_token_string *)tk; + + sd = &tk2->string_data; + + /* fixed string (known single token) */ + if ((sd->str != NULL) && (strcmp(sd->str, TOKEN_STRING_MULTI) != 0)) { + str = sd->str; + do { + token_len = get_token_len(str); + + /* if token is too big... 
*/ + if (token_len >= STR_TOKEN_SIZE - 1) { + continue; + } + + if ( strncmp(buf, str, token_len) ) { + continue; + } + + if ( !cmdline_isendoftoken(*(buf+token_len)) ) { + continue; + } + + break; + } while ( (str = get_next_token(str)) != NULL ); + + if (!str) + return -1; + } + /* multi string */ + else if (sd->str != NULL) { + if (ressize < STR_MULTI_TOKEN_SIZE) + return -1; + + token_len = 0; + while (!cmdline_isendofcommand(buf[token_len]) && + token_len < (STR_MULTI_TOKEN_SIZE - 1)) + token_len++; + + /* return if token too long */ + if (token_len >= (STR_MULTI_TOKEN_SIZE - 1)) + return -1; + } + /* unspecified string (unknown single token) */ + else { + token_len = 0; + while(!cmdline_isendoftoken(buf[token_len]) && + token_len < (STR_TOKEN_SIZE-1)) + token_len++; + + /* return if token too long */ + if (token_len >= STR_TOKEN_SIZE - 1) { + return -1; + } + } + + if (res) { + if ((sd->str != NULL) && (strcmp(sd->str, TOKEN_STRING_MULTI) == 0)) + /* we are sure that token_len is < STR_MULTI_TOKEN_SIZE-1 */ + strlcpy(res, buf, STR_MULTI_TOKEN_SIZE); + else + /* we are sure that token_len is < STR_TOKEN_SIZE-1 */ + strlcpy(res, buf, STR_TOKEN_SIZE); + + *((char *)res + token_len) = 0; + } + + return token_len; +} + +int cmdline_complete_get_nb_string(cmdline_parse_token_hdr_t *tk) +{ + struct cmdline_token_string *tk2; + struct cmdline_token_string_data *sd; + const char *str; + int ret = 1; + + if (!tk) + return -1; + + tk2 = (struct cmdline_token_string *)tk; + sd = &tk2->string_data; + + if (!sd->str) + return 0; + + str = sd->str; + while( (str = get_next_token(str)) != NULL ) { + ret++; + } + return ret; +} + +int cmdline_complete_get_elt_string(cmdline_parse_token_hdr_t *tk, int idx, + char *dstbuf, unsigned int size) +{ + struct cmdline_token_string *tk2; + struct cmdline_token_string_data *sd; + const char *s; + unsigned int len; + + if (!tk || !dstbuf || idx < 0) + return -1; + + tk2 = (struct cmdline_token_string *)tk; + sd = &tk2->string_data; + + s = sd->str; + + while (idx-- && s) + s = get_next_token(s); + + if (!s) + return -1; + + len = get_token_len(s); + if (len > size - 1) + return -1; + + memcpy(dstbuf, s, len); + dstbuf[len] = '\0'; + return 0; +} + + +int cmdline_get_help_string(cmdline_parse_token_hdr_t *tk, char *dstbuf, + unsigned int size) +{ + struct cmdline_token_string *tk2; + struct cmdline_token_string_data *sd; + const char *s; + + if (!tk || !dstbuf) + return -1; + + tk2 = (struct cmdline_token_string *)tk; + sd = &tk2->string_data; + + s = sd->str; + + if (s) { + if (strcmp(s, TOKEN_STRING_MULTI) == 0) + snprintf(dstbuf, size, ANYSTRINGS_HELP); + else if (get_next_token(s)) + snprintf(dstbuf, size, CHOICESTRING_HELP); + else + snprintf(dstbuf, size, FIXEDSTRING_HELP); + } else + snprintf(dstbuf, size, ANYSTRING_HELP); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_string.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_string.h new file mode 100644 index 00000000..52a26670 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_parse_string.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. 
+ */ + +#ifndef _PARSE_STRING_H_ +#define _PARSE_STRING_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* size of a parsed string */ +#define STR_TOKEN_SIZE 128 + +/* size of a parsed multi string */ +#define STR_MULTI_TOKEN_SIZE 4096 + +typedef char cmdline_fixed_string_t[STR_TOKEN_SIZE]; + +typedef char cmdline_multi_string_t[STR_MULTI_TOKEN_SIZE]; + +struct cmdline_token_string_data { + const char *str; +}; + +struct cmdline_token_string { + struct cmdline_token_hdr hdr; + struct cmdline_token_string_data string_data; +}; +typedef struct cmdline_token_string cmdline_parse_token_string_t; + +extern struct cmdline_token_ops cmdline_token_string_ops; + +int cmdline_parse_string(cmdline_parse_token_hdr_t *tk, const char *srcbuf, + void *res, unsigned ressize); +int cmdline_complete_get_nb_string(cmdline_parse_token_hdr_t *tk); +int cmdline_complete_get_elt_string(cmdline_parse_token_hdr_t *tk, int idx, + char *dstbuf, unsigned int size); +int cmdline_get_help_string(cmdline_parse_token_hdr_t *tk, char *dstbuf, + unsigned int size); + +/** +* Token marked as TOKEN_STRING_MULTI takes entire parsing string +* until “#” sign appear. Everything after “#” sign is treated as a comment. +* +* Note: +* In this case second parameter of TOKEN_STRING_INITIALIZER must be a type of +* cmdline_multi_string_t. +*/ +#define TOKEN_STRING_MULTI "" + +#define TOKEN_STRING_INITIALIZER(structure, field, string) \ +{ \ + /* hdr */ \ + { \ + &cmdline_token_string_ops, /* ops */ \ + offsetof(structure, field), /* offset */ \ + }, \ + /* string_data */ \ + { \ + string, /* str */ \ + }, \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _PARSE_STRING_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.c new file mode 100644 index 00000000..2cb53e38 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.c @@ -0,0 +1,644 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cmdline_cirbuf.h" +#include "cmdline_rdline.h" + +static void rdline_puts(struct rdline *rdl, const char *buf); +static void rdline_miniprintf(struct rdline *rdl, + const char *buf, unsigned int val); + +static void rdline_remove_old_history_item(struct rdline *rdl); +static void rdline_remove_first_history_item(struct rdline *rdl); +static unsigned int rdline_get_history_size(struct rdline *rdl); + + +/* isblank() needs _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE, so use our + * own. 
*/ +static int +isblank2(char c) +{ + if (c == ' ' || + c == '\t' ) + return 1; + return 0; +} + +int +rdline_init(struct rdline *rdl, + rdline_write_char_t *write_char, + rdline_validate_t *validate, + rdline_complete_t *complete) +{ + if (!rdl || !write_char || !validate || !complete) + return -EINVAL; + memset(rdl, 0, sizeof(*rdl)); + rdl->validate = validate; + rdl->complete = complete; + rdl->write_char = write_char; + rdl->status = RDLINE_INIT; + return cirbuf_init(&rdl->history, rdl->history_buf, 0, RDLINE_HISTORY_BUF_SIZE); +} + +void +rdline_newline(struct rdline *rdl, const char *prompt) +{ + unsigned int i; + + if (!rdl || !prompt) + return; + + vt100_init(&rdl->vt100); + cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE); + cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE); + + rdl->prompt_size = strnlen(prompt, RDLINE_PROMPT_SIZE-1); + if (prompt != rdl->prompt) + memcpy(rdl->prompt, prompt, rdl->prompt_size); + rdl->prompt[RDLINE_PROMPT_SIZE-1] = '\0'; + + for (i=0 ; i<rdl->prompt_size ; i++) + rdl->write_char(rdl, rdl->prompt[i]); + rdl->status = RDLINE_RUNNING; + + rdl->history_cur_line = -1; +} + +void +rdline_stop(struct rdline *rdl) +{ + if (!rdl) + return; + rdl->status = RDLINE_INIT; +} + +void +rdline_quit(struct rdline *rdl) +{ + if (!rdl) + return; + rdl->status = RDLINE_EXITED; +} + +void +rdline_restart(struct rdline *rdl) +{ + if (!rdl) + return; + rdl->status = RDLINE_RUNNING; +} + +void +rdline_reset(struct rdline *rdl) +{ + if (!rdl) + return; + vt100_init(&rdl->vt100); + cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE); + cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE); + + rdl->status = RDLINE_RUNNING; + + rdl->history_cur_line = -1; +} + +const char * +rdline_get_buffer(struct rdline *rdl) +{ + if (!rdl) + return NULL; + unsigned int len_l, len_r; + cirbuf_align_left(&rdl->left); + cirbuf_align_left(&rdl->right); + + len_l = CIRBUF_GET_LEN(&rdl->left); + len_r = CIRBUF_GET_LEN(&rdl->right); + memcpy(rdl->left_buf+len_l, rdl->right_buf, len_r); + + rdl->left_buf[len_l + len_r] = '\n'; + rdl->left_buf[len_l + len_r + 1] = '\0'; + return rdl->left_buf; +} + +static void +display_right_buffer(struct rdline *rdl, int force) +{ + unsigned int i; + char tmp; + + if (!force && CIRBUF_IS_EMPTY(&rdl->right)) + return; + + rdline_puts(rdl, vt100_clear_right); + CIRBUF_FOREACH(&rdl->right, i, tmp) { + rdl->write_char(rdl, tmp); + } + if (!CIRBUF_IS_EMPTY(&rdl->right)) + rdline_miniprintf(rdl, vt100_multi_left, + CIRBUF_GET_LEN(&rdl->right)); +} + +void +rdline_redisplay(struct rdline *rdl) +{ + unsigned int i; + char tmp; + + if (!rdl) + return; + + rdline_puts(rdl, vt100_home); + for (i=0 ; i<rdl->prompt_size ; i++) + rdl->write_char(rdl, rdl->prompt[i]); + CIRBUF_FOREACH(&rdl->left, i, tmp) { + rdl->write_char(rdl, tmp); + } + display_right_buffer(rdl, 1); +} + +int +rdline_char_in(struct rdline *rdl, char c) +{ + unsigned int i; + int cmd; + char tmp; + char *buf; + + if (!rdl) + return -EINVAL; + + if (rdl->status == RDLINE_EXITED) + return RDLINE_RES_EXITED; + if (rdl->status != RDLINE_RUNNING) + return RDLINE_RES_NOT_RUNNING; + + cmd = vt100_parser(&rdl->vt100, c); + if (cmd == -2) + return RDLINE_RES_SUCCESS; + + if (cmd >= 0) { + switch (cmd) { + /* move caret 1 char to the left */ + case CMDLINE_KEY_CTRL_B: + case CMDLINE_KEY_LEFT_ARR: + if (CIRBUF_IS_EMPTY(&rdl->left)) + break; + tmp = cirbuf_get_tail(&rdl->left); + cirbuf_del_tail(&rdl->left); + cirbuf_add_head(&rdl->right, tmp); + rdline_puts(rdl, vt100_left_arr); + break; + + 
/* move caret 1 char to the right */ + case CMDLINE_KEY_CTRL_F: + case CMDLINE_KEY_RIGHT_ARR: + if (CIRBUF_IS_EMPTY(&rdl->right)) + break; + tmp = cirbuf_get_head(&rdl->right); + cirbuf_del_head(&rdl->right); + cirbuf_add_tail(&rdl->left, tmp); + rdline_puts(rdl, vt100_right_arr); + break; + + /* move caret 1 word to the left */ + /* keyboard equivalent: Alt+B */ + case CMDLINE_KEY_WLEFT: + while (! CIRBUF_IS_EMPTY(&rdl->left) && + (tmp = cirbuf_get_tail(&rdl->left)) && + isblank2(tmp)) { + rdline_puts(rdl, vt100_left_arr); + cirbuf_del_tail(&rdl->left); + cirbuf_add_head(&rdl->right, tmp); + } + while (! CIRBUF_IS_EMPTY(&rdl->left) && + (tmp = cirbuf_get_tail(&rdl->left)) && + !isblank2(tmp)) { + rdline_puts(rdl, vt100_left_arr); + cirbuf_del_tail(&rdl->left); + cirbuf_add_head(&rdl->right, tmp); + } + break; + + /* move caret 1 word to the right */ + /* keyboard equivalent: Alt+F */ + case CMDLINE_KEY_WRIGHT: + while (! CIRBUF_IS_EMPTY(&rdl->right) && + (tmp = cirbuf_get_head(&rdl->right)) && + isblank2(tmp)) { + rdline_puts(rdl, vt100_right_arr); + cirbuf_del_head(&rdl->right); + cirbuf_add_tail(&rdl->left, tmp); + } + while (! CIRBUF_IS_EMPTY(&rdl->right) && + (tmp = cirbuf_get_head(&rdl->right)) && + !isblank2(tmp)) { + rdline_puts(rdl, vt100_right_arr); + cirbuf_del_head(&rdl->right); + cirbuf_add_tail(&rdl->left, tmp); + } + break; + + /* move caret to the left */ + case CMDLINE_KEY_CTRL_A: + if (CIRBUF_IS_EMPTY(&rdl->left)) + break; + rdline_miniprintf(rdl, vt100_multi_left, + CIRBUF_GET_LEN(&rdl->left)); + while (! CIRBUF_IS_EMPTY(&rdl->left)) { + tmp = cirbuf_get_tail(&rdl->left); + cirbuf_del_tail(&rdl->left); + cirbuf_add_head(&rdl->right, tmp); + } + break; + + /* move caret to the right */ + case CMDLINE_KEY_CTRL_E: + if (CIRBUF_IS_EMPTY(&rdl->right)) + break; + rdline_miniprintf(rdl, vt100_multi_right, + CIRBUF_GET_LEN(&rdl->right)); + while (! CIRBUF_IS_EMPTY(&rdl->right)) { + tmp = cirbuf_get_head(&rdl->right); + cirbuf_del_head(&rdl->right); + cirbuf_add_tail(&rdl->left, tmp); + } + break; + + /* delete 1 char from the left */ + case CMDLINE_KEY_BKSPACE: + case CMDLINE_KEY_BKSPACE2: + if(!cirbuf_del_tail_safe(&rdl->left)) { + rdline_puts(rdl, vt100_bs); + display_right_buffer(rdl, 1); + } + break; + + /* delete 1 char from the right */ + case CMDLINE_KEY_SUPPR: + case CMDLINE_KEY_CTRL_D: + if (cmd == CMDLINE_KEY_CTRL_D && + CIRBUF_IS_EMPTY(&rdl->left) && + CIRBUF_IS_EMPTY(&rdl->right)) { + return RDLINE_RES_EOF; + } + if (!cirbuf_del_head_safe(&rdl->right)) { + display_right_buffer(rdl, 1); + } + break; + + /* delete 1 word from the left */ + case CMDLINE_KEY_META_BKSPACE: + case CMDLINE_KEY_CTRL_W: + while (! CIRBUF_IS_EMPTY(&rdl->left) && isblank2(cirbuf_get_tail(&rdl->left))) { + rdline_puts(rdl, vt100_bs); + cirbuf_del_tail(&rdl->left); + } + while (! CIRBUF_IS_EMPTY(&rdl->left) && !isblank2(cirbuf_get_tail(&rdl->left))) { + rdline_puts(rdl, vt100_bs); + cirbuf_del_tail(&rdl->left); + } + display_right_buffer(rdl, 1); + break; + + /* delete 1 word from the right */ + case CMDLINE_KEY_META_D: + while (! CIRBUF_IS_EMPTY(&rdl->right) && isblank2(cirbuf_get_head(&rdl->right))) + cirbuf_del_head(&rdl->right); + while (! 
CIRBUF_IS_EMPTY(&rdl->right) && !isblank2(cirbuf_get_head(&rdl->right))) + cirbuf_del_head(&rdl->right); + display_right_buffer(rdl, 1); + break; + + /* set kill buffer to contents on the right side of caret */ + case CMDLINE_KEY_CTRL_K: + cirbuf_get_buf_head(&rdl->right, rdl->kill_buf, RDLINE_BUF_SIZE); + rdl->kill_size = CIRBUF_GET_LEN(&rdl->right); + cirbuf_del_buf_head(&rdl->right, rdl->kill_size); + rdline_puts(rdl, vt100_clear_right); + break; + + /* paste contents of kill buffer to the left side of caret */ + case CMDLINE_KEY_CTRL_Y: + i=0; + while(CIRBUF_GET_LEN(&rdl->right) + CIRBUF_GET_LEN(&rdl->left) < + RDLINE_BUF_SIZE && + i < rdl->kill_size) { + cirbuf_add_tail(&rdl->left, rdl->kill_buf[i]); + rdl->write_char(rdl, rdl->kill_buf[i]); + i++; + } + display_right_buffer(rdl, 0); + break; + + /* clear and newline */ + case CMDLINE_KEY_CTRL_C: + rdline_puts(rdl, "\r\n"); + rdline_newline(rdl, rdl->prompt); + break; + + /* redisplay (helps when prompt is lost in other output) */ + case CMDLINE_KEY_CTRL_L: + rdline_redisplay(rdl); + break; + + /* autocomplete */ + case CMDLINE_KEY_TAB: + case CMDLINE_KEY_HELP: + cirbuf_align_left(&rdl->left); + rdl->left_buf[CIRBUF_GET_LEN(&rdl->left)] = '\0'; + if (rdl->complete) { + char tmp_buf[BUFSIZ]; + int complete_state; + int ret; + unsigned int tmp_size; + + if (cmd == CMDLINE_KEY_TAB) + complete_state = 0; + else + complete_state = -1; + + /* see in parse.h for help on complete() */ + ret = rdl->complete(rdl, rdl->left_buf, + tmp_buf, sizeof(tmp_buf), + &complete_state); + /* no completion or error */ + if (ret <= 0) { + return RDLINE_RES_COMPLETE; + } + + tmp_size = strnlen(tmp_buf, sizeof(tmp_buf)); + /* add chars */ + if (ret == RDLINE_RES_COMPLETE) { + i=0; + while(CIRBUF_GET_LEN(&rdl->right) + CIRBUF_GET_LEN(&rdl->left) < + RDLINE_BUF_SIZE && + i < tmp_size) { + cirbuf_add_tail(&rdl->left, tmp_buf[i]); + rdl->write_char(rdl, tmp_buf[i]); + i++; + } + display_right_buffer(rdl, 1); + return RDLINE_RES_COMPLETE; /* ?? 
*/ + } + + /* choice */ + rdline_puts(rdl, "\r\n"); + while (ret) { + rdl->write_char(rdl, ' '); + for (i=0 ; tmp_buf[i] ; i++) + rdl->write_char(rdl, tmp_buf[i]); + rdline_puts(rdl, "\r\n"); + ret = rdl->complete(rdl, rdl->left_buf, + tmp_buf, sizeof(tmp_buf), + &complete_state); + } + + rdline_redisplay(rdl); + } + return RDLINE_RES_COMPLETE; + + /* complete buffer */ + case CMDLINE_KEY_RETURN: + case CMDLINE_KEY_RETURN2: + rdline_get_buffer(rdl); + rdl->status = RDLINE_INIT; + rdline_puts(rdl, "\r\n"); + if (rdl->history_cur_line != -1) + rdline_remove_first_history_item(rdl); + + if (rdl->validate) + rdl->validate(rdl, rdl->left_buf, CIRBUF_GET_LEN(&rdl->left)+2); + /* user may have stopped rdline */ + if (rdl->status == RDLINE_EXITED) + return RDLINE_RES_EXITED; + return RDLINE_RES_VALIDATED; + + /* previous element in history */ + case CMDLINE_KEY_UP_ARR: + case CMDLINE_KEY_CTRL_P: + if (rdl->history_cur_line == 0) { + rdline_remove_first_history_item(rdl); + } + if (rdl->history_cur_line <= 0) { + rdline_add_history(rdl, rdline_get_buffer(rdl)); + rdl->history_cur_line = 0; + } + + buf = rdline_get_history_item(rdl, rdl->history_cur_line + 1); + if (!buf) + break; + + rdl->history_cur_line ++; + vt100_init(&rdl->vt100); + cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE); + cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE); + cirbuf_add_buf_tail(&rdl->left, buf, strnlen(buf, RDLINE_BUF_SIZE)); + rdline_redisplay(rdl); + break; + + /* next element in history */ + case CMDLINE_KEY_DOWN_ARR: + case CMDLINE_KEY_CTRL_N: + if (rdl->history_cur_line - 1 < 0) + break; + + rdl->history_cur_line --; + buf = rdline_get_history_item(rdl, rdl->history_cur_line); + if (!buf) + break; + vt100_init(&rdl->vt100); + cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE); + cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE); + cirbuf_add_buf_tail(&rdl->left, buf, strnlen(buf, RDLINE_BUF_SIZE)); + rdline_redisplay(rdl); + + break; + + + default: + break; + } + + return RDLINE_RES_SUCCESS; + } + + if (!isprint((int)c)) + return RDLINE_RES_SUCCESS; + + /* standard chars */ + if (CIRBUF_GET_LEN(&rdl->left) + CIRBUF_GET_LEN(&rdl->right) >= RDLINE_BUF_SIZE) + return RDLINE_RES_SUCCESS; + + if (cirbuf_add_tail_safe(&rdl->left, c)) + return RDLINE_RES_SUCCESS; + + rdl->write_char(rdl, c); + display_right_buffer(rdl, 0); + + return RDLINE_RES_SUCCESS; +} + + +/* HISTORY */ + +static void +rdline_remove_old_history_item(struct rdline * rdl) +{ + char tmp; + + while (! CIRBUF_IS_EMPTY(&rdl->history) ) { + tmp = cirbuf_get_head(&rdl->history); + cirbuf_del_head(&rdl->history); + if (!tmp) + break; + } +} + +static void +rdline_remove_first_history_item(struct rdline * rdl) +{ + char tmp; + + if ( CIRBUF_IS_EMPTY(&rdl->history) ) { + return; + } + else { + cirbuf_del_tail(&rdl->history); + } + + while (! 
CIRBUF_IS_EMPTY(&rdl->history) ) { + tmp = cirbuf_get_tail(&rdl->history); + if (!tmp) + break; + cirbuf_del_tail(&rdl->history); + } +} + +static unsigned int +rdline_get_history_size(struct rdline * rdl) +{ + unsigned int i, tmp, ret=0; + + CIRBUF_FOREACH(&rdl->history, i, tmp) { + if (tmp == 0) + ret ++; + } + + return ret; +} + +char * +rdline_get_history_item(struct rdline * rdl, unsigned int idx) +{ + unsigned int len, i, tmp; + + if (!rdl) + return NULL; + + len = rdline_get_history_size(rdl); + if ( idx >= len ) { + return NULL; + } + + cirbuf_align_left(&rdl->history); + + CIRBUF_FOREACH(&rdl->history, i, tmp) { + if ( idx == len - 1) { + return rdl->history_buf + i; + } + if (tmp == 0) + len --; + } + + return NULL; +} + +int +rdline_add_history(struct rdline * rdl, const char * buf) +{ + unsigned int len, i; + + if (!rdl || !buf) + return -EINVAL; + + len = strnlen(buf, RDLINE_BUF_SIZE); + for (i=0; i<len ; i++) { + if (buf[i] == '\n') { + len = i; + break; + } + } + + if ( len >= RDLINE_HISTORY_BUF_SIZE ) + return -1; + + while ( len >= CIRBUF_GET_FREELEN(&rdl->history) ) { + rdline_remove_old_history_item(rdl); + } + + cirbuf_add_buf_tail(&rdl->history, buf, len); + cirbuf_add_tail(&rdl->history, 0); + + return 0; +} + +void +rdline_clear_history(struct rdline * rdl) +{ + if (!rdl) + return; + cirbuf_init(&rdl->history, rdl->history_buf, 0, RDLINE_HISTORY_BUF_SIZE); +} + + +/* STATIC USEFUL FUNCS */ + +static void +rdline_puts(struct rdline * rdl, const char * buf) +{ + char c; + while ( (c = *(buf++)) != '\0' ) { + rdl->write_char(rdl, c); + } +} + +/* a very very basic printf with one arg and one format 'u' */ +static void +rdline_miniprintf(struct rdline *rdl, const char * buf, unsigned int val) +{ + char c, started=0, div=100; + + while ( (c=*(buf++)) ) { + if (c != '%') { + rdl->write_char(rdl, c); + continue; + } + c = *(buf++); + if (c != 'u') { + rdl->write_char(rdl, '%'); + rdl->write_char(rdl, c); + continue; + } + /* val is never more than 255 */ + while (div) { + c = (char)(val / div); + if (c || started) { + rdl->write_char(rdl, (char)(c+'0')); + started = 1; + } + val %= div; + div /= 10; + } + } +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.h new file mode 100644 index 00000000..d2170293 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_rdline.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#ifndef _RDLINE_H_ +#define _RDLINE_H_ + +/** + * This file is a small equivalent to the GNU readline library, but it + * was originally designed for small systems, like Atmel AVR + * microcontrollers (8 bits). Indeed, we don't use malloc, which is + * sometimes not implemented (or just not recommended) on such + * systems. + * + * Obviously, it does not support as many things as the GNU readline, + * but at least it supports some interesting features like a kill + * buffer and a command history. + * + * It also has a feature that the GNU readline does not have (as far + * as I know): we can have several instances of it running at the same + * time, even on a monothread program, since it works with callbacks. + * + * The lib is designed for client-side or server-side use: + * - server-side: the server receives all data from a socket, including + * control chars, like arrows, tabulations, ... The client is + * very simple; it can be a telnet or a minicom through a serial line.
+ * - client-side: the client receives its data through its stdin for + * instance. + */ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* configuration */ +#define RDLINE_BUF_SIZE 512 +#define RDLINE_PROMPT_SIZE 32 +#define RDLINE_VT100_BUF_SIZE 8 +#define RDLINE_HISTORY_BUF_SIZE BUFSIZ +#define RDLINE_HISTORY_MAX_LINE 64 + +enum rdline_status { + RDLINE_INIT, + RDLINE_RUNNING, + RDLINE_EXITED +}; + +struct rdline; + +typedef int (rdline_write_char_t)(struct rdline *rdl, char); +typedef void (rdline_validate_t)(struct rdline *rdl, + const char *buf, unsigned int size); +typedef int (rdline_complete_t)(struct rdline *rdl, const char *buf, + char *dstbuf, unsigned int dstsize, + int *state); + +struct rdline { + enum rdline_status status; + /* rdline bufs */ + struct cirbuf left; + struct cirbuf right; + char left_buf[RDLINE_BUF_SIZE+2]; /* reserve 2 chars for the \n\0 */ + char right_buf[RDLINE_BUF_SIZE]; + + char prompt[RDLINE_PROMPT_SIZE]; + unsigned int prompt_size; + + char kill_buf[RDLINE_BUF_SIZE]; + unsigned int kill_size; + + /* history */ + struct cirbuf history; + char history_buf[RDLINE_HISTORY_BUF_SIZE]; + int history_cur_line; + + /* callbacks and func pointers */ + rdline_write_char_t *write_char; + rdline_validate_t *validate; + rdline_complete_t *complete; + + /* vt100 parser */ + struct cmdline_vt100 vt100; + + /* opaque pointer */ + void *opaque; +}; + +/** + * Init fields for a struct rdline. Call this only once at the beginning + * of your program. + * \param rdl A pointer to an uninitialized struct rdline + * \param write_char The function used by the function to write a character + * \param validate A pointer to the function to execute when the + * user validates the buffer. + * \param complete A pointer to the function to execute when the + * user completes the buffer. + */ +int rdline_init(struct rdline *rdl, + rdline_write_char_t *write_char, + rdline_validate_t *validate, + rdline_complete_t *complete); + + +/** + * Init the current buffer, and display a prompt. + * \param rdl A pointer to a struct rdline + * \param prompt A string containing the prompt + */ +void rdline_newline(struct rdline *rdl, const char *prompt); + +/** + * Call it and all received chars will be ignored. + * \param rdl A pointer to a struct rdline + */ +void rdline_stop(struct rdline *rdl); + +/** + * Same than rdline_stop() except that next calls to rdline_char_in() + * will return RDLINE_RES_EXITED. + * \param rdl A pointer to a struct rdline + */ +void rdline_quit(struct rdline *rdl); + +/** + * Restart after a call to rdline_stop() or rdline_quit() + * \param rdl A pointer to a struct rdline + */ +void rdline_restart(struct rdline *rdl); + +/** + * Redisplay the current buffer + * \param rdl A pointer to a struct rdline + */ +void rdline_redisplay(struct rdline *rdl); + +/** + * Reset the current buffer and setup for a new line. + * \param rdl A pointer to a struct rdline + */ +void rdline_reset(struct rdline *rdl); + + +/* return status for rdline_char_in() */ +#define RDLINE_RES_SUCCESS 0 +#define RDLINE_RES_VALIDATED 1 +#define RDLINE_RES_COMPLETE 2 +#define RDLINE_RES_NOT_RUNNING -1 +#define RDLINE_RES_EOF -2 +#define RDLINE_RES_EXITED -3 + +/** + * append a char to the readline buffer. + * Return RDLINE_RES_VALIDATE when the line has been validated. + * Return RDLINE_RES_COMPLETE when the user asked to complete the buffer. + * Return RDLINE_RES_NOT_RUNNING if it is not running. + * Return RDLINE_RES_EOF if EOF (ctrl-d on an empty line). 
+ * Else return RDLINE_RES_SUCCESS. + * XXX error case when the buffer is full ? + * + * \param rdl A pointer to a struct rdline + * \param c The character to append + */ +int rdline_char_in(struct rdline *rdl, char c); + +/** + * Return the current buffer, terminated by '\0'. + * \param rdl A pointer to a struct rdline + */ +const char *rdline_get_buffer(struct rdline *rdl); + + +/** + * Add the buffer to history. + * return < 0 on error. + * \param rdl A pointer to a struct rdline + * \param buf A buffer that is terminated by '\0' + */ +int rdline_add_history(struct rdline *rdl, const char *buf); + +/** + * Clear current history + * \param rdl A pointer to a struct rdline + */ +void rdline_clear_history(struct rdline *rdl); + +/** + * Get the i-th history item + */ +char *rdline_get_history_item(struct rdline *rdl, unsigned int i); + +#ifdef __cplusplus +} +#endif + +#endif /* _RDLINE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.c new file mode 100644 index 00000000..ecb3d82b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.c @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cmdline_parse.h" +#include "cmdline_rdline.h" +#include "cmdline_socket.h" +#include "cmdline.h" + +struct cmdline * +cmdline_file_new(cmdline_parse_ctx_t *ctx, const char *prompt, const char *path) +{ + int fd; + + /* everything else is checked in cmdline_new() */ + if (!path) + return NULL; + + fd = open(path, O_RDONLY, 0); + if (fd < 0) { + dprintf("open() failed\n"); + return NULL; + } + return cmdline_new(ctx, prompt, fd, -1); +} + +struct cmdline * +cmdline_stdin_new(cmdline_parse_ctx_t *ctx, const char *prompt) +{ + struct cmdline *cl; + struct termios oldterm, term; + + tcgetattr(0, &oldterm); + memcpy(&term, &oldterm, sizeof(term)); + term.c_lflag &= ~(ICANON | ECHO | ISIG); + tcsetattr(0, TCSANOW, &term); + setbuf(stdin, NULL); + + cl = cmdline_new(ctx, prompt, 0, 1); + + if (cl) + memcpy(&cl->oldterm, &oldterm, sizeof(term)); + + return cl; +} + +void +cmdline_stdin_exit(struct cmdline *cl) +{ + if (!cl) + return; + + tcsetattr(fileno(stdin), TCSANOW, &cl->oldterm); +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.h new file mode 100644 index 00000000..80542e55 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_socket.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. 
+ */ + +#ifndef _CMDLINE_SOCKET_H_ +#define _CMDLINE_SOCKET_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct cmdline *cmdline_file_new(cmdline_parse_ctx_t *ctx, const char *prompt, const char *path); +struct cmdline *cmdline_stdin_new(cmdline_parse_ctx_t *ctx, const char *prompt); +void cmdline_stdin_exit(struct cmdline *cl); + +#ifdef __cplusplus +} +#endif + +#endif /* _CMDLINE_SOCKET_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.c b/src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.c new file mode 100644 index 00000000..662fc734 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cmdline_vt100.h" + +const char *cmdline_vt100_commands[] = { + vt100_up_arr, + vt100_down_arr, + vt100_right_arr, + vt100_left_arr, + "\177", + "\n", + "\001", + "\005", + "\013", + "\031", + "\003", + "\006", + "\002", + vt100_suppr, + vt100_tab, + "\004", + "\014", + "\r", + "\033\177", + vt100_word_left, + vt100_word_right, + "?", + "\027", + "\020", + "\016", + "\033\144", + vt100_bs, +}; + +void +vt100_init(struct cmdline_vt100 *vt) +{ + if (!vt) + return; + vt->state = CMDLINE_VT100_INIT; +} + + +static int +match_command(char *buf, unsigned int size) +{ + const char *cmd; + size_t cmdlen; + unsigned int i = 0; + + for (i=0 ; i<sizeof(cmdline_vt100_commands)/sizeof(const char *) ; i++) { + cmd = *(cmdline_vt100_commands + i); + + cmdlen = strnlen(cmd, CMDLINE_VT100_BUF_SIZE); + if (size == cmdlen && + !strncmp(buf, cmd, cmdlen)) { + return i; + } + } + + return -1; +} + +int +vt100_parser(struct cmdline_vt100 *vt, char c) +{ + uint8_t size; + + if (!vt) + return -1; + + if (vt->bufpos >= CMDLINE_VT100_BUF_SIZE) { + vt->state = CMDLINE_VT100_INIT; + vt->bufpos = 0; + } + + vt->buf[vt->bufpos++] = c; + size = vt->bufpos; + + switch (vt->state) { + case CMDLINE_VT100_INIT: + if (c == 033) { + vt->state = CMDLINE_VT100_ESCAPE; + } + else { + vt->bufpos = 0; + goto match_command; + } + break; + + case CMDLINE_VT100_ESCAPE: + if (c == 0133) { + vt->state = CMDLINE_VT100_ESCAPE_CSI; + } + else if (c >= 060 && c <= 0177) { /* XXX 0177 ? */ + vt->bufpos = 0; + vt->state = CMDLINE_VT100_INIT; + goto match_command; + } + break; + + case CMDLINE_VT100_ESCAPE_CSI: + if (c >= 0100 && c <= 0176) { + vt->bufpos = 0; + vt->state = CMDLINE_VT100_INIT; + goto match_command; + } + break; + + default: + vt->bufpos = 0; + break; + } + + return -2; + + match_command: + return match_command(vt->buf, size); +} diff --git a/src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.h b/src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.h new file mode 100644 index 00000000..e33e67ed --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/cmdline_vt100.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright (c) 2009, Olivier MATZ + * All rights reserved.
+ */ + +#ifndef _CMDLINE_VT100_H_ +#define _CMDLINE_VT100_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define vt100_bell "\007" +#define vt100_bs "\010" +#define vt100_bs_clear "\010 \010" +#define vt100_tab "\011" +#define vt100_crnl "\012\015" +#define vt100_clear_right "\033[0K" +#define vt100_clear_left "\033[1K" +#define vt100_clear_down "\033[0J" +#define vt100_clear_up "\033[1J" +#define vt100_clear_line "\033[2K" +#define vt100_clear_screen "\033[2J" +#define vt100_up_arr "\033\133\101" +#define vt100_down_arr "\033\133\102" +#define vt100_right_arr "\033\133\103" +#define vt100_left_arr "\033\133\104" +#define vt100_multi_right "\033\133%uC" +#define vt100_multi_left "\033\133%uD" +#define vt100_suppr "\033\133\063\176" +#define vt100_home "\033M\033E" +#define vt100_word_left "\033\142" +#define vt100_word_right "\033\146" + +/* Result of parsing : it must be synchronized with + * cmdline_vt100_commands[] in vt100.c */ +#define CMDLINE_KEY_UP_ARR 0 +#define CMDLINE_KEY_DOWN_ARR 1 +#define CMDLINE_KEY_RIGHT_ARR 2 +#define CMDLINE_KEY_LEFT_ARR 3 +#define CMDLINE_KEY_BKSPACE 4 +#define CMDLINE_KEY_RETURN 5 +#define CMDLINE_KEY_CTRL_A 6 +#define CMDLINE_KEY_CTRL_E 7 +#define CMDLINE_KEY_CTRL_K 8 +#define CMDLINE_KEY_CTRL_Y 9 +#define CMDLINE_KEY_CTRL_C 10 +#define CMDLINE_KEY_CTRL_F 11 +#define CMDLINE_KEY_CTRL_B 12 +#define CMDLINE_KEY_SUPPR 13 +#define CMDLINE_KEY_TAB 14 +#define CMDLINE_KEY_CTRL_D 15 +#define CMDLINE_KEY_CTRL_L 16 +#define CMDLINE_KEY_RETURN2 17 +#define CMDLINE_KEY_META_BKSPACE 18 +#define CMDLINE_KEY_WLEFT 19 +#define CMDLINE_KEY_WRIGHT 20 +#define CMDLINE_KEY_HELP 21 +#define CMDLINE_KEY_CTRL_W 22 +#define CMDLINE_KEY_CTRL_P 23 +#define CMDLINE_KEY_CTRL_N 24 +#define CMDLINE_KEY_META_D 25 +#define CMDLINE_KEY_BKSPACE2 26 + +extern const char *cmdline_vt100_commands[]; + +enum cmdline_vt100_parser_state { + CMDLINE_VT100_INIT, + CMDLINE_VT100_ESCAPE, + CMDLINE_VT100_ESCAPE_CSI +}; + +#define CMDLINE_VT100_BUF_SIZE 8 +struct cmdline_vt100 { + uint8_t bufpos; + char buf[CMDLINE_VT100_BUF_SIZE]; + enum cmdline_vt100_parser_state state; +}; + +/** + * Init + */ +void vt100_init(struct cmdline_vt100 *vt); + +/** + * Input a new character. 
+ * Return -1 if the character is not part of a control sequence + * Return -2 if c is not the last char of a control sequence + * Else return the index in vt100_commands[] + */ +int vt100_parser(struct cmdline_vt100 *vt, char c); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_cmdline/meson.build b/src/spdk/dpdk/lib/librte_cmdline/meson.build new file mode 100644 index 00000000..5741817a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/meson.build @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 2 +sources = files('cmdline.c', + 'cmdline_cirbuf.c', + 'cmdline_parse.c', + 'cmdline_parse_etheraddr.c', + 'cmdline_parse_ipaddr.c', + 'cmdline_parse_num.c', + 'cmdline_parse_portlist.c', + 'cmdline_parse_string.c', + 'cmdline_rdline.c', + 'cmdline_socket.c', + 'cmdline_vt100.c') + +headers = files('cmdline.h', + 'cmdline_parse.h', + 'cmdline_parse_num.h', + 'cmdline_parse_ipaddr.h', + 'cmdline_parse_etheraddr.h', + 'cmdline_parse_string.h', + 'cmdline_rdline.h', + 'cmdline_vt100.h', + 'cmdline_socket.h', + 'cmdline_cirbuf.h', + 'cmdline_parse_portlist.h') diff --git a/src/spdk/dpdk/lib/librte_cmdline/rte_cmdline_version.map b/src/spdk/dpdk/lib/librte_cmdline/rte_cmdline_version.map new file mode 100644 index 00000000..04bcb387 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cmdline/rte_cmdline_version.map @@ -0,0 +1,77 @@ +DPDK_2.0 { + global: + + cirbuf_add_buf_head; + cirbuf_add_buf_tail; + cirbuf_add_head; + cirbuf_add_head_safe; + cirbuf_add_tail; + cirbuf_add_tail_safe; + cirbuf_align_left; + cirbuf_align_right; + cirbuf_del_buf_head; + cirbuf_del_buf_tail; + cirbuf_del_head; + cirbuf_del_head_safe; + cirbuf_del_tail; + cirbuf_del_tail_safe; + cirbuf_get_buf_head; + cirbuf_get_buf_tail; + cirbuf_get_head; + cirbuf_get_tail; + cirbuf_init; + cmdline_complete; + cmdline_complete_get_elt_string; + cmdline_complete_get_nb_string; + cmdline_file_new; + cmdline_free; + cmdline_get_help_etheraddr; + cmdline_get_help_ipaddr; + cmdline_get_help_num; + cmdline_get_help_portlist; + cmdline_get_help_string; + cmdline_in; + cmdline_interact; + cmdline_isendoftoken; + cmdline_new; + cmdline_parse; + cmdline_parse_etheraddr; + cmdline_parse_ipaddr; + cmdline_parse_num; + cmdline_parse_portlist; + cmdline_parse_string; + cmdline_printf; + cmdline_quit; + cmdline_set_prompt; + cmdline_stdin_exit; + cmdline_stdin_new; + cmdline_token_etheraddr_ops; + cmdline_token_ipaddr_ops; + cmdline_token_num_ops; + cmdline_token_portlist_ops; + cmdline_token_string_ops; + cmdline_write_char; + rdline_add_history; + rdline_char_in; + rdline_clear_history; + rdline_get_buffer; + rdline_get_history_item; + rdline_init; + rdline_newline; + rdline_quit; + rdline_redisplay; + rdline_reset; + rdline_restart; + rdline_stop; + vt100_init; + vt100_parser; + + local: *; +}; + +DPDK_2.1 { + global: + + cmdline_poll; + +} DPDK_2.0; diff --git a/src/spdk/dpdk/lib/librte_compat/Makefile b/src/spdk/dpdk/lib/librte_compat/Makefile new file mode 100644 index 00000000..61089fe7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compat/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2013 Neil Horman +# All rights reserved. 
+ +include $(RTE_SDK)/mk/rte.vars.mk + + +LIBABIVER := 1 + +# install includes +SYMLINK-y-include := rte_compat.h + +include $(RTE_SDK)/mk/rte.install.mk diff --git a/src/spdk/dpdk/lib/librte_compat/meson.build b/src/spdk/dpdk/lib/librte_compat/meson.build new file mode 100644 index 00000000..82c7eea5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compat/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + + +install_headers('rte_compat.h') + +set_variable('dep_rte_compat', + declare_dependency(include_directories: include_directories('.'))) diff --git a/src/spdk/dpdk/lib/librte_compat/rte_compat.h b/src/spdk/dpdk/lib/librte_compat/rte_compat.h new file mode 100644 index 00000000..3ffaa9e6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compat/rte_compat.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Neil Horman . + * All rights reserved. + */ + +#ifndef _RTE_COMPAT_H_ +#define _RTE_COMPAT_H_ +#include + +#ifdef RTE_BUILD_SHARED_LIB + +/* + * Provides backwards compatibility when updating exported functions. + * When a symol is exported from a library to provide an API, it also provides a + * calling convention (ABI) that is embodied in its name, return type, + * arguments, etc. On occasion that function may need to change to accommodate + * new functionality, behavior, etc. When that occurs, it is desirable to + * allow for backwards compatibility for a time with older binaries that are + * dynamically linked to the dpdk. To support that, the __vsym and + * VERSION_SYMBOL macros are created. They, in conjunction with the + * _version.map file for a given library allow for multiple versions of + * a symbol to exist in a shared library so that older binaries need not be + * immediately recompiled. + * + * Refer to the guidelines document in the docs subdirectory for details on the + * use of these macros + */ + +/* + * Macro Parameters: + * b - function base name + * e - function version extension, to be concatenated with base name + * n - function symbol version string to be applied + * f - function prototype + * p - full function symbol name + */ + +/* + * VERSION_SYMBOL + * Creates a symbol version table entry binding symbol @DPDK_ to the internal + * function name _ + */ +#define VERSION_SYMBOL(b, e, n) __asm__(".symver " RTE_STR(b) RTE_STR(e) ", " RTE_STR(b) "@DPDK_" RTE_STR(n)) + +/* + * BIND_DEFAULT_SYMBOL + * Creates a symbol version entry instructing the linker to bind references to + * symbol to the internal symbol _ + */ +#define BIND_DEFAULT_SYMBOL(b, e, n) __asm__(".symver " RTE_STR(b) RTE_STR(e) ", " RTE_STR(b) "@@DPDK_" RTE_STR(n)) +#define __vsym __attribute__((used)) + +/* + * MAP_STATIC_SYMBOL + * If a function has been bifurcated into multiple versions, none of which + * are defined as the exported symbol name in the map file, this macro can be + * used to alias a specific version of the symbol to its exported name. 
For + * example, if you have 2 versions of a function foo_v1 and foo_v2, where the + * former is mapped to foo@DPDK_1 and the latter is mapped to foo@DPDK_2 when + * building a shared library, this macro can be used to map either foo_v1 or + * foo_v2 to the symbol foo when building a static library, e.g.: + * MAP_STATIC_SYMBOL(void foo(), foo_v2); + */ +#define MAP_STATIC_SYMBOL(f, p) + +#else +/* + * No symbol versioning in use + */ +#define VERSION_SYMBOL(b, e, n) +#define __vsym +#define BIND_DEFAULT_SYMBOL(b, e, n) +#define MAP_STATIC_SYMBOL(f, p) f __attribute__((alias(RTE_STR(p)))) +/* + * RTE_BUILD_SHARED_LIB=n + */ +#endif + +#ifndef ALLOW_EXPERIMENTAL_API + +#define __rte_experimental __attribute__((deprecated)) __attribute__((section(".text.experimental"))) +#else + +#define __rte_experimental \ +__attribute__((section(".text.experimental"))) + +#endif + +#endif /* _RTE_COMPAT_H_ */ diff --git a/src/spdk/dpdk/lib/librte_compressdev/Makefile b/src/spdk/dpdk/lib/librte_compressdev/Makefile new file mode 100644 index 00000000..7ef89e61 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017-2018 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_compressdev.a + +# library version +LIBABIVER := 1 + +# build flags +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -DALLOW_EXPERIMENTAL_API +LDLIBS += -lrte_eal -lrte_mempool -lrte_kvargs + +# library source files +SRCS-y += rte_compressdev.c rte_compressdev_pmd.c rte_comp.c + +# export include files +SYMLINK-y-include += rte_comp.h +SYMLINK-y-include += rte_compressdev.h +# export include files (for PMDs) +SYMLINK-y-include += rte_compressdev_pmd.h +SYMLINK-y-include += rte_compressdev_internal.h + +# versioning export map +EXPORT_MAP := rte_compressdev_version.map + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_compressdev/meson.build b/src/spdk/dpdk/lib/librte_compressdev/meson.build new file mode 100644 index 00000000..5416571c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/meson.build @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +allow_experimental_apis = true +sources = files('rte_compressdev.c', + 'rte_compressdev_pmd.c', + 'rte_comp.c') +headers = files('rte_compressdev.h', + 'rte_compressdev_pmd.h', + 'rte_compressdev_internal.h', + 'rte_comp.h') +deps += ['kvargs', 'mbuf'] diff --git a/src/spdk/dpdk/lib/librte_compressdev/rte_comp.c b/src/spdk/dpdk/lib/librte_compressdev/rte_comp.c new file mode 100644 index 00000000..98ad0cfd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/rte_comp.c @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include "rte_comp.h" +#include "rte_compressdev.h" +#include "rte_compressdev_internal.h" + +const char * __rte_experimental +rte_comp_get_feature_name(uint64_t flag) +{ + switch (flag) { + case RTE_COMP_FF_STATEFUL_COMPRESSION: + return "STATEFUL_COMPRESSION"; + case RTE_COMP_FF_STATEFUL_DECOMPRESSION: + return "STATEFUL_DECOMPRESSION"; + case RTE_COMP_FF_OOP_SGL_IN_SGL_OUT: + return "OOP_SGL_IN_SGL_OUT"; + case RTE_COMP_FF_OOP_SGL_IN_LB_OUT: + return "OOP_SGL_IN_LB_OUT"; + case RTE_COMP_FF_OOP_LB_IN_SGL_OUT: + return "OOP_LB_IN_SGL_OUT"; + case RTE_COMP_FF_MULTI_PKT_CHECKSUM: + return "MULTI_PKT_CHECKSUM"; + case RTE_COMP_FF_ADLER32_CHECKSUM: + return "ADLER32_CHECKSUM"; + case RTE_COMP_FF_CRC32_CHECKSUM: + 
return "CRC32_CHECKSUM"; + case RTE_COMP_FF_CRC32_ADLER32_CHECKSUM: + return "CRC32_ADLER32_CHECKSUM"; + case RTE_COMP_FF_NONCOMPRESSED_BLOCKS: + return "NONCOMPRESSED_BLOCKS"; + case RTE_COMP_FF_SHA1_HASH: + return "SHA1_HASH"; + case RTE_COMP_FF_SHA2_SHA256_HASH: + return "SHA2_SHA256_HASH"; + case RTE_COMP_FF_SHAREABLE_PRIV_XFORM: + return "SHAREABLE_PRIV_XFORM"; + case RTE_COMP_FF_HUFFMAN_FIXED: + return "HUFFMAN_FIXED"; + case RTE_COMP_FF_HUFFMAN_DYNAMIC: + return "HUFFMAN_DYNAMIC"; + default: + return NULL; + } +} + +/** + * Reset the fields of an operation to their default values. + * + * @note The private data associated with the operation is not zeroed. + * + * @param op + * The operation to be reset + */ +static inline void +rte_comp_op_reset(struct rte_comp_op *op) +{ + struct rte_mempool *tmp_mp = op->mempool; + rte_iova_t tmp_iova_addr = op->iova_addr; + + memset(op, 0, sizeof(struct rte_comp_op)); + op->status = RTE_COMP_OP_STATUS_NOT_PROCESSED; + op->iova_addr = tmp_iova_addr; + op->mempool = tmp_mp; +} + +/** + * Private data structure belonging to an operation pool. + */ +struct rte_comp_op_pool_private { + uint16_t user_size; + /**< Size of private user data with each operation. */ +}; + +/** + * Bulk allocate raw element from mempool and return as comp operations + * + * @param mempool + * Compress operation mempool + * @param ops + * Array to place allocated operations + * @param nb_ops + * Number of operations to allocate + * @return + * - 0: Success + * - -ENOENT: Not enough entries in the mempool; no ops are retrieved. + */ +static inline int +rte_comp_op_raw_bulk_alloc(struct rte_mempool *mempool, + struct rte_comp_op **ops, uint16_t nb_ops) +{ + if (rte_mempool_get_bulk(mempool, (void **)ops, nb_ops) == 0) + return nb_ops; + + return 0; +} + +/** Initialise rte_comp_op mempool element */ +static void +rte_comp_op_init(struct rte_mempool *mempool, + __rte_unused void *opaque_arg, + void *_op_data, + __rte_unused unsigned int i) +{ + struct rte_comp_op *op = _op_data; + + memset(_op_data, 0, mempool->elt_size); + + op->status = RTE_COMP_OP_STATUS_NOT_PROCESSED; + op->iova_addr = rte_mem_virt2iova(_op_data); + op->mempool = mempool; +} + +struct rte_mempool * __rte_experimental +rte_comp_op_pool_create(const char *name, + unsigned int nb_elts, unsigned int cache_size, + uint16_t user_size, int socket_id) +{ + struct rte_comp_op_pool_private *priv; + + unsigned int elt_size = sizeof(struct rte_comp_op) + user_size; + + /* lookup mempool in case already allocated */ + struct rte_mempool *mp = rte_mempool_lookup(name); + + if (mp != NULL) { + priv = (struct rte_comp_op_pool_private *) + rte_mempool_get_priv(mp); + + if (mp->elt_size != elt_size || + mp->cache_size < cache_size || + mp->size < nb_elts || + priv->user_size < user_size) { + mp = NULL; + COMPRESSDEV_LOG(ERR, + "Mempool %s already exists but with incompatible parameters", + name); + return NULL; + } + return mp; + } + + mp = rte_mempool_create( + name, + nb_elts, + elt_size, + cache_size, + sizeof(struct rte_comp_op_pool_private), + NULL, + NULL, + rte_comp_op_init, + NULL, + socket_id, + 0); + + if (mp == NULL) { + COMPRESSDEV_LOG(ERR, "Failed to create mempool %s", name); + return NULL; + } + + priv = (struct rte_comp_op_pool_private *) + rte_mempool_get_priv(mp); + + priv->user_size = user_size; + + return mp; +} + +struct rte_comp_op * __rte_experimental +rte_comp_op_alloc(struct rte_mempool *mempool) +{ + struct rte_comp_op *op = NULL; + int retval; + + retval = rte_comp_op_raw_bulk_alloc(mempool, &op, 
1); + if (unlikely(retval < 0)) + return NULL; + + rte_comp_op_reset(op); + + return op; +} + +int __rte_experimental +rte_comp_op_bulk_alloc(struct rte_mempool *mempool, + struct rte_comp_op **ops, uint16_t nb_ops) +{ + int ret; + uint16_t i; + + ret = rte_comp_op_raw_bulk_alloc(mempool, ops, nb_ops); + if (unlikely(ret < nb_ops)) + return ret; + + for (i = 0; i < nb_ops; i++) + rte_comp_op_reset(ops[i]); + + return nb_ops; +} + +/** + * free operation structure + * If operation has been allocate from a rte_mempool, then the operation will + * be returned to the mempool. + * + * @param op + * Compress operation + */ +void __rte_experimental +rte_comp_op_free(struct rte_comp_op *op) +{ + if (op != NULL && op->mempool != NULL) + rte_mempool_put(op->mempool, op); +} diff --git a/src/spdk/dpdk/lib/librte_compressdev/rte_comp.h b/src/spdk/dpdk/lib/librte_compressdev/rte_comp.h new file mode 100644 index 00000000..ee9056ea --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/rte_comp.h @@ -0,0 +1,485 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef _RTE_COMP_H_ +#define _RTE_COMP_H_ + +/** + * @file rte_comp.h + * + * RTE definitions for Data Compression Service + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/** + * compression service feature flags + * + * @note New features flags should be added to the end of the list + * + * Keep these flags synchronised with rte_comp_get_feature_name() + */ +#define RTE_COMP_FF_STATEFUL_COMPRESSION (1ULL << 0) +/**< Stateful compression is supported */ +#define RTE_COMP_FF_STATEFUL_DECOMPRESSION (1ULL << 1) +/**< Stateful decompression is supported */ +#define RTE_COMP_FF_OOP_SGL_IN_SGL_OUT (1ULL << 2) +/**< Out-of-place Scatter-gather (SGL) buffers, + * with multiple segments, are supported in input and output + */ +#define RTE_COMP_FF_OOP_SGL_IN_LB_OUT (1ULL << 3) +/**< Out-of-place Scatter-gather (SGL) buffers are supported + * in input, combined with linear buffers (LB), with a + * single segment, in output + */ +#define RTE_COMP_FF_OOP_LB_IN_SGL_OUT (1ULL << 4) +/**< Out-of-place Scatter-gather (SGL) buffers are supported + * in output, combined with linear buffers (LB) in input + */ +#define RTE_COMP_FF_ADLER32_CHECKSUM (1ULL << 5) +/**< Adler-32 Checksum is supported */ +#define RTE_COMP_FF_CRC32_CHECKSUM (1ULL << 6) +/**< CRC32 Checksum is supported */ +#define RTE_COMP_FF_CRC32_ADLER32_CHECKSUM (1ULL << 7) +/**< Adler-32/CRC32 Checksum is supported */ +#define RTE_COMP_FF_MULTI_PKT_CHECKSUM (1ULL << 8) +/**< Generation of checksum across multiple stateless packets is supported */ +#define RTE_COMP_FF_SHA1_HASH (1ULL << 9) +/**< SHA1 Hash is supported */ +#define RTE_COMP_FF_SHA2_SHA256_HASH (1ULL << 10) +/**< SHA256 Hash of SHA2 family is supported */ +#define RTE_COMP_FF_NONCOMPRESSED_BLOCKS (1ULL << 11) +/**< Creation of non-compressed blocks using RTE_COMP_LEVEL_NONE is supported */ +#define RTE_COMP_FF_SHAREABLE_PRIV_XFORM (1ULL << 12) +/**< Private xforms created by the PMD can be shared + * across multiple stateless operations. If not set, then app needs + * to create as many priv_xforms as it expects to have stateless + * operations in-flight. 
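
A minimal sketch of how an application might act on this feature flag before creating xforms; it relies only on rte_compressdev_capability_get() and the capability structure added later in this patch, while the helper name and the DEFLATE algorithm choice are illustrative assumptions:

#include <rte_comp.h>
#include <rte_compressdev.h>

/* Sketch: decide how many private xforms are needed for 'nb_inflight'
 * stateless DEFLATE ops on 'dev_id', based on SHAREABLE_PRIV_XFORM.
 * Helper name and algorithm choice are illustrative. */
static unsigned int
nb_priv_xforms_needed(uint8_t dev_id, unsigned int nb_inflight)
{
	const struct rte_compressdev_capabilities *cap =
		rte_compressdev_capability_get(dev_id, RTE_COMP_ALGO_DEFLATE);

	if (cap == NULL)
		return 0;	/* DEFLATE not supported on this device */

	if (cap->comp_feature_flags & RTE_COMP_FF_SHAREABLE_PRIV_XFORM)
		return 1;	/* one shared handle is enough */

	return nb_inflight;	/* one handle per in-flight op */
}
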
+ */ +#define RTE_COMP_FF_HUFFMAN_FIXED (1ULL << 13) +/**< Fixed huffman encoding is supported */ +#define RTE_COMP_FF_HUFFMAN_DYNAMIC (1ULL << 14) +/**< Dynamic huffman encoding is supported */ + +/** Status of comp operation */ +enum rte_comp_op_status { + RTE_COMP_OP_STATUS_SUCCESS = 0, + /**< Operation completed successfully */ + RTE_COMP_OP_STATUS_NOT_PROCESSED, + /**< Operation has not yet been processed by the device */ + RTE_COMP_OP_STATUS_INVALID_ARGS, + /**< Operation failed due to invalid arguments in request */ + RTE_COMP_OP_STATUS_ERROR, + /**< Error handling operation */ + RTE_COMP_OP_STATUS_INVALID_STATE, + /**< Operation is invoked in invalid state */ + RTE_COMP_OP_STATUS_OUT_OF_SPACE_TERMINATED, + /**< Output buffer ran out of space before operation completed. + * Error case. Application must resubmit all data with a larger + * output buffer. + */ + RTE_COMP_OP_STATUS_OUT_OF_SPACE_RECOVERABLE, + /**< Output buffer ran out of space before operation completed, but this + * is not an error case. Output data up to op.produced can be used and + * next op in the stream should continue on from op.consumed+1. + */ +}; + +/** Compression Algorithms */ +enum rte_comp_algorithm { + RTE_COMP_ALGO_UNSPECIFIED = 0, + /** No Compression algorithm */ + RTE_COMP_ALGO_NULL, + /**< No compression. + * Pass-through, data is copied unchanged from source buffer to + * destination buffer. + */ + RTE_COMP_ALGO_DEFLATE, + /**< DEFLATE compression algorithm + * https://tools.ietf.org/html/rfc1951 + */ + RTE_COMP_ALGO_LZS, + /**< LZS compression algorithm + * https://tools.ietf.org/html/rfc2395 + */ + RTE_COMP_ALGO_LIST_END +}; + +/** Compression Hash Algorithms */ +enum rte_comp_hash_algorithm { + RTE_COMP_HASH_ALGO_NONE = 0, + /**< No hash */ + RTE_COMP_HASH_ALGO_SHA1, + /**< SHA1 hash algorithm */ + RTE_COMP_HASH_ALGO_SHA2_256, + /**< SHA256 hash algorithm of SHA2 family */ + RTE_COMP_HASH_ALGO_LIST_END +}; + +/**< Compression Level. + * The number is interpreted by each PMD differently. However, lower numbers + * give fastest compression, at the expense of compression ratio while + * higher numbers may give better compression ratios but are likely slower. + */ +#define RTE_COMP_LEVEL_PMD_DEFAULT (-1) +/** Use PMD Default */ +#define RTE_COMP_LEVEL_NONE (0) +/** Output uncompressed blocks if supported by the specified algorithm */ +#define RTE_COMP_LEVEL_MIN (1) +/** Use minimum compression level supported by the PMD */ +#define RTE_COMP_LEVEL_MAX (9) +/** Use maximum compression level supported by the PMD */ + +/** Compression checksum types */ +enum rte_comp_checksum_type { + RTE_COMP_CHECKSUM_NONE, + /**< No checksum generated */ + RTE_COMP_CHECKSUM_CRC32, + /**< Generates a CRC32 checksum, as used by gzip */ + RTE_COMP_CHECKSUM_ADLER32, + /**< Generates an Adler-32 checksum, as used by zlib */ + RTE_COMP_CHECKSUM_CRC32_ADLER32, + /**< Generates both Adler-32 and CRC32 checksums, concatenated. + * CRC32 is in the lower 32bits, Adler-32 in the upper 32 bits. + */ +}; + + +/** Compression Huffman Type - used by DEFLATE algorithm */ +enum rte_comp_huffman { + RTE_COMP_HUFFMAN_DEFAULT, + /**< PMD may choose which Huffman codes to use */ + RTE_COMP_HUFFMAN_FIXED, + /**< Use Fixed Huffman codes */ + RTE_COMP_HUFFMAN_DYNAMIC, + /**< Use Dynamic Huffman codes */ +}; + +/** Compression flush flags */ +enum rte_comp_flush_flag { + RTE_COMP_FLUSH_NONE, + /**< Data is not flushed. Output may remain in the compressor and be + * processed during a following op. 
It may not be possible to decompress + * output until a later op with some other flush flag has been sent. + */ + RTE_COMP_FLUSH_SYNC, + /**< All data should be flushed to output buffer. Output data can be + * decompressed. However state and history is not cleared, so future + * operations may use history from this operation. + */ + RTE_COMP_FLUSH_FULL, + /**< All data should be flushed to output buffer. Output data can be + * decompressed. State and history data is cleared, so future + * ops will be independent of ops processed before this. + */ + RTE_COMP_FLUSH_FINAL + /**< Same as RTE_COMP_FLUSH_FULL but if op.algo is RTE_COMP_ALGO_DEFLATE + * then bfinal bit is set in the last block. + */ +}; + +/** Compression transform types */ +enum rte_comp_xform_type { + RTE_COMP_COMPRESS, + /**< Compression service - compress */ + RTE_COMP_DECOMPRESS, + /**< Compression service - decompress */ +}; + +/** Compression operation type */ +enum rte_comp_op_type { + RTE_COMP_OP_STATELESS, + /**< All data to be processed is submitted in the op, no state or + * history from previous ops is used and none will be stored for future + * ops. Flush flag must be set to either FLUSH_FULL or FLUSH_FINAL. + */ + RTE_COMP_OP_STATEFUL + /**< There may be more data to be processed after this op, it's part of + * a stream of data. State and history from previous ops can be used + * and resulting state and history can be stored for future ops, + * depending on flush flag. + */ +}; + + +/** Parameters specific to the deflate algorithm */ +struct rte_comp_deflate_params { + enum rte_comp_huffman huffman; + /**< Compression huffman encoding type */ +}; + +/** Setup Data for compression */ +struct rte_comp_compress_xform { + enum rte_comp_algorithm algo; + /**< Algorithm to use for compress operation */ + union { + struct rte_comp_deflate_params deflate; + /**< Parameters specific to the deflate algorithm */ + }; /**< Algorithm specific parameters */ + int level; + /**< Compression level */ + uint8_t window_size; + /**< Base two log value of sliding window to be used. If window size + * can't be supported by the PMD then it may fall back to a smaller + * size. This is likely to result in a worse compression ratio. + */ + enum rte_comp_checksum_type chksum; + /**< Type of checksum to generate on the uncompressed data */ + enum rte_comp_hash_algorithm hash_algo; + /**< Hash algorithm to be used with compress operation. Hash is always + * done on plaintext. + */ +}; + +/** + * Setup Data for decompression. + */ +struct rte_comp_decompress_xform { + enum rte_comp_algorithm algo; + /**< Algorithm to use for decompression */ + enum rte_comp_checksum_type chksum; + /**< Type of checksum to generate on the decompressed data */ + uint8_t window_size; + /**< Base two log value of sliding window which was used to generate + * compressed data. If window size can't be supported by the PMD then + * setup of stream or private_xform should fail. + */ + enum rte_comp_hash_algorithm hash_algo; + /**< Hash algorithm to be used with decompress operation. Hash is always + * done on plaintext. + */ +}; + +/** + * Compression transform structure. + * + * This is used to specify the compression transforms required. + * Each transform structure can hold a single transform, the type field is + * used to specify which transform is contained within the union. 
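
A minimal sketch of filling the transform structure defined just below for DEFLATE compression; the dynamic-Huffman, PMD-default level, 2^15 window and no-checksum/no-hash choices are illustrative assumptions, not defaults mandated by the API, and a real application should pick a window size inside the device's advertised range:

#include <rte_comp.h>

/* Sketch: a DEFLATE compress transform using dynamic Huffman codes, the
 * PMD's default level, a 2^15-byte window and no checksum or hash. */
static const struct rte_comp_xform deflate_xform = {
	.type = RTE_COMP_COMPRESS,
	.compress = {
		.algo = RTE_COMP_ALGO_DEFLATE,
		.deflate.huffman = RTE_COMP_HUFFMAN_DYNAMIC,
		.level = RTE_COMP_LEVEL_PMD_DEFAULT,
		.window_size = 15,	/* illustrative value */
		.chksum = RTE_COMP_CHECKSUM_NONE,
		.hash_algo = RTE_COMP_HASH_ALGO_NONE,
	},
};
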
+ */ +struct rte_comp_xform { + enum rte_comp_xform_type type; + /**< xform type */ + union { + struct rte_comp_compress_xform compress; + /**< xform for compress operation */ + struct rte_comp_decompress_xform decompress; + /**< decompress xform */ + }; +}; + +/** + * Compression Operation. + * + * This structure contains data relating to performing a compression + * operation on the referenced mbuf data buffers. + * + * Comp operations are enqueued and dequeued in comp PMDs using the + * rte_compressdev_enqueue_burst() / rte_compressdev_dequeue_burst() APIs + */ +struct rte_comp_op { + enum rte_comp_op_type op_type; + union { + void *private_xform; + /**< Stateless private PMD data derived from an rte_comp_xform. + * A handle returned by rte_compressdev_private_xform_create() + * must be attached to operations of op_type RTE_COMP_STATELESS. + */ + void *stream; + /**< Private PMD data derived initially from an rte_comp_xform, + * which holds state and history data and evolves as operations + * are processed. rte_compressdev_stream_create() must be called + * on a device for all STATEFUL data streams and the resulting + * stream attached to the one or more operations associated + * with the data stream. + * All operations in a stream must be sent to the same device. + */ + }; + + struct rte_mempool *mempool; + /**< Pool from which operation is allocated */ + rte_iova_t iova_addr; + /**< IOVA address of this operation */ + struct rte_mbuf *m_src; + /**< source mbuf + * The total size of the input buffer(s) can be retrieved using + * rte_pktmbuf_data_len(m_src). The max data size which can fit in a + * single mbuf is limited by the uint16_t rte_mbuf.data_len to 64k-1. + * If the input data is bigger than this it can be passed to the PMD in + * a chain of mbufs if the PMD's capabilities indicate it supports this. + */ + struct rte_mbuf *m_dst; + /**< destination mbuf + * The total size of the output buffer(s) can be retrieved using + * rte_pktmbuf_data_len(m_dst). The max data size which can fit in a + * single mbuf is limited by the uint16_t rte_mbuf.data_len to 64k-1. + * If the output data is expected to be bigger than this a chain of + * mbufs can be passed to the PMD if the PMD's capabilities indicate + * it supports this. + */ + + struct { + uint32_t offset; + /**< Starting point for compression or decompression, + * specified as number of bytes from start of packet in + * source buffer. + * This offset starts from the first segment + * of the buffer, in case the m_src is a chain of mbufs. + * Starting point for checksum generation in compress direction. + */ + uint32_t length; + /**< The length, in bytes, of the data in source buffer + * to be compressed or decompressed. + * Also the length of the data over which the checksum + * should be generated in compress direction + */ + } src; + struct { + uint32_t offset; + /**< Starting point for writing output data, specified as + * number of bytes from start of packet in dest + * buffer. + * This offset starts from the first segment + * of the buffer, in case the m_dst is a chain of mbufs. + * Starting point for checksum generation in + * decompress direction. + */ + } dst; + struct { + uint8_t *digest; + /**< Output buffer to store hash output, if enabled in xform. + * Buffer would contain valid value only after an op with + * flush flag = RTE_COMP_FLUSH_FULL/FLUSH_FINAL is processed + * successfully. + * + * Length of buffer should be contiguous and large enough to + * accommodate digest produced by specific hash algo. 
+ */ + rte_iova_t iova_addr; + /**< IO address of the buffer */ + } hash; + enum rte_comp_flush_flag flush_flag; + /**< Defines flush characteristics for the output data. + * Only applicable in compress direction + */ + uint64_t input_chksum; + /**< An input checksum can be provided to generate a + * cumulative checksum across sequential blocks in a STATELESS stream. + * Checksum type is as specified in xform chksum_type + */ + uint64_t output_chksum; + /**< If a checksum is generated it will be written in here. + * Checksum type is as specified in xform chksum_type. + */ + uint32_t consumed; + /**< The number of bytes from the source buffer + * which were compressed/decompressed. + */ + uint32_t produced; + /**< The number of bytes written to the destination buffer + * which were compressed/decompressed. + */ + uint64_t debug_status; + /**< + * Status of the operation is returned in the status param. + * This field allows the PMD to pass back extra + * pmd-specific debug information. Value is not defined on the API. + */ + uint8_t status; + /**< + * Operation status - use values from enum rte_comp_status. + * This is reset to + * RTE_COMP_OP_STATUS_NOT_PROCESSED on allocation from mempool and + * will be set to RTE_COMP_OP_STATUS_SUCCESS after operation + * is successfully processed by a PMD + */ +} __rte_cache_aligned; + +/** + * Creates an operation pool + * + * @param name + * Compress pool name + * @param nb_elts + * Number of elements in pool + * @param cache_size + * Number of elements to cache on lcore, see + * *rte_mempool_create* for further details about cache size + * @param user_size + * Size of private data to allocate for user with each operation + * @param socket_id + * Socket to identifier allocate memory on + * @return + * - On success pointer to mempool + * - On failure NULL + */ +struct rte_mempool * __rte_experimental +rte_comp_op_pool_create(const char *name, + unsigned int nb_elts, unsigned int cache_size, + uint16_t user_size, int socket_id); + +/** + * Allocate an operation from a mempool with default parameters set + * + * @param mempool + * Compress operation mempool + * + * @return + * - On success returns a valid rte_comp_op structure + * - On failure returns NULL + */ +struct rte_comp_op * __rte_experimental +rte_comp_op_alloc(struct rte_mempool *mempool); + +/** + * Bulk allocate operations from a mempool with default parameters set + * + * @param mempool + * Compress operation mempool + * @param ops + * Array to place allocated operations + * @param nb_ops + * Number of operations to allocate + * @return + * - 0: Success + * - -ENOENT: Not enough entries in the mempool; no ops are retrieved. + */ +int __rte_experimental +rte_comp_op_bulk_alloc(struct rte_mempool *mempool, + struct rte_comp_op **ops, uint16_t nb_ops); + +/** + * Free operation structure + * If operation has been allocate from a rte_mempool, then the operation will + * be returned to the mempool. + * + * @param op + * Compress operation + */ +void __rte_experimental +rte_comp_op_free(struct rte_comp_op *op); + +/** + * Get the name of a compress service feature flag + * + * @param flag + * The mask describing the flag + * + * @return + * The name of this flag, or NULL if it's not a valid feature flag. 
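
A minimal sketch of preparing a stateless compress operation with the op fields defined above; the helper name is illustrative, 'op_pool' is assumed to come from rte_comp_op_pool_create(), and 'priv_xform' from rte_compressdev_private_xform_create():

#include <rte_comp.h>
#include <rte_mbuf.h>

/* Sketch: fill one stateless compress op for an already-populated source
 * mbuf; the whole input is submitted in a single op. */
static struct rte_comp_op *
prepare_compress_op(struct rte_mempool *op_pool, void *priv_xform,
		struct rte_mbuf *src, struct rte_mbuf *dst)
{
	struct rte_comp_op *op = rte_comp_op_alloc(op_pool);

	if (op == NULL)
		return NULL;

	op->op_type = RTE_COMP_OP_STATELESS;
	op->private_xform = priv_xform;
	op->m_src = src;
	op->m_dst = dst;
	op->src.offset = 0;
	op->src.length = rte_pktmbuf_data_len(src);
	op->dst.offset = 0;
	op->flush_flag = RTE_COMP_FLUSH_FINAL;	/* whole input in one op */

	return op;
}
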
+ */ +const char * __rte_experimental +rte_comp_get_feature_name(uint64_t flag); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_COMP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev.c b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev.c new file mode 100644 index 00000000..9091dd6e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev.c @@ -0,0 +1,772 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "rte_compressdev.h" +#include "rte_compressdev_internal.h" +#include "rte_compressdev_pmd.h" + +#define RTE_COMPRESSDEV_DETACHED (0) +#define RTE_COMPRESSDEV_ATTACHED (1) + +struct rte_compressdev rte_comp_devices[RTE_COMPRESS_MAX_DEVS]; + +struct rte_compressdev *rte_compressdevs = &rte_comp_devices[0]; + +static struct rte_compressdev_global compressdev_globals = { + .devs = &rte_comp_devices[0], + .data = { NULL }, + .nb_devs = 0, + .max_devs = RTE_COMPRESS_MAX_DEVS +}; + +struct rte_compressdev_global *rte_compressdev_globals = &compressdev_globals; + +const struct rte_compressdev_capabilities * __rte_experimental +rte_compressdev_capability_get(uint8_t dev_id, + enum rte_comp_algorithm algo) +{ + const struct rte_compressdev_capabilities *capability; + struct rte_compressdev_info dev_info; + int i = 0; + + if (dev_id >= compressdev_globals.nb_devs) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%d", dev_id); + return NULL; + } + rte_compressdev_info_get(dev_id, &dev_info); + + while ((capability = &dev_info.capabilities[i++])->algo != + RTE_COMP_ALGO_UNSPECIFIED){ + if (capability->algo == algo) + return capability; + } + + return NULL; +} + +const char * __rte_experimental +rte_compressdev_get_feature_name(uint64_t flag) +{ + switch (flag) { + case RTE_COMPDEV_FF_HW_ACCELERATED: + return "HW_ACCELERATED"; + case RTE_COMPDEV_FF_CPU_SSE: + return "CPU_SSE"; + case RTE_COMPDEV_FF_CPU_AVX: + return "CPU_AVX"; + case RTE_COMPDEV_FF_CPU_AVX2: + return "CPU_AVX2"; + case RTE_COMPDEV_FF_CPU_AVX512: + return "CPU_AVX512"; + case RTE_COMPDEV_FF_CPU_NEON: + return "CPU_NEON"; + default: + return NULL; + } +} + +static struct rte_compressdev * +rte_compressdev_get_dev(uint8_t dev_id) +{ + return &rte_compressdev_globals->devs[dev_id]; +} + +struct rte_compressdev * __rte_experimental +rte_compressdev_pmd_get_named_dev(const char *name) +{ + struct rte_compressdev *dev; + unsigned int i; + + if (name == NULL) + return NULL; + + for (i = 0; i < rte_compressdev_globals->max_devs; i++) { + dev = &rte_compressdev_globals->devs[i]; + + if ((dev->attached == RTE_COMPRESSDEV_ATTACHED) && + (strcmp(dev->data->name, name) == 0)) + return dev; + } + + return NULL; +} + +static unsigned int +rte_compressdev_is_valid_dev(uint8_t dev_id) +{ + struct rte_compressdev *dev = NULL; + + if (dev_id >= rte_compressdev_globals->nb_devs) + return 0; + + dev = rte_compressdev_get_dev(dev_id); + if (dev->attached != RTE_COMPRESSDEV_ATTACHED) + return 0; + else + return 1; +} + + +int __rte_experimental +rte_compressdev_get_dev_id(const char *name) +{ + unsigned int i; + + if (name == NULL) + return -1; + + for (i = 0; i < rte_compressdev_globals->nb_devs; i++) + if ((strcmp(rte_compressdev_globals->devs[i].data->name, name) + == 0) && + (rte_compressdev_globals->devs[i].attached == + RTE_COMPRESSDEV_ATTACHED)) + return i; + + return -1; +} + +uint8_t __rte_experimental +rte_compressdev_count(void) +{ + return rte_compressdev_globals->nb_devs; +} + 
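
A minimal sketch of enumerating attached devices with the functions implemented in this file; the helper name and output format are illustrative, and devices are assumed to have been probed by EAL:

#include <stdio.h>

#include <rte_compressdev.h>

/* Sketch: print the name, driver, socket and queue-pair limit of every
 * attached compress device. */
static void
dump_compressdevs(void)
{
	unsigned int i, nb = rte_compressdev_count();

	for (i = 0; i < nb; i++) {
		struct rte_compressdev_info info;

		rte_compressdev_info_get(i, &info);
		printf("dev %u: %s (driver %s, socket %d, max qps %u)\n",
			i, rte_compressdev_name_get(i), info.driver_name,
			rte_compressdev_socket_id(i),
			(unsigned int)info.max_nb_queue_pairs);
	}
}
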
+uint8_t __rte_experimental +rte_compressdev_devices_get(const char *driver_name, uint8_t *devices, + uint8_t nb_devices) +{ + uint8_t i, count = 0; + struct rte_compressdev *devs = rte_compressdev_globals->devs; + uint8_t max_devs = rte_compressdev_globals->max_devs; + + for (i = 0; i < max_devs && count < nb_devices; i++) { + + if (devs[i].attached == RTE_COMPRESSDEV_ATTACHED) { + int cmp; + + cmp = strncmp(devs[i].device->driver->name, + driver_name, + strlen(driver_name)); + + if (cmp == 0) + devices[count++] = devs[i].data->dev_id; + } + } + + return count; +} + +int __rte_experimental +rte_compressdev_socket_id(uint8_t dev_id) +{ + struct rte_compressdev *dev; + + if (!rte_compressdev_is_valid_dev(dev_id)) + return -1; + + dev = rte_compressdev_get_dev(dev_id); + + return dev->data->socket_id; +} + +static inline int +rte_compressdev_data_alloc(uint8_t dev_id, struct rte_compressdev_data **data, + int socket_id) +{ + char mz_name[RTE_COMPRESSDEV_NAME_MAX_LEN]; + const struct rte_memzone *mz; + int n; + + /* generate memzone name */ + n = snprintf(mz_name, sizeof(mz_name), + "rte_compressdev_data_%u", dev_id); + if (n >= (int)sizeof(mz_name)) + return -EINVAL; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + mz = rte_memzone_reserve(mz_name, + sizeof(struct rte_compressdev_data), + socket_id, 0); + } else + mz = rte_memzone_lookup(mz_name); + + if (mz == NULL) + return -ENOMEM; + + *data = mz->addr; + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + memset(*data, 0, sizeof(struct rte_compressdev_data)); + + return 0; +} + +static uint8_t +rte_compressdev_find_free_device_index(void) +{ + uint8_t dev_id; + + for (dev_id = 0; dev_id < RTE_COMPRESS_MAX_DEVS; dev_id++) { + if (rte_comp_devices[dev_id].attached == + RTE_COMPRESSDEV_DETACHED) + return dev_id; + } + return RTE_COMPRESS_MAX_DEVS; +} + +struct rte_compressdev * __rte_experimental +rte_compressdev_pmd_allocate(const char *name, int socket_id) +{ + struct rte_compressdev *compressdev; + uint8_t dev_id; + + if (rte_compressdev_pmd_get_named_dev(name) != NULL) { + COMPRESSDEV_LOG(ERR, + "comp device with name %s already allocated!", name); + return NULL; + } + + dev_id = rte_compressdev_find_free_device_index(); + if (dev_id == RTE_COMPRESS_MAX_DEVS) { + COMPRESSDEV_LOG(ERR, "Reached maximum number of comp devices"); + return NULL; + } + compressdev = rte_compressdev_get_dev(dev_id); + + if (compressdev->data == NULL) { + struct rte_compressdev_data *compressdev_data = + compressdev_globals.data[dev_id]; + + int retval = rte_compressdev_data_alloc(dev_id, + &compressdev_data, socket_id); + + if (retval < 0 || compressdev_data == NULL) + return NULL; + + compressdev->data = compressdev_data; + + snprintf(compressdev->data->name, RTE_COMPRESSDEV_NAME_MAX_LEN, + "%s", name); + + compressdev->data->dev_id = dev_id; + compressdev->data->socket_id = socket_id; + compressdev->data->dev_started = 0; + + compressdev->attached = RTE_COMPRESSDEV_ATTACHED; + + compressdev_globals.nb_devs++; + } + + return compressdev; +} + +int __rte_experimental +rte_compressdev_pmd_release_device(struct rte_compressdev *compressdev) +{ + int ret; + + if (compressdev == NULL) + return -EINVAL; + + /* Close device only if device operations have been set */ + if (compressdev->dev_ops) { + ret = rte_compressdev_close(compressdev->data->dev_id); + if (ret < 0) + return ret; + } + + compressdev->attached = RTE_COMPRESSDEV_DETACHED; + compressdev_globals.nb_devs--; + return 0; +} + +uint16_t __rte_experimental +rte_compressdev_queue_pair_count(uint8_t 
dev_id) +{ + struct rte_compressdev *dev; + + dev = &rte_comp_devices[dev_id]; + return dev->data->nb_queue_pairs; +} + +static int +rte_compressdev_queue_pairs_config(struct rte_compressdev *dev, + uint16_t nb_qpairs, int socket_id) +{ + struct rte_compressdev_info dev_info; + void **qp; + unsigned int i; + + if ((dev == NULL) || (nb_qpairs < 1)) { + COMPRESSDEV_LOG(ERR, "invalid param: dev %p, nb_queues %u", + dev, nb_qpairs); + return -EINVAL; + } + + COMPRESSDEV_LOG(DEBUG, "Setup %d queues pairs on device %u", + nb_qpairs, dev->data->dev_id); + + memset(&dev_info, 0, sizeof(struct rte_compressdev_info)); + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP); + (*dev->dev_ops->dev_infos_get)(dev, &dev_info); + + if ((dev_info.max_nb_queue_pairs != 0) && + (nb_qpairs > dev_info.max_nb_queue_pairs)) { + COMPRESSDEV_LOG(ERR, "Invalid num queue_pairs (%u) for dev %u", + nb_qpairs, dev->data->dev_id); + return -EINVAL; + } + + if (dev->data->queue_pairs == NULL) { /* first time configuration */ + dev->data->queue_pairs = rte_zmalloc_socket( + "compressdev->queue_pairs", + sizeof(dev->data->queue_pairs[0]) * nb_qpairs, + RTE_CACHE_LINE_SIZE, socket_id); + + if (dev->data->queue_pairs == NULL) { + dev->data->nb_queue_pairs = 0; + COMPRESSDEV_LOG(ERR, + "failed to get memory for qp meta data, nb_queues %u", + nb_qpairs); + return -(ENOMEM); + } + } else { /* re-configure */ + int ret; + uint16_t old_nb_queues = dev->data->nb_queue_pairs; + + qp = dev->data->queue_pairs; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_pair_release, + -ENOTSUP); + + for (i = nb_qpairs; i < old_nb_queues; i++) { + ret = (*dev->dev_ops->queue_pair_release)(dev, i); + if (ret < 0) + return ret; + } + + qp = rte_realloc(qp, sizeof(qp[0]) * nb_qpairs, + RTE_CACHE_LINE_SIZE); + if (qp == NULL) { + COMPRESSDEV_LOG(ERR, + "failed to realloc qp meta data, nb_queues %u", + nb_qpairs); + return -(ENOMEM); + } + + if (nb_qpairs > old_nb_queues) { + uint16_t new_qs = nb_qpairs - old_nb_queues; + + memset(qp + old_nb_queues, 0, + sizeof(qp[0]) * new_qs); + } + + dev->data->queue_pairs = qp; + + } + dev->data->nb_queue_pairs = nb_qpairs; + return 0; +} + +static int +rte_compressdev_queue_pairs_release(struct rte_compressdev *dev) +{ + uint16_t num_qps, i; + int ret; + + if (dev == NULL) { + COMPRESSDEV_LOG(ERR, "invalid param: dev %p", dev); + return -EINVAL; + } + + num_qps = dev->data->nb_queue_pairs; + + if (num_qps == 0) + return 0; + + COMPRESSDEV_LOG(DEBUG, "Free %d queues pairs on device %u", + dev->data->nb_queue_pairs, dev->data->dev_id); + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_pair_release, + -ENOTSUP); + + for (i = 0; i < num_qps; i++) { + ret = (*dev->dev_ops->queue_pair_release)(dev, i); + if (ret < 0) + return ret; + } + + if (dev->data->queue_pairs != NULL) + rte_free(dev->data->queue_pairs); + dev->data->queue_pairs = NULL; + dev->data->nb_queue_pairs = 0; + + return 0; +} + +int __rte_experimental +rte_compressdev_configure(uint8_t dev_id, struct rte_compressdev_config *config) +{ + struct rte_compressdev *dev; + int diag; + + if (!rte_compressdev_is_valid_dev(dev_id)) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%" PRIu8, dev_id); + return -EINVAL; + } + + dev = &rte_comp_devices[dev_id]; + + if (dev->data->dev_started) { + COMPRESSDEV_LOG(ERR, + "device %d must be stopped to allow configuration", dev_id); + return -EBUSY; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP); + + /* Setup new number of queue pairs and reconfigure device. 
*/ + diag = rte_compressdev_queue_pairs_config(dev, config->nb_queue_pairs, + config->socket_id); + if (diag != 0) { + COMPRESSDEV_LOG(ERR, + "dev%d rte_comp_dev_queue_pairs_config = %d", + dev_id, diag); + return diag; + } + + return (*dev->dev_ops->dev_configure)(dev, config); +} + +int __rte_experimental +rte_compressdev_start(uint8_t dev_id) +{ + struct rte_compressdev *dev; + int diag; + + COMPRESSDEV_LOG(DEBUG, "Start dev_id=%" PRIu8, dev_id); + + if (!rte_compressdev_is_valid_dev(dev_id)) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%" PRIu8, dev_id); + return -EINVAL; + } + + dev = &rte_comp_devices[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_start, -ENOTSUP); + + if (dev->data->dev_started != 0) { + COMPRESSDEV_LOG(ERR, + "Device with dev_id=%" PRIu8 " already started", dev_id); + return 0; + } + + diag = (*dev->dev_ops->dev_start)(dev); + if (diag == 0) + dev->data->dev_started = 1; + else + return diag; + + return 0; +} + +void __rte_experimental +rte_compressdev_stop(uint8_t dev_id) +{ + struct rte_compressdev *dev; + + if (!rte_compressdev_is_valid_dev(dev_id)) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%" PRIu8, dev_id); + return; + } + + dev = &rte_comp_devices[dev_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_stop); + + if (dev->data->dev_started == 0) { + COMPRESSDEV_LOG(ERR, + "Device with dev_id=%" PRIu8 " already stopped", dev_id); + return; + } + + (*dev->dev_ops->dev_stop)(dev); + dev->data->dev_started = 0; +} + +int __rte_experimental +rte_compressdev_close(uint8_t dev_id) +{ + struct rte_compressdev *dev; + int retval; + + if (!rte_compressdev_is_valid_dev(dev_id)) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%" PRIu8, dev_id); + return -1; + } + + dev = &rte_comp_devices[dev_id]; + + /* Device must be stopped before it can be closed */ + if (dev->data->dev_started == 1) { + COMPRESSDEV_LOG(ERR, "Device %u must be stopped before closing", + dev_id); + return -EBUSY; + } + + /* Free queue pairs memory */ + retval = rte_compressdev_queue_pairs_release(dev); + + if (retval < 0) + return retval; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP); + retval = (*dev->dev_ops->dev_close)(dev); + + if (retval < 0) + return retval; + + return 0; +} + +int __rte_experimental +rte_compressdev_queue_pair_setup(uint8_t dev_id, uint16_t queue_pair_id, + uint32_t max_inflight_ops, int socket_id) +{ + struct rte_compressdev *dev; + + if (!rte_compressdev_is_valid_dev(dev_id)) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%" PRIu8, dev_id); + return -EINVAL; + } + + dev = &rte_comp_devices[dev_id]; + if (queue_pair_id >= dev->data->nb_queue_pairs) { + COMPRESSDEV_LOG(ERR, "Invalid queue_pair_id=%d", queue_pair_id); + return -EINVAL; + } + + if (dev->data->dev_started) { + COMPRESSDEV_LOG(ERR, + "device %d must be stopped to allow configuration", dev_id); + return -EBUSY; + } + + if (max_inflight_ops == 0) { + COMPRESSDEV_LOG(ERR, + "Invalid maximum number of inflight operations"); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_pair_setup, -ENOTSUP); + + return (*dev->dev_ops->queue_pair_setup)(dev, queue_pair_id, + max_inflight_ops, socket_id); +} + +uint16_t __rte_experimental +rte_compressdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, + struct rte_comp_op **ops, uint16_t nb_ops) +{ + struct rte_compressdev *dev = &rte_compressdevs[dev_id]; + + nb_ops = (*dev->dequeue_burst) + (dev->data->queue_pairs[qp_id], ops, nb_ops); + + return nb_ops; +} + +uint16_t __rte_experimental +rte_compressdev_enqueue_burst(uint8_t dev_id, uint16_t qp_id, + 
struct rte_comp_op **ops, uint16_t nb_ops) +{ + struct rte_compressdev *dev = &rte_compressdevs[dev_id]; + + return (*dev->enqueue_burst)( + dev->data->queue_pairs[qp_id], ops, nb_ops); +} + +int __rte_experimental +rte_compressdev_stats_get(uint8_t dev_id, struct rte_compressdev_stats *stats) +{ + struct rte_compressdev *dev; + + if (!rte_compressdev_is_valid_dev(dev_id)) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%d", dev_id); + return -ENODEV; + } + + if (stats == NULL) { + COMPRESSDEV_LOG(ERR, "Invalid stats ptr"); + return -EINVAL; + } + + dev = &rte_comp_devices[dev_id]; + memset(stats, 0, sizeof(*stats)); + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP); + (*dev->dev_ops->stats_get)(dev, stats); + return 0; +} + +void __rte_experimental +rte_compressdev_stats_reset(uint8_t dev_id) +{ + struct rte_compressdev *dev; + + if (!rte_compressdev_is_valid_dev(dev_id)) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%" PRIu8, dev_id); + return; + } + + dev = &rte_comp_devices[dev_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->stats_reset); + (*dev->dev_ops->stats_reset)(dev); +} + + +void __rte_experimental +rte_compressdev_info_get(uint8_t dev_id, struct rte_compressdev_info *dev_info) +{ + struct rte_compressdev *dev; + + if (dev_id >= compressdev_globals.nb_devs) { + COMPRESSDEV_LOG(ERR, "Invalid dev_id=%d", dev_id); + return; + } + + dev = &rte_comp_devices[dev_id]; + + memset(dev_info, 0, sizeof(struct rte_compressdev_info)); + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_infos_get); + (*dev->dev_ops->dev_infos_get)(dev, dev_info); + + dev_info->driver_name = dev->device->driver->name; +} + +int __rte_experimental +rte_compressdev_private_xform_create(uint8_t dev_id, + const struct rte_comp_xform *xform, + void **priv_xform) +{ + struct rte_compressdev *dev; + int ret; + + dev = rte_compressdev_get_dev(dev_id); + + if (xform == NULL || priv_xform == NULL || dev == NULL) + return -EINVAL; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->private_xform_create, -ENOTSUP); + ret = (*dev->dev_ops->private_xform_create)(dev, xform, priv_xform); + if (ret < 0) { + COMPRESSDEV_LOG(ERR, + "dev_id %d failed to create private_xform: err=%d", + dev_id, ret); + return ret; + }; + + return 0; +} + +int __rte_experimental +rte_compressdev_private_xform_free(uint8_t dev_id, void *priv_xform) +{ + struct rte_compressdev *dev; + int ret; + + dev = rte_compressdev_get_dev(dev_id); + + if (dev == NULL || priv_xform == NULL) + return -EINVAL; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->private_xform_free, -ENOTSUP); + ret = dev->dev_ops->private_xform_free(dev, priv_xform); + if (ret < 0) { + COMPRESSDEV_LOG(ERR, + "dev_id %d failed to free private xform: err=%d", + dev_id, ret); + return ret; + }; + + return 0; +} + +int __rte_experimental +rte_compressdev_stream_create(uint8_t dev_id, + const struct rte_comp_xform *xform, + void **stream) +{ + struct rte_compressdev *dev; + int ret; + + dev = rte_compressdev_get_dev(dev_id); + + if (xform == NULL || dev == NULL || stream == NULL) + return -EINVAL; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stream_create, -ENOTSUP); + ret = (*dev->dev_ops->stream_create)(dev, xform, stream); + if (ret < 0) { + COMPRESSDEV_LOG(ERR, + "dev_id %d failed to create stream: err=%d", + dev_id, ret); + return ret; + }; + + return 0; +} + + +int __rte_experimental +rte_compressdev_stream_free(uint8_t dev_id, void *stream) +{ + struct rte_compressdev *dev; + int ret; + + dev = rte_compressdev_get_dev(dev_id); + + if (dev == NULL || stream == NULL) + return -EINVAL; + + 
RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stream_free, -ENOTSUP); + ret = dev->dev_ops->stream_free(dev, stream); + if (ret < 0) { + COMPRESSDEV_LOG(ERR, + "dev_id %d failed to free stream: err=%d", + dev_id, ret); + return ret; + }; + + return 0; +} + +const char * __rte_experimental +rte_compressdev_name_get(uint8_t dev_id) +{ + struct rte_compressdev *dev = rte_compressdev_get_dev(dev_id); + + if (dev == NULL) + return NULL; + + return dev->data->name; +} + +RTE_INIT(rte_compressdev_log) +{ + compressdev_logtype = rte_log_register("lib.compressdev"); + if (compressdev_logtype >= 0) + rte_log_set_level(compressdev_logtype, RTE_LOG_NOTICE); +} diff --git a/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev.h b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev.h new file mode 100644 index 00000000..5b4fca4d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev.h @@ -0,0 +1,540 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef _RTE_COMPRESSDEV_H_ +#define _RTE_COMPRESSDEV_H_ + +/** + * @file rte_compressdev.h + * + * RTE Compression Device APIs + * + * Defines comp device APIs for the provisioning of compression operations. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include "rte_comp.h" + +/** + * Parameter log base 2 range description. + * Final value will be 2^value. + */ +struct rte_param_log2_range { + uint8_t min; /**< Minimum log2 value */ + uint8_t max; /**< Maximum log2 value */ + uint8_t increment; + /**< If a range of sizes are supported, + * this parameter is used to indicate + * increments in base 2 log byte value + * that are supported between the minimum and maximum + */ +}; + +/** Structure used to capture a capability of a comp device */ +struct rte_compressdev_capabilities { + enum rte_comp_algorithm algo; + /* Compression algorithm */ + uint64_t comp_feature_flags; + /**< Bitmask of flags for compression service features */ + struct rte_param_log2_range window_size; + /**< Window size range in base two log byte values */ +}; + +/** Macro used at end of comp PMD list */ +#define RTE_COMP_END_OF_CAPABILITIES_LIST() \ + { RTE_COMP_ALGO_UNSPECIFIED } + +const struct rte_compressdev_capabilities * __rte_experimental +rte_compressdev_capability_get(uint8_t dev_id, + enum rte_comp_algorithm algo); + +/** + * compression device supported feature flags + * + * @note New features flags should be added to the end of the list + * + * Keep these flags synchronised with rte_compressdev_get_feature_name() + */ +#define RTE_COMPDEV_FF_HW_ACCELERATED (1ULL << 0) +/**< Operations are off-loaded to an external hardware accelerator */ +#define RTE_COMPDEV_FF_CPU_SSE (1ULL << 1) +/**< Utilises CPU SIMD SSE instructions */ +#define RTE_COMPDEV_FF_CPU_AVX (1ULL << 2) +/**< Utilises CPU SIMD AVX instructions */ +#define RTE_COMPDEV_FF_CPU_AVX2 (1ULL << 3) +/**< Utilises CPU SIMD AVX2 instructions */ +#define RTE_COMPDEV_FF_CPU_AVX512 (1ULL << 4) +/**< Utilises CPU SIMD AVX512 instructions */ +#define RTE_COMPDEV_FF_CPU_NEON (1ULL << 5) +/**< Utilises CPU NEON instructions */ + +/** + * Get the name of a compress device feature flag. + * + * @param flag + * The mask describing the flag + * + * @return + * The name of this flag, or NULL if it's not a valid feature flag. + */ +const char * __rte_experimental +rte_compressdev_get_feature_name(uint64_t flag); + +/** comp device information */ +struct rte_compressdev_info { + const char *driver_name; /**< Driver name. 
*/ + uint64_t feature_flags; /**< Feature flags */ + const struct rte_compressdev_capabilities *capabilities; + /**< Array of devices supported capabilities */ + uint16_t max_nb_queue_pairs; + /**< Maximum number of queues pairs supported by device. + * (If 0, there is no limit in maximum number of queue pairs) + */ +}; + +/** comp device statistics */ +struct rte_compressdev_stats { + uint64_t enqueued_count; + /**< Count of all operations enqueued */ + uint64_t dequeued_count; + /**< Count of all operations dequeued */ + + uint64_t enqueue_err_count; + /**< Total error count on operations enqueued */ + uint64_t dequeue_err_count; + /**< Total error count on operations dequeued */ +}; + + +/** + * Get the device identifier for the named compress device. + * + * @param name + * Device name to select the device structure + * @return + * - Returns compress device identifier on success. + * - Return -1 on failure to find named compress device. + */ +int __rte_experimental +rte_compressdev_get_dev_id(const char *name); + +/** + * Get the compress device name given a device identifier. + * + * @param dev_id + * Compress device identifier + * @return + * - Returns compress device name. + * - Returns NULL if compress device is not present. + */ +const char * __rte_experimental +rte_compressdev_name_get(uint8_t dev_id); + +/** + * Get the total number of compress devices that have been successfully + * initialised. + * + * @return + * - The total number of usable compress devices. + */ +uint8_t __rte_experimental +rte_compressdev_count(void); + +/** + * Get number and identifiers of attached comp devices that + * use the same compress driver. + * + * @param driver_name + * Driver name + * @param devices + * Output devices identifiers + * @param nb_devices + * Maximal number of devices + * + * @return + * Returns number of attached compress devices. + */ +uint8_t __rte_experimental +rte_compressdev_devices_get(const char *driver_name, uint8_t *devices, + uint8_t nb_devices); + +/* + * Return the NUMA socket to which a device is connected. + * + * @param dev_id + * Compress device identifier + * @return + * The NUMA socket id to which the device is connected or + * a default of zero if the socket could not be determined. + * -1 if returned is the dev_id value is out of range. + */ +int __rte_experimental +rte_compressdev_socket_id(uint8_t dev_id); + +/** Compress device configuration structure */ +struct rte_compressdev_config { + int socket_id; + /**< Socket on which to allocate resources */ + uint16_t nb_queue_pairs; + /**< Total number of queue pairs to configure on a device */ + uint16_t max_nb_priv_xforms; + /**< Max number of private_xforms which will be created on the device */ + uint16_t max_nb_streams; + /**< Max number of streams which will be created on the device */ +}; + +/** + * Configure a device. + * + * This function must be invoked first before any other function in the + * API. This function can also be re-invoked when a device is in the + * stopped state. + * + * @param dev_id + * Compress device identifier + * @param config + * The compress device configuration + * @return + * - 0: Success, device configured. + * - <0: Error code returned by the driver configuration function. + */ +int __rte_experimental +rte_compressdev_configure(uint8_t dev_id, + struct rte_compressdev_config *config); + +/** + * Start a device. + * + * The device start step is called after configuring the device and setting up + * its queue pairs. 
+ * On success, data-path functions exported by the API (enqueue/dequeue, etc) + * can be invoked. + * + * @param dev_id + * Compress device identifier + * @return + * - 0: Success, device started. + * - <0: Error code of the driver device start function. + */ +int __rte_experimental +rte_compressdev_start(uint8_t dev_id); + +/** + * Stop a device. The device can be restarted with a call to + * rte_compressdev_start() + * + * @param dev_id + * Compress device identifier + */ +void __rte_experimental +rte_compressdev_stop(uint8_t dev_id); + +/** + * Close an device. + * The memory allocated in the device gets freed. + * After calling this function, in order to use + * the device again, it is required to + * configure the device again. + * + * @param dev_id + * Compress device identifier + * + * @return + * - 0 on successfully closing device + * - <0 on failure to close device + */ +int __rte_experimental +rte_compressdev_close(uint8_t dev_id); + +/** + * Allocate and set up a receive queue pair for a device. + * This should only be called when the device is stopped. + * + * + * @param dev_id + * Compress device identifier + * @param queue_pair_id + * The index of the queue pairs to set up. The + * value must be in the range [0, nb_queue_pair - 1] + * previously supplied to rte_compressdev_configure() + * @param max_inflight_ops + * Max number of ops which the qp will have to + * accommodate simultaneously + * @param socket_id + * The *socket_id* argument is the socket identifier + * in case of NUMA. The value can be *SOCKET_ID_ANY* + * if there is no NUMA constraint for the DMA memory + * allocated for the receive queue pair + * @return + * - 0: Success, queue pair correctly set up. + * - <0: Queue pair configuration failed + */ +int __rte_experimental +rte_compressdev_queue_pair_setup(uint8_t dev_id, uint16_t queue_pair_id, + uint32_t max_inflight_ops, int socket_id); + +/** + * Get the number of queue pairs on a specific comp device + * + * @param dev_id + * Compress device identifier + * @return + * - The number of configured queue pairs. + */ +uint16_t __rte_experimental +rte_compressdev_queue_pair_count(uint8_t dev_id); + + +/** + * Retrieve the general I/O statistics of a device. + * + * @param dev_id + * The identifier of the device + * @param stats + * A pointer to a structure of type + * *rte_compressdev_stats* to be filled with the + * values of device counters + * @return + * - Zero if successful. + * - Non-zero otherwise. + */ +int __rte_experimental +rte_compressdev_stats_get(uint8_t dev_id, struct rte_compressdev_stats *stats); + +/** + * Reset the general I/O statistics of a device. + * + * @param dev_id + * The identifier of the device. + */ +void __rte_experimental +rte_compressdev_stats_reset(uint8_t dev_id); + +/** + * Retrieve the contextual information of a device. + * + * @param dev_id + * Compress device identifier + * @param dev_info + * A pointer to a structure of type *rte_compressdev_info* + * to be filled with the contextual information of the device + * + * @note The capabilities field of dev_info is set to point to the first + * element of an array of struct rte_compressdev_capabilities. + * The element after the last valid element has it's op field set to + * RTE_COMP_ALGO_LIST_END. + */ +void __rte_experimental +rte_compressdev_info_get(uint8_t dev_id, struct rte_compressdev_info *dev_info); + +/** + * + * Dequeue a burst of processed compression operations from a queue on the comp + * device. 
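
A minimal sketch of the configure / queue-pair setup / start sequence described by these APIs; the single queue pair, the inflight depth of 512 and the xform/stream limits are illustrative assumptions:

#include <rte_compressdev.h>

/* Sketch: bring up 'dev_id' with one queue pair on 'socket_id'. */
static int
setup_compressdev(uint8_t dev_id, int socket_id)
{
	struct rte_compressdev_config cfg = {
		.socket_id = socket_id,
		.nb_queue_pairs = 1,
		.max_nb_priv_xforms = 16,
		.max_nb_streams = 0,	/* stateless use only */
	};
	int ret;

	ret = rte_compressdev_configure(dev_id, &cfg);
	if (ret < 0)
		return ret;

	ret = rte_compressdev_queue_pair_setup(dev_id, 0, 512, socket_id);
	if (ret < 0)
		return ret;

	return rte_compressdev_start(dev_id);
}
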
The dequeued operation are stored in *rte_comp_op* structures + * whose pointers are supplied in the *ops* array. + * + * The rte_compressdev_dequeue_burst() function returns the number of ops + * actually dequeued, which is the number of *rte_comp_op* data structures + * effectively supplied into the *ops* array. + * + * A return value equal to *nb_ops* indicates that the queue contained + * at least *nb_ops* operations, and this is likely to signify that other + * processed operations remain in the devices output queue. Applications + * implementing a "retrieve as many processed operations as possible" policy + * can check this specific case and keep invoking the + * rte_compressdev_dequeue_burst() function until a value less than + * *nb_ops* is returned. + * + * The rte_compressdev_dequeue_burst() function does not provide any error + * notification to avoid the corresponding overhead. + * + * @note: operation ordering is not maintained within the queue pair. + * + * @note: In case op status = OUT_OF_SPACE_TERMINATED, op.consumed=0 and the + * op must be resubmitted with the same input data and a larger output buffer. + * op.produced is usually 0, but in decompression cases a PMD may return > 0 + * and the application may find it useful to inspect that data. + * This status is only returned on STATELESS ops. + * + * @note: In case op status = OUT_OF_SPACE_RECOVERABLE, op.produced can be used + * and next op in stream should continue on from op.consumed+1 with a fresh + * output buffer. + * Consumed=0, produced=0 is an unusual but allowed case. There may be useful + * state/history stored in the PMD, even though no output was produced yet. + * + * + * @param dev_id + * Compress device identifier + * @param qp_id + * The index of the queue pair from which to retrieve + * processed operations. The value must be in the range + * [0, nb_queue_pair - 1] previously supplied to + * rte_compressdev_configure() + * @param ops + * The address of an array of pointers to + * *rte_comp_op* structures that must be + * large enough to store *nb_ops* pointers in it + * @param nb_ops + * The maximum number of operations to dequeue + * @return + * - The number of operations actually dequeued, which is the number + * of pointers to *rte_comp_op* structures effectively supplied to the + * *ops* array. + */ +uint16_t __rte_experimental +rte_compressdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, + struct rte_comp_op **ops, uint16_t nb_ops); + +/** + * Enqueue a burst of operations for processing on a compression device. + * + * The rte_compressdev_enqueue_burst() function is invoked to place + * comp operations on the queue *qp_id* of the device designated by + * its *dev_id*. + * + * The *nb_ops* parameter is the number of operations to process which are + * supplied in the *ops* array of *rte_comp_op* structures. + * + * The rte_compressdev_enqueue_burst() function returns the number of + * operations it actually enqueued for processing. A return value equal to + * *nb_ops* means that all packets have been enqueued. + * + * @note All compression operations are Out-of-place (OOP) operations, + * as the size of the output data is different to the size of the input data. + * + * @note The flush flag only applies to operations which return SUCCESS. + * In OUT_OF_SPACE cases whether STATEFUL or STATELESS, data in dest buffer + * is as if flush flag was FLUSH_NONE. + * @note flush flag only applies in compression direction. It has no meaning + * for decompression. 
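
A minimal sketch of an enqueue/poll-dequeue loop built on these two burst calls; the queue pair id, the burst size of 32 and the reduced error handling are illustrative assumptions, and 'nb' is assumed not to exceed the queue pair's max_inflight_ops:

#include <stdio.h>

#include <rte_comp.h>
#include <rte_compressdev.h>

/* Sketch: submit 'nb' prepared ops on queue pair 0 of 'dev_id' and
 * busy-poll until all of them are returned; ops may come back in any
 * order, so only the status of each completed op is checked. */
static void
run_burst(uint8_t dev_id, struct rte_comp_op **ops, uint16_t nb)
{
	uint16_t enq = 0, deq = 0;

	while (enq < nb)
		enq += rte_compressdev_enqueue_burst(dev_id, 0,
				&ops[enq], nb - enq);

	while (deq < nb) {
		struct rte_comp_op *done[32];
		uint16_t i, n;

		n = rte_compressdev_dequeue_burst(dev_id, 0, done, 32);
		for (i = 0; i < n; i++)
			if (done[i]->status != RTE_COMP_OP_STATUS_SUCCESS)
				printf("op failed, status %u\n",
					(unsigned int)done[i]->status);
		deq += n;
	}
}
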
+ * @note: operation ordering is not maintained within the queue pair. + * + * @param dev_id + * Compress device identifier + * @param qp_id + * The index of the queue pair on which operations + * are to be enqueued for processing. The value + * must be in the range [0, nb_queue_pairs - 1] + * previously supplied to *rte_compressdev_configure* + * @param ops + * The address of an array of *nb_ops* pointers + * to *rte_comp_op* structures which contain + * the operations to be processed + * @param nb_ops + * The number of operations to process + * @return + * The number of operations actually enqueued on the device. The return + * value can be less than the value of the *nb_ops* parameter when the + * comp devices queue is full or if invalid parameters are specified in + * a *rte_comp_op*. + */ +uint16_t __rte_experimental +rte_compressdev_enqueue_burst(uint8_t dev_id, uint16_t qp_id, + struct rte_comp_op **ops, uint16_t nb_ops); + +/** + * This should alloc a stream from the device's mempool and initialise it. + * The application should call this API when setting up for the stateful + * processing of a set of data on a device. The API can be called multiple + * times to set up a stream for each data set. The handle returned is only for + * use with ops of op_type STATEFUL and must be passed to the PMD + * with every op in the data stream + * + * @param dev_id + * Compress device identifier + * @param xform + * xform data + * @param stream + * Pointer to where PMD's private stream handle should be stored + * + * @return + * - 0 if successful and valid stream handle + * - <0 in error cases + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if comp device does not support STATEFUL operations. + * - Returns -ENOTSUP if comp device does not support the comp transform. + * - Returns -ENOMEM if the private stream could not be allocated. + * + */ +int __rte_experimental +rte_compressdev_stream_create(uint8_t dev_id, + const struct rte_comp_xform *xform, + void **stream); + +/** + * This should clear the stream and return it to the device's mempool. + * + * @param dev_id + * Compress device identifier + * + * @param stream + * PMD's private stream data + * + * @return + * - 0 if successful + * - <0 in error cases + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if comp device does not support STATEFUL operations. + * - Returns -EBUSY if can't free stream as there are inflight operations + */ +int __rte_experimental +rte_compressdev_stream_free(uint8_t dev_id, void *stream); + +/** + * This should alloc a private_xform from the device's mempool and initialise + * it. The application should call this API when setting up for stateless + * processing on a device. If it returns non-shareable, then the appl cannot + * share this handle with multiple in-flight ops and should call this API again + * to get a separate handle for every in-flight op. + * The handle returned is only valid for use with ops of op_type STATELESS. + * + * @param dev_id + * Compress device identifier + * @param xform + * xform data + * @param private_xform + * Pointer to where PMD's private_xform handle should be stored + * + * @return + * - if successful returns 0 + * and valid private_xform handle + * - <0 in error cases + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if comp device does not support the comp transform. + * - Returns -ENOMEM if the private_xform could not be allocated. 
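
A minimal sketch of the private_xform lifecycle these APIs describe; the helper names are illustrative and the xform argument is assumed to be a compress transform such as the one sketched earlier:

#include <rte_comp.h>
#include <rte_compressdev.h>

/* Sketch: create a private_xform handle, to be stored into
 * op->private_xform of each stateless op that uses it. */
static void *
xform_create(uint8_t dev_id, const struct rte_comp_xform *xform)
{
	void *priv_xform = NULL;

	if (rte_compressdev_private_xform_create(dev_id, xform,
			&priv_xform) < 0)
		return NULL;

	return priv_xform;
}

/* Sketch: release the handle once no op referencing it is in flight. */
static void
xform_destroy(uint8_t dev_id, void *priv_xform)
{
	rte_compressdev_private_xform_free(dev_id, priv_xform);
}
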
+ */ +int __rte_experimental +rte_compressdev_private_xform_create(uint8_t dev_id, + const struct rte_comp_xform *xform, + void **private_xform); + +/** + * This should clear the private_xform and return it to the device's mempool. + * It is the application's responsibility to ensure that private_xform data + * is not cleared while there are still in-flight operations using it. + * + * @param dev_id + * Compress device identifier + * + * @param private_xform + * PMD's private_xform data + * + * @return + * - 0 if successful + * - <0 in error cases + * - Returns -EINVAL if input parameters are invalid. + */ +int __rte_experimental +rte_compressdev_private_xform_free(uint8_t dev_id, void *private_xform); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_COMPRESSDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_internal.h b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_internal.h new file mode 100644 index 00000000..22ceac66 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_internal.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef _RTE_COMPRESSDEV_INTERNAL_H_ +#define _RTE_COMPRESSDEV_INTERNAL_H_ + +/* rte_compressdev_internal.h + * This file holds Compressdev private data structures. + */ +#include + +#include "rte_comp.h" + +#define RTE_COMPRESSDEV_NAME_MAX_LEN (64) +/**< Max length of name of comp PMD */ + +/* Logging Macros */ +extern int compressdev_logtype; +#define COMPRESSDEV_LOG(level, fmt, args...) \ + rte_log(RTE_LOG_ ## level, compressdev_logtype, "%s(): "fmt "\n", \ + __func__, ##args) + +/** + * Dequeue processed packets from queue pair of a device. + * + * @param qp + * The queue pair from which to retrieve + * processed operations. + * @param ops + * The address of an array of pointers to + * *rte_comp_op* structures that must be + * large enough to store *nb_ops* pointers in it + * @param nb_ops + * The maximum number of operations to dequeue + * @return + * - The number of operations actually dequeued, which is the number + * of pointers to *rte_comp_op* structures effectively supplied to the + * *ops* array. + */ +typedef uint16_t (*compressdev_dequeue_pkt_burst_t)(void *qp, + struct rte_comp_op **ops, uint16_t nb_ops); + +/** + * Enqueue a burst of operations for processing. + * + * @param qp + * The queue pair on which operations + * are to be enqueued for processing + * @param ops + * The address of an array of *nb_ops* pointers + * to *rte_comp_op* structures which contain + * the operations to be processed + * @param nb_ops + * The number of operations to process + * @return + * The number of operations actually enqueued on the device. The return + * value can be less than the value of the *nb_ops* parameter when the + * comp devices queue is full or if invalid parameters are specified in + * a *rte_comp_op*. + */ + +typedef uint16_t (*compressdev_enqueue_pkt_burst_t)(void *qp, + struct rte_comp_op **ops, uint16_t nb_ops); + +/** The data structure associated with each comp device. 
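/*
 * Editor's illustrative sketch, not part of the imported sources: stateless
 * setup with the private_xform API above. As the documentation notes, a
 * non-shareable handle must not be shared between in-flight ops; this
 * example simply creates one handle and frees it again.
 */
static int
stateless_setup_example(uint8_t dev_id, const struct rte_comp_xform *xform)
{
	void *priv_xform;
	int ret;

	ret = rte_compressdev_private_xform_create(dev_id, xform, &priv_xform);
	if (ret < 0)
		return ret; /* -EINVAL, -ENOTSUP or -ENOMEM as documented above */

	/* ... attach 'priv_xform' to STATELESS ops and enqueue them ... */

	return rte_compressdev_private_xform_free(dev_id, priv_xform);
}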
*/ +struct rte_compressdev { + compressdev_dequeue_pkt_burst_t dequeue_burst; + /**< Pointer to PMD receive function */ + compressdev_enqueue_pkt_burst_t enqueue_burst; + /**< Pointer to PMD transmit function */ + + struct rte_compressdev_data *data; + /**< Pointer to device data */ + struct rte_compressdev_ops *dev_ops; + /**< Functions exported by PMD */ + uint64_t feature_flags; + /**< Supported features */ + struct rte_device *device; + /**< Backing device */ + + __extension__ + uint8_t attached : 1; + /**< Flag indicating the device is attached */ +} __rte_cache_aligned; + +/** + * + * The data part, with no function pointers, associated with each device. + * + * This structure is safe to place in shared memory to be common among + * different processes in a multi-process configuration. + */ +struct rte_compressdev_data { + uint8_t dev_id; + /**< Compress device identifier */ + uint8_t socket_id; + /**< Socket identifier where memory is allocated */ + char name[RTE_COMPRESSDEV_NAME_MAX_LEN]; + /**< Unique identifier name */ + + __extension__ + uint8_t dev_started : 1; + /**< Device state: STARTED(1)/STOPPED(0) */ + + void **queue_pairs; + /**< Array of pointers to queue pairs. */ + uint16_t nb_queue_pairs; + /**< Number of device queue pairs */ + + void *dev_private; + /**< PMD-specific private data */ +} __rte_cache_aligned; +#endif diff --git a/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_pmd.c b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_pmd.c new file mode 100644 index 00000000..7de4f339 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_pmd.c @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include +#include +#include + +#include "rte_compressdev_internal.h" +#include "rte_compressdev_pmd.h" + +int compressdev_logtype; + +/** + * Parse name from argument + */ +static int +rte_compressdev_pmd_parse_name_arg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + struct rte_compressdev_pmd_init_params *params = extra_args; + int n; + + n = snprintf(params->name, RTE_COMPRESSDEV_NAME_MAX_LEN, "%s", value); + if (n >= RTE_COMPRESSDEV_NAME_MAX_LEN) + return -EINVAL; + + return 0; +} + +/** + * Parse unsigned integer from argument + */ +static int +rte_compressdev_pmd_parse_uint_arg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + int i; + char *end; + + errno = 0; + i = strtol(value, &end, 10); + if (*end != 0 || errno != 0 || i < 0) + return -EINVAL; + + *((uint32_t *)extra_args) = i; + return 0; +} + +int __rte_experimental +rte_compressdev_pmd_parse_input_args( + struct rte_compressdev_pmd_init_params *params, + const char *args) +{ + struct rte_kvargs *kvlist = NULL; + int ret = 0; + + if (params == NULL) + return -EINVAL; + + if (args) { + kvlist = rte_kvargs_parse(args, compressdev_pmd_valid_params); + if (kvlist == NULL) + return -EINVAL; + + ret = rte_kvargs_process(kvlist, + RTE_COMPRESSDEV_PMD_SOCKET_ID_ARG, + &rte_compressdev_pmd_parse_uint_arg, + ¶ms->socket_id); + if (ret < 0) + goto free_kvlist; + + ret = rte_kvargs_process(kvlist, + RTE_COMPRESSDEV_PMD_NAME_ARG, + &rte_compressdev_pmd_parse_name_arg, + params); + if (ret < 0) + goto free_kvlist; + } + +free_kvlist: + rte_kvargs_free(kvlist); + return ret; +} + +struct rte_compressdev * __rte_experimental +rte_compressdev_pmd_create(const char *name, + struct rte_device *device, + size_t private_data_size, + struct rte_compressdev_pmd_init_params *params) +{ + struct 
rte_compressdev *compressdev; + + if (params->name[0] != '\0') { + COMPRESSDEV_LOG(INFO, "[%s] User specified device name = %s\n", + device->driver->name, params->name); + name = params->name; + } + + COMPRESSDEV_LOG(INFO, "[%s] - Creating compressdev %s\n", + device->driver->name, name); + + COMPRESSDEV_LOG(INFO, + "[%s] - Init parameters - name: %s, socket id: %d", + device->driver->name, name, + params->socket_id); + + /* allocate device structure */ + compressdev = rte_compressdev_pmd_allocate(name, params->socket_id); + if (compressdev == NULL) { + COMPRESSDEV_LOG(ERR, "[%s] Failed to allocate comp device %s", + device->driver->name, name); + return NULL; + } + + /* allocate private device structure */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + compressdev->data->dev_private = + rte_zmalloc_socket("compressdev device private", + private_data_size, + RTE_CACHE_LINE_SIZE, + params->socket_id); + + if (compressdev->data->dev_private == NULL) { + COMPRESSDEV_LOG(ERR, + "[%s] Cannot allocate memory for compressdev %s private data", + device->driver->name, name); + + rte_compressdev_pmd_release_device(compressdev); + return NULL; + } + } + + compressdev->device = device; + + return compressdev; +} + +int __rte_experimental +rte_compressdev_pmd_destroy(struct rte_compressdev *compressdev) +{ + int retval; + + COMPRESSDEV_LOG(INFO, "[%s] Closing comp device %s", + compressdev->device->driver->name, + compressdev->device->name); + + /* free comp device */ + retval = rte_compressdev_pmd_release_device(compressdev); + if (retval) + return retval; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_free(compressdev->data->dev_private); + + compressdev->device = NULL; + compressdev->data = NULL; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_pmd.h b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_pmd.h new file mode 100644 index 00000000..38e9ea02 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_pmd.h @@ -0,0 +1,390 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef _RTE_COMPRESSDEV_PMD_H_ +#define _RTE_COMPRESSDEV_PMD_H_ + +/** @file + * RTE comp PMD APIs + * + * @note + * These APIs are for comp PMDs only and user applications should not call + * them directly. 
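/*
 * Editor's illustrative sketch, not part of the imported sources: how a
 * driver's probe routine might combine the two assist functions implemented
 * above. my_comp_private, my_comp_probe() and the use of rte_socket_id()
 * as the default socket are assumptions made only for this example.
 */
struct my_comp_private { uint8_t reserved[64]; }; /* hypothetical PMD data */

static int
my_comp_probe(struct rte_device *device, const char *args)
{
	struct rte_compressdev_pmd_init_params init_params = {
		.name = "",
		.socket_id = rte_socket_id(),
	};
	struct rte_compressdev *dev;

	if (rte_compressdev_pmd_parse_input_args(&init_params, args) < 0)
		return -EINVAL;

	dev = rte_compressdev_pmd_create(device->name, device,
			sizeof(struct my_comp_private), &init_params);
	if (dev == NULL)
		return -ENODEV;

	/* dev->dev_ops and the burst function pointers are set up next */
	return 0;
}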
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include + +#include "rte_compressdev.h" +#include "rte_compressdev_internal.h" + +#define RTE_COMPRESSDEV_PMD_NAME_ARG ("name") +#define RTE_COMPRESSDEV_PMD_SOCKET_ID_ARG ("socket_id") + +static const char * const compressdev_pmd_valid_params[] = { + RTE_COMPRESSDEV_PMD_NAME_ARG, + RTE_COMPRESSDEV_PMD_SOCKET_ID_ARG +}; + +/** + * @internal + * Initialisation parameters for comp devices + */ +struct rte_compressdev_pmd_init_params { + char name[RTE_COMPRESSDEV_NAME_MAX_LEN]; + int socket_id; +}; + +/** Global structure used for maintaining state of allocated comp devices */ +struct rte_compressdev_global { + struct rte_compressdev *devs; /**< Device information array */ + struct rte_compressdev_data *data[RTE_COMPRESS_MAX_DEVS]; + /**< Device private data */ + uint8_t nb_devs; /**< Number of devices found */ + uint8_t max_devs; /**< Max number of devices */ +}; + +/** Pointer to global array of comp devices */ +extern struct rte_compressdev *rte_compressdevs; +/** Pointer to global comp devices data structure */ +extern struct rte_compressdev_global *rte_compressdev_globals; + +/** + * Get the rte_compressdev structure device pointer for the named device. + * + * @param name + * Compress device name + * @return + * - The rte_compressdev structure pointer for the given device identifier. + */ +struct rte_compressdev * __rte_experimental +rte_compressdev_pmd_get_named_dev(const char *name); + +/** + * Definitions of all functions exported by a driver through the + * the generic structure of type *comp_dev_ops* supplied in the + * *rte_compressdev* structure associated with a device. + */ + +/** + * Function used to configure device. + * + * @param dev + * Compress device + * @param config + * Compress device configurations + * @return + * Returns 0 on success + */ +typedef int (*compressdev_configure_t)(struct rte_compressdev *dev, + struct rte_compressdev_config *config); + +/** + * Function used to start a configured device. + * + * @param dev + * Compress device + * @return + * Returns 0 on success + */ +typedef int (*compressdev_start_t)(struct rte_compressdev *dev); + +/** + * Function used to stop a configured device. + * + * @param dev + * Compress device + */ +typedef void (*compressdev_stop_t)(struct rte_compressdev *dev); + +/** + * Function used to close a configured device. + * + * @param dev + * Compress device + * @return + * - 0 on success. + * - EAGAIN if can't close as device is busy + */ +typedef int (*compressdev_close_t)(struct rte_compressdev *dev); + + +/** + * Function used to get statistics of a device. + * + * @param dev + * Compress device + * @param stats + * Compress device stats to populate + */ +typedef void (*compressdev_stats_get_t)(struct rte_compressdev *dev, + struct rte_compressdev_stats *stats); + + +/** + * Function used to reset statistics of a device. + * + * @param dev + * Compress device + */ +typedef void (*compressdev_stats_reset_t)(struct rte_compressdev *dev); + + +/** + * Function used to get specific information of a device. + * + * @param dev + * Compress device + */ +typedef void (*compressdev_info_get_t)(struct rte_compressdev *dev, + struct rte_compressdev_info *dev_info); + +/** + * Setup a queue pair for a device. + * + * @param dev + * Compress device + * @param qp_id + * Queue pair identifier + * @param max_inflight_ops + * Max inflight ops which qp must accommodate + * @param socket_id + * Socket identifier + * @return + * Returns 0 on success. 
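/*
 * Editor's illustrative sketch, not part of the imported sources: minimal
 * driver callbacks matching the compressdev_configure_t, compressdev_start_t
 * and compressdev_stop_t typedefs documented above. The my_comp_* names are
 * assumptions made only for this example.
 */
static int
my_comp_dev_configure(struct rte_compressdev *dev,
		struct rte_compressdev_config *config)
{
	/* validate the requested configuration, allocate per-device resources */
	RTE_SET_USED(dev);
	RTE_SET_USED(config);
	return 0;
}

static int
my_comp_dev_start(struct rte_compressdev *dev)
{
	RTE_SET_USED(dev);
	return 0; /* device is now ready to accept enqueue/dequeue bursts */
}

static void
my_comp_dev_stop(struct rte_compressdev *dev)
{
	RTE_SET_USED(dev);
}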
+ */ +typedef int (*compressdev_queue_pair_setup_t)(struct rte_compressdev *dev, + uint16_t qp_id, uint32_t max_inflight_ops, int socket_id); + +/** + * Release memory resources allocated by given queue pair. + * + * @param dev + * Compress device + * @param qp_id + * Queue pair identifier + * @return + * - 0 on success. + * - EAGAIN if can't close as device is busy + */ +typedef int (*compressdev_queue_pair_release_t)(struct rte_compressdev *dev, + uint16_t qp_id); + +/** + * Get number of available queue pairs of a device. + * + * @param dev + * Compress device + * @return + * Returns number of queue pairs on success. + */ +typedef uint32_t (*compressdev_queue_pair_count_t)(struct rte_compressdev *dev); + +/** + * Create driver private stream data. + * + * @param dev + * Compressdev device + * @param xform + * xform data + * @param stream + * ptr where handle of pmd's private stream data should be stored + * @return + * - Returns 0 if private stream structure has been created successfully. + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if comp device does not support STATEFUL operations. + * - Returns -ENOTSUP if comp device does not support the comp transform. + * - Returns -ENOMEM if the private stream could not be allocated. + */ +typedef int (*compressdev_stream_create_t)(struct rte_compressdev *dev, + const struct rte_comp_xform *xform, void **stream); + +/** + * Free driver private stream data. + * + * @param dev + * Compressdev device + * @param stream + * handle of pmd's private stream data + * @return + * - 0 if successful + * - <0 in error cases + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if comp device does not support STATEFUL operations. + * - Returns -EBUSY if can't free stream as there are inflight operations + */ +typedef int (*compressdev_stream_free_t)(struct rte_compressdev *dev, + void *stream); + +/** + * Create driver private_xform data. + * + * @param dev + * Compressdev device + * @param xform + * xform data + * @param private_xform + * ptr where handle of pmd's private_xform data should be stored + * @return + * - if successful returns 0 + * and valid private_xform handle + * - <0 in error cases + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if comp device does not support the comp transform. + * - Returns -ENOMEM if the private_xform could not be allocated. + */ +typedef int (*compressdev_private_xform_create_t)(struct rte_compressdev *dev, + const struct rte_comp_xform *xform, void **private_xform); + +/** + * Free driver private_xform data. + * + * @param dev + * Compressdev device + * @param private_xform + * handle of pmd's private_xform data + * @return + * - 0 if successful + * - <0 in error cases + * - Returns -EINVAL if input parameters are invalid. + * - Returns -EBUSY if can't free private_xform due to inflight operations + */ +typedef int (*compressdev_private_xform_free_t)(struct rte_compressdev *dev, + void *private_xform); + +/** comp device operations function pointer table */ +struct rte_compressdev_ops { + compressdev_configure_t dev_configure; /**< Configure device. */ + compressdev_start_t dev_start; /**< Start device. */ + compressdev_stop_t dev_stop; /**< Stop device. */ + compressdev_close_t dev_close; /**< Close device. */ + + compressdev_info_get_t dev_infos_get; /**< Get device info. */ + + compressdev_stats_get_t stats_get; + /**< Get device statistics. */ + compressdev_stats_reset_t stats_reset; + /**< Reset device statistics. 
*/ + + compressdev_queue_pair_setup_t queue_pair_setup; + /**< Set up a device queue pair. */ + compressdev_queue_pair_release_t queue_pair_release; + /**< Release a queue pair. */ + + compressdev_stream_create_t stream_create; + /**< Create a comp stream and initialise its private data. */ + compressdev_stream_free_t stream_free; + /**< Free a comp stream's private data. */ + + compressdev_private_xform_create_t private_xform_create; + /**< Create a comp private_xform and initialise its private data. */ + compressdev_private_xform_free_t private_xform_free; + /**< Free a comp private_xform's data. */ +}; + +/** + * @internal + * + * Function for internal use by dummy drivers primarily, e.g. ring-based + * driver. + * Allocates a new compressdev slot for an comp device and returns the pointer + * to that slot for the driver to use. + * + * @param name + * Unique identifier name for each device + * @param socket_id + * Socket to allocate resources on + * @return + * - Slot in the rte_dev_devices array for a new device; + */ +struct rte_compressdev * __rte_experimental +rte_compressdev_pmd_allocate(const char *name, int socket_id); + +/** + * @internal + * + * Function for internal use by dummy drivers primarily, e.g. ring-based + * driver. + * Release the specified compressdev device. + * + * @param dev + * Compress device + * @return + * - 0 on success, negative on error + */ +int __rte_experimental +rte_compressdev_pmd_release_device(struct rte_compressdev *dev); + + +/** + * @internal + * + * PMD assist function to parse initialisation arguments for comp driver + * when creating a new comp PMD device instance. + * + * PMD driver should set default values for that PMD before calling function, + * these default values will be over-written with successfully parsed values + * from args string. + * + * @param params + * Parsed PMD initialisation parameters + * @param args + * Input argument string to parse + * @return + * - 0 on success + * - errno on failure + */ +int __rte_experimental +rte_compressdev_pmd_parse_input_args( + struct rte_compressdev_pmd_init_params *params, + const char *args); + +/** + * @internal + * + * PMD assist function to provide boiler plate code for comp driver to create + * and allocate resources for a new comp PMD device instance. + * + * @param name + * Compress device name + * @param device + * Base device instance + * @param params + * PMD initialisation parameters + * @return + * - comp device instance on success + * - NULL on creation failure + */ +struct rte_compressdev * __rte_experimental +rte_compressdev_pmd_create(const char *name, + struct rte_device *device, + size_t private_data_size, + struct rte_compressdev_pmd_init_params *params); + +/** + * @internal + * + * PMD assist function to provide boiler plate code for comp driver to + * destroy and free resources associated with a comp PMD device instance. 
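/*
 * Editor's illustrative sketch, not part of the imported sources: wiring
 * driver callbacks into the rte_compressdev_ops table defined above and into
 * the device returned by rte_compressdev_pmd_create(). Only the stub
 * callbacks sketched earlier are referenced; the remaining members are
 * indicated in comments.
 */
static struct rte_compressdev_ops my_comp_ops = {
	.dev_configure	= my_comp_dev_configure,
	.dev_start	= my_comp_dev_start,
	.dev_stop	= my_comp_dev_stop,
	/*
	 * .dev_close, .dev_infos_get, .stats_get, .stats_reset,
	 * .queue_pair_setup, .queue_pair_release, .stream_create,
	 * .stream_free, .private_xform_create and .private_xform_free
	 * would be filled in the same way by a real driver.
	 */
};

static void
my_comp_attach_ops(struct rte_compressdev *dev)
{
	dev->dev_ops = &my_comp_ops;
	/*
	 * Likewise, point dev->enqueue_burst and dev->dequeue_burst at the
	 * driver's burst handlers, which must match the
	 * compressdev_enqueue_pkt_burst_t / compressdev_dequeue_pkt_burst_t
	 * typedefs from rte_compressdev_internal.h.
	 */
	dev->feature_flags = 0;
}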
+ * + * @param dev + * Compress device + * @return + * - 0 on success + * - errno on failure + */ +int __rte_experimental +rte_compressdev_pmd_destroy(struct rte_compressdev *dev); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_COMPRESSDEV_PMD_H_ */ diff --git a/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_version.map b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_version.map new file mode 100644 index 00000000..6f900b67 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_compressdev/rte_compressdev_version.map @@ -0,0 +1,39 @@ +EXPERIMENTAL { + global: + + rte_compressdev_capability_get; + rte_compressdev_close; + rte_compressdev_configure; + rte_compressdev_count; + rte_compressdev_dequeue_burst; + rte_compressdev_devices_get; + rte_compressdev_enqueue_burst; + rte_compressdev_get_dev_id; + rte_compressdev_get_feature_name; + rte_compressdev_info_get; + rte_compressdev_name_get; + rte_compressdev_pmd_allocate; + rte_compressdev_pmd_create; + rte_compressdev_pmd_destroy; + rte_compressdev_pmd_get_named_dev; + rte_compressdev_pmd_parse_input_args; + rte_compressdev_pmd_release_device; + rte_compressdev_private_xform_create; + rte_compressdev_private_xform_free; + rte_compressdev_queue_pair_count; + rte_compressdev_queue_pair_setup; + rte_compressdev_socket_id; + rte_compressdev_start; + rte_compressdev_stats_get; + rte_compressdev_stats_reset; + rte_compressdev_stop; + rte_compressdev_stream_create; + rte_compressdev_stream_free; + rte_comp_get_feature_name; + rte_comp_op_alloc; + rte_comp_op_bulk_alloc; + rte_comp_op_free; + rte_comp_op_pool_create; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_cryptodev/Makefile b/src/spdk/dpdk/lib/librte_cryptodev/Makefile new file mode 100644 index 00000000..c1148887 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2015 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_cryptodev.a + +# library version +LIBABIVER := 4 + +# build flags +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +LDLIBS += -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf +LDLIBS += -lrte_kvargs + +# library source files +SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c + +# export include files +SYMLINK-y-include += rte_crypto.h +SYMLINK-y-include += rte_crypto_sym.h +SYMLINK-y-include += rte_cryptodev.h +SYMLINK-y-include += rte_cryptodev_pmd.h +SYMLINK-y-include += rte_crypto_asym.h + +# versioning export map +EXPORT_MAP := rte_cryptodev_version.map + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_cryptodev/meson.build b/src/spdk/dpdk/lib/librte_cryptodev/meson.build new file mode 100644 index 00000000..295f509e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/meson.build @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 4 +sources = files('rte_cryptodev.c', 'rte_cryptodev_pmd.c') +headers = files('rte_cryptodev.h', + 'rte_cryptodev_pmd.h', + 'rte_crypto.h', + 'rte_crypto_sym.h', + 'rte_crypto_asym.h') +deps += ['kvargs', 'mbuf'] diff --git a/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto.h b/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto.h new file mode 100644 index 00000000..fd5ef3a8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto.h @@ -0,0 +1,445 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2017 Intel Corporation + */ + +#ifndef _RTE_CRYPTO_H_ +#define _RTE_CRYPTO_H_ + +/** + * @file rte_crypto.h + * + * RTE 
Cryptography Common Definitions + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include +#include +#include + +#include "rte_crypto_sym.h" +#include "rte_crypto_asym.h" + +/** Crypto operation types */ +enum rte_crypto_op_type { + RTE_CRYPTO_OP_TYPE_UNDEFINED, + /**< Undefined operation type */ + RTE_CRYPTO_OP_TYPE_SYMMETRIC, + /**< Symmetric operation */ + RTE_CRYPTO_OP_TYPE_ASYMMETRIC + /**< Asymmetric operation */ +}; + +/** Status of crypto operation */ +enum rte_crypto_op_status { + RTE_CRYPTO_OP_STATUS_SUCCESS, + /**< Operation completed successfully */ + RTE_CRYPTO_OP_STATUS_NOT_PROCESSED, + /**< Operation has not yet been processed by a crypto device */ + RTE_CRYPTO_OP_STATUS_AUTH_FAILED, + /**< Authentication verification failed */ + RTE_CRYPTO_OP_STATUS_INVALID_SESSION, + /**< + * Symmetric operation failed due to invalid session arguments, or if + * in session-less mode, failed to allocate private operation material. + */ + RTE_CRYPTO_OP_STATUS_INVALID_ARGS, + /**< Operation failed due to invalid arguments in request */ + RTE_CRYPTO_OP_STATUS_ERROR, + /**< Error handling operation */ +}; + +/** + * Crypto operation session type. This is used to specify whether a crypto + * operation has session structure attached for immutable parameters or if all + * operation information is included in the operation data structure. + */ +enum rte_crypto_op_sess_type { + RTE_CRYPTO_OP_WITH_SESSION, /**< Session based crypto operation */ + RTE_CRYPTO_OP_SESSIONLESS, /**< Session-less crypto operation */ + RTE_CRYPTO_OP_SECURITY_SESSION /**< Security session crypto operation */ +}; + +/** + * Cryptographic Operation. + * + * This structure contains data relating to performing cryptographic + * operations. This operation structure is used to contain any operation which + * is supported by the cryptodev API, PMDs should check the type parameter to + * verify that the operation is a support function of the device. Crypto + * operations are enqueued and dequeued in crypto PMDs using the + * rte_cryptodev_enqueue_burst() / rte_cryptodev_dequeue_burst() . + */ +struct rte_crypto_op { + __extension__ + union { + uint64_t raw; + __extension__ + struct { + uint8_t type; + /**< operation type */ + uint8_t status; + /**< + * operation status - this is reset to + * RTE_CRYPTO_OP_STATUS_NOT_PROCESSED on allocation + * from mempool and will be set to + * RTE_CRYPTO_OP_STATUS_SUCCESS after crypto operation + * is successfully processed by a crypto PMD + */ + uint8_t sess_type; + /**< operation session type */ + uint8_t reserved[3]; + /**< Reserved bytes to fill 64 bits for + * future additions + */ + uint16_t private_data_offset; + /**< Offset to indicate start of private data (if any). + * The offset is counted from the start of the + * rte_crypto_op including IV. + * The private data may be used by the application + * to store information which should remain untouched + * in the library/driver + */ + }; + }; + struct rte_mempool *mempool; + /**< crypto operation mempool which operation is allocated from */ + + rte_iova_t phys_addr; + /**< physical address of crypto operation */ + + __extension__ + union { + struct rte_crypto_sym_op sym[0]; + /**< Symmetric operation parameters */ + + struct rte_crypto_asym_op asym[0]; + /**< Asymmetric operation parameters */ + + }; /**< operation specific parameters */ +}; + +/** + * Reset the fields of a crypto operation to their default values. + * + * @param op The crypto operation to be reset. + * @param type The crypto operation type. 
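/*
 * Editor's illustrative sketch, not part of the imported sources: inspecting
 * the per-op status field after a burst has been processed, using the enums
 * defined above. The ops[] array is assumed to have been filled by the
 * cryptodev dequeue burst API mentioned in the comment above.
 */
static uint16_t
count_failed_ops(struct rte_crypto_op **ops, uint16_t nb_ops)
{
	uint16_t i, failed = 0;

	for (i = 0; i < nb_ops; i++) {
		if (ops[i]->status == RTE_CRYPTO_OP_STATUS_SUCCESS)
			continue;
		if (ops[i]->status == RTE_CRYPTO_OP_STATUS_AUTH_FAILED)
			failed++;	/* digest verification failed */
		else
			failed++;	/* invalid args, invalid session or error */
	}
	return failed;
}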
+ */ +static inline void +__rte_crypto_op_reset(struct rte_crypto_op *op, enum rte_crypto_op_type type) +{ + op->type = type; + op->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED; + op->sess_type = RTE_CRYPTO_OP_SESSIONLESS; + + switch (type) { + case RTE_CRYPTO_OP_TYPE_SYMMETRIC: + __rte_crypto_sym_op_reset(op->sym); + break; + case RTE_CRYPTO_OP_TYPE_ASYMMETRIC: + memset(op->asym, 0, sizeof(struct rte_crypto_asym_op)); + break; + case RTE_CRYPTO_OP_TYPE_UNDEFINED: + default: + break; + } +} + +/** + * Private data structure belonging to a crypto symmetric operation pool. + */ +struct rte_crypto_op_pool_private { + enum rte_crypto_op_type type; + /**< Crypto op pool type operation. */ + uint16_t priv_size; + /**< Size of private area in each crypto operation. */ +}; + + +/** + * Returns the size of private data allocated with each rte_crypto_op object by + * the mempool + * + * @param mempool rte_crypto_op mempool + * + * @return private data size + */ +static inline uint16_t +__rte_crypto_op_get_priv_data_size(struct rte_mempool *mempool) +{ + struct rte_crypto_op_pool_private *priv = + (struct rte_crypto_op_pool_private *) rte_mempool_get_priv(mempool); + + return priv->priv_size; +} + + +/** + * Creates a crypto operation pool + * + * @param name pool name + * @param type crypto operation type, use + * RTE_CRYPTO_OP_TYPE_UNDEFINED for a pool which + * supports all operation types + * @param nb_elts number of elements in pool + * @param cache_size Number of elements to cache on lcore, see + * *rte_mempool_create* for further details about + * cache size + * @param priv_size Size of private data to allocate with each + * operation + * @param socket_id Socket to allocate memory on + * + * @return + * - On success pointer to mempool + * - On failure NULL + */ +extern struct rte_mempool * +rte_crypto_op_pool_create(const char *name, enum rte_crypto_op_type type, + unsigned nb_elts, unsigned cache_size, uint16_t priv_size, + int socket_id); + +/** + * Bulk allocate raw element from mempool and return as crypto operations + * + * @param mempool crypto operation mempool. + * @param type crypto operation type. 
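/*
 * Editor's illustrative sketch, not part of the imported sources: creating a
 * symmetric-operation pool with rte_crypto_op_pool_create() as declared
 * above. The pool name, element counts and the 16-byte private area are
 * assumptions made only for this example.
 */
static struct rte_mempool *
create_sym_op_pool(int socket_id)
{
	return rte_crypto_op_pool_create("sym_op_pool",
			RTE_CRYPTO_OP_TYPE_SYMMETRIC,
			4096,	/* nb_elts */
			128,	/* per-lcore cache size */
			16,	/* private data bytes per op */
			socket_id);
}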
+ * @param ops Array to place allocated crypto operations + * @param nb_ops Number of crypto operations to allocate + * + * @returns + * - On success returns number of ops allocated + */ +static inline int +__rte_crypto_op_raw_bulk_alloc(struct rte_mempool *mempool, + enum rte_crypto_op_type type, + struct rte_crypto_op **ops, uint16_t nb_ops) +{ + struct rte_crypto_op_pool_private *priv; + + priv = (struct rte_crypto_op_pool_private *) rte_mempool_get_priv(mempool); + if (unlikely(priv->type != type && + priv->type != RTE_CRYPTO_OP_TYPE_UNDEFINED)) + return -EINVAL; + + if (rte_mempool_get_bulk(mempool, (void **)ops, nb_ops) == 0) + return nb_ops; + + return 0; +} + +/** + * Allocate a crypto operation from a mempool with default parameters set + * + * @param mempool crypto operation mempool + * @param type operation type to allocate + * + * @returns + * - On success returns a valid rte_crypto_op structure + * - On failure returns NULL + */ +static inline struct rte_crypto_op * +rte_crypto_op_alloc(struct rte_mempool *mempool, enum rte_crypto_op_type type) +{ + struct rte_crypto_op *op = NULL; + int retval; + + retval = __rte_crypto_op_raw_bulk_alloc(mempool, type, &op, 1); + if (unlikely(retval != 1)) + return NULL; + + __rte_crypto_op_reset(op, type); + + return op; +} + + +/** + * Bulk allocate crypto operations from a mempool with default parameters set + * + * @param mempool crypto operation mempool + * @param type operation type to allocate + * @param ops Array to place allocated crypto operations + * @param nb_ops Number of crypto operations to allocate + * + * @returns + * - nb_ops if the number of operations requested were allocated. + * - 0 if the requested number of ops are not available. + * None are allocated in this case. + */ + +static inline unsigned +rte_crypto_op_bulk_alloc(struct rte_mempool *mempool, + enum rte_crypto_op_type type, + struct rte_crypto_op **ops, uint16_t nb_ops) +{ + int i; + + if (unlikely(__rte_crypto_op_raw_bulk_alloc(mempool, type, ops, nb_ops) + != nb_ops)) + return 0; + + for (i = 0; i < nb_ops; i++) + __rte_crypto_op_reset(ops[i], type); + + return nb_ops; +} + + + +/** + * Returns a pointer to the private data of a crypto operation if + * that operation has enough capacity for requested size. + * + * @param op crypto operation. + * @param size size of space requested in private data. + * + * @returns + * - if sufficient space available returns pointer to start of private data + * - if insufficient space returns NULL + */ +static inline void * +__rte_crypto_op_get_priv_data(struct rte_crypto_op *op, uint32_t size) +{ + uint32_t priv_size; + + if (likely(op->mempool != NULL)) { + priv_size = __rte_crypto_op_get_priv_data_size(op->mempool); + + if (likely(priv_size >= size)) { + if (op->type == RTE_CRYPTO_OP_TYPE_SYMMETRIC) + return (void *)((uint8_t *)(op + 1) + + sizeof(struct rte_crypto_sym_op)); + if (op->type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) + return (void *)((uint8_t *)(op + 1) + + sizeof(struct rte_crypto_asym_op)); + } + } + + return NULL; +} + +/** + * free crypto operation structure + * If operation has been allocate from a rte_mempool, then the operation will + * be returned to the mempool. + * + * @param op symmetric crypto operation + */ +static inline void +rte_crypto_op_free(struct rte_crypto_op *op) +{ + if (op != NULL && op->mempool != NULL) + rte_mempool_put(op->mempool, op); +} + +/** + * Allocate a symmetric crypto operation in the private data of an mbuf. 
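/*
 * Editor's illustrative sketch, not part of the imported sources: bulk
 * allocating operations from a pool and returning them, using the helpers
 * defined above. The burst size of 32 is an assumption for the example.
 */
static void
op_alloc_free_example(struct rte_mempool *op_pool)
{
	struct rte_crypto_op *ops[32];
	unsigned int nb, i;

	nb = rte_crypto_op_bulk_alloc(op_pool, RTE_CRYPTO_OP_TYPE_SYMMETRIC,
			ops, 32);
	if (nb == 0)
		return;	/* pool exhausted or wrong pool type: nothing allocated */

	/* ... fill in ops[i]->sym parameters and enqueue the burst ... */

	for (i = 0; i < nb; i++)
		rte_crypto_op_free(ops[i]);
}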
+ * + * @param m mbuf which is associated with the crypto operation, the + * operation will be allocated in the private data of that + * mbuf. + * + * @returns + * - On success returns a pointer to the crypto operation. + * - On failure returns NULL. + */ +static inline struct rte_crypto_op * +rte_crypto_sym_op_alloc_from_mbuf_priv_data(struct rte_mbuf *m) +{ + if (unlikely(m == NULL)) + return NULL; + + /* + * check that the mbuf's private data size is sufficient to contain a + * crypto operation + */ + if (unlikely(m->priv_size < (sizeof(struct rte_crypto_op) + + sizeof(struct rte_crypto_sym_op)))) + return NULL; + + /* private data starts immediately after the mbuf header in the mbuf. */ + struct rte_crypto_op *op = (struct rte_crypto_op *)(m + 1); + + __rte_crypto_op_reset(op, RTE_CRYPTO_OP_TYPE_SYMMETRIC); + + op->mempool = NULL; + op->sym->m_src = m; + + return op; +} + +/** + * Allocate space for symmetric crypto xforms in the private data space of the + * crypto operation. This also defaults the crypto xform type and configures + * the chaining of the xforms in the crypto operation + * + * @return + * - On success returns pointer to first crypto xform in crypto operations chain + * - On failure returns NULL + */ +static inline struct rte_crypto_sym_xform * +rte_crypto_op_sym_xforms_alloc(struct rte_crypto_op *op, uint8_t nb_xforms) +{ + void *priv_data; + uint32_t size; + + if (unlikely(op->type != RTE_CRYPTO_OP_TYPE_SYMMETRIC)) + return NULL; + + size = sizeof(struct rte_crypto_sym_xform) * nb_xforms; + + priv_data = __rte_crypto_op_get_priv_data(op, size); + if (priv_data == NULL) + return NULL; + + return __rte_crypto_sym_op_sym_xforms_alloc(op->sym, priv_data, + nb_xforms); +} + + +/** + * Attach a session to a crypto operation + * + * @param op crypto operation, must be of type symmetric + * @param sess cryptodev session + */ +static inline int +rte_crypto_op_attach_sym_session(struct rte_crypto_op *op, + struct rte_cryptodev_sym_session *sess) +{ + if (unlikely(op->type != RTE_CRYPTO_OP_TYPE_SYMMETRIC)) + return -1; + + op->sess_type = RTE_CRYPTO_OP_WITH_SESSION; + + return __rte_crypto_sym_op_attach_sym_session(op->sym, sess); +} + +/** + * Attach a asymmetric session to a crypto operation + * + * @param op crypto operation, must be of type asymmetric + * @param sess cryptodev session + */ +static inline int +rte_crypto_op_attach_asym_session(struct rte_crypto_op *op, + struct rte_cryptodev_asym_session *sess) +{ + if (unlikely(op->type != RTE_CRYPTO_OP_TYPE_ASYMMETRIC)) + return -1; + + op->sess_type = RTE_CRYPTO_OP_WITH_SESSION; + op->asym->session = sess; + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CRYPTO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto_asym.h b/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto_asym.h new file mode 100644 index 00000000..5e185b2d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto_asym.h @@ -0,0 +1,496 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Cavium Networks + */ + +#ifndef _RTE_CRYPTO_ASYM_H_ +#define _RTE_CRYPTO_ASYM_H_ + +/** + * @file rte_crypto_asym.h + * + * RTE Definitions for Asymmetric Cryptography + * + * Defines asymmetric algorithms and modes, as well as supported + * asymmetric crypto operations. 
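/*
 * Editor's illustrative sketch, not part of the imported sources: preparing
 * a session-less symmetric operation by carving a two-element xform chain
 * out of the op's private data area with rte_crypto_op_sym_xforms_alloc()
 * above. Enough priv_size must have been reserved at pool creation time;
 * the cipher-then-auth choice is an assumption for the example only.
 */
static int
build_sessionless_op(struct rte_crypto_op *op)
{
	struct rte_crypto_sym_xform *xform;

	xform = rte_crypto_op_sym_xforms_alloc(op, 2);
	if (xform == NULL)
		return -ENOMEM;	/* op not symmetric or private area too small */

	xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER;
	xform->next->type = RTE_CRYPTO_SYM_XFORM_AUTH;
	/* ... fill xform->cipher and xform->next->auth parameters ... */

	/* attach the chain to the op (harmless if the helper already did) */
	op->sym->xform = xform;
	return 0;
}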
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include +#include +#include + +typedef struct rte_crypto_param_t { + uint8_t *data; + /**< pointer to buffer holding data */ + rte_iova_t iova; + /**< IO address of data buffer */ + size_t length; + /**< length of data in bytes */ +} rte_crypto_param; + +/** asym xform type name strings */ +extern const char * +rte_crypto_asym_xform_strings[]; + +/** asym operations type name strings */ +extern const char * +rte_crypto_asym_op_strings[]; + +/** + * Asymmetric crypto transformation types. + * Each xform type maps to one asymmetric algorithm + * performing specific operation + * + */ +enum rte_crypto_asym_xform_type { + RTE_CRYPTO_ASYM_XFORM_UNSPECIFIED = 0, + /**< Invalid xform. */ + RTE_CRYPTO_ASYM_XFORM_NONE, + /**< Xform type None. + * May be supported by PMD to support + * passthrough op for debugging purpose. + * if xform_type none , op_type is disregarded. + */ + RTE_CRYPTO_ASYM_XFORM_RSA, + /**< RSA. Performs Encrypt, Decrypt, Sign and Verify. + * Refer to rte_crypto_asym_op_type + */ + RTE_CRYPTO_ASYM_XFORM_DH, + /**< Diffie-Hellman. + * Performs Key Generate and Shared Secret Compute. + * Refer to rte_crypto_asym_op_type + */ + RTE_CRYPTO_ASYM_XFORM_DSA, + /**< Digital Signature Algorithm + * Performs Signature Generation and Verification. + * Refer to rte_crypto_asym_op_type + */ + RTE_CRYPTO_ASYM_XFORM_MODINV, + /**< Modular Inverse + * Perform Modulus inverse b^(-1) mod n + */ + RTE_CRYPTO_ASYM_XFORM_MODEX, + /**< Modular Exponentiation + * Perform Modular Exponentiation b^e mod n + */ + RTE_CRYPTO_ASYM_XFORM_TYPE_LIST_END + /**< End of list */ +}; + +/** + * Asymmetric crypto operation type variants + */ +enum rte_crypto_asym_op_type { + RTE_CRYPTO_ASYM_OP_ENCRYPT, + /**< Asymmetric Encrypt operation */ + RTE_CRYPTO_ASYM_OP_DECRYPT, + /**< Asymmetric Decrypt operation */ + RTE_CRYPTO_ASYM_OP_SIGN, + /**< Signature Generation operation */ + RTE_CRYPTO_ASYM_OP_VERIFY, + /**< Signature Verification operation */ + RTE_CRYPTO_ASYM_OP_PRIVATE_KEY_GENERATE, + /**< DH Private Key generation operation */ + RTE_CRYPTO_ASYM_OP_PUBLIC_KEY_GENERATE, + /**< DH Public Key generation operation */ + RTE_CRYPTO_ASYM_OP_SHARED_SECRET_COMPUTE, + /**< DH Shared Secret compute operation */ + RTE_CRYPTO_ASYM_OP_LIST_END +}; + +/** + * Padding types for RSA signature. + */ +enum rte_crypto_rsa_padding_type { + RTE_CRYPTO_RSA_PADDING_NONE = 0, + /**< RSA no padding scheme */ + RTE_CRYPTO_RSA_PKCS1_V1_5_BT0, + /**< RSA PKCS#1 V1.5 Block Type 0 padding scheme + * as descibed in rfc2313 + */ + RTE_CRYPTO_RSA_PKCS1_V1_5_BT1, + /**< RSA PKCS#1 V1.5 Block Type 01 padding scheme + * as descibed in rfc2313 + */ + RTE_CRYPTO_RSA_PKCS1_V1_5_BT2, + /**< RSA PKCS#1 V1.5 Block Type 02 padding scheme + * as descibed in rfc2313 + */ + RTE_CRYPTO_RSA_PADDING_OAEP, + /**< RSA PKCS#1 OAEP padding scheme */ + RTE_CRYPTO_RSA_PADDING_PSS, + /**< RSA PKCS#1 PSS padding scheme */ + RTE_CRYPTO_RSA_PADDING_TYPE_LIST_END +}; + +/** + * RSA private key type enumeration + * + * enumerates private key format required to perform RSA crypto + * transform. + * + */ +enum rte_crypto_rsa_priv_key_type { + RTE_RSA_KEY_TYPE_EXP, + /**< RSA private key is an exponent */ + RTE_RSA_KET_TYPE_QT, + /**< RSA private key is in quintuple format + * See rte_crypto_rsa_priv_key_qt + */ +}; + +/** + * Structure describing RSA private key in quintuple format. + * See PKCS V1.5 RSA Cryptography Standard. 
+ */ +struct rte_crypto_rsa_priv_key_qt { + rte_crypto_param p; + /**< p - Private key component P + * Private key component of RSA parameter required for CRT method + * of private key operations in Octet-string network byte order + * format. + */ + + rte_crypto_param q; + /**< q - Private key component Q + * Private key component of RSA parameter required for CRT method + * of private key operations in Octet-string network byte order + * format. + */ + + rte_crypto_param dP; + /**< dP - Private CRT component + * Private CRT component of RSA parameter required for CRT method + * RSA private key operations in Octet-string network byte order + * format. + * dP = d mod ( p - 1 ) + */ + + rte_crypto_param dQ; + /**< dQ - Private CRT component + * Private CRT component of RSA parameter required for CRT method + * RSA private key operations in Octet-string network byte order + * format. + * dQ = d mod ( q - 1 ) + */ + + rte_crypto_param qInv; + /**< qInv - Private CRT component + * Private CRT component of RSA parameter required for CRT method + * RSA private key operations in Octet-string network byte order + * format. + * qInv = inv q mod p + */ +}; + +/** + * Asymmetric RSA transform data + * + * Structure describing RSA xform params + * + */ +struct rte_crypto_rsa_xform { + rte_crypto_param n; + /**< n - Prime modulus + * Prime modulus data of RSA operation in Octet-string network + * byte order format. + */ + + rte_crypto_param e; + /**< e - Public key exponent + * Public key exponent used for RSA public key operations in Octet- + * string network byte order format. + */ + + enum rte_crypto_rsa_priv_key_type key_type; + + __extension__ + union { + rte_crypto_param d; + /**< d - Private key exponent + * Private key exponent used for RSA + * private key operations in + * Octet-string network byte order format. + */ + + struct rte_crypto_rsa_priv_key_qt qt; + /**< qt - Private key in quintuple format */ + }; +}; + +/** + * Asymmetric Modular exponentiation transform data + * + * Structure describing modular exponentation xform param + * + */ +struct rte_crypto_modex_xform { + rte_crypto_param modulus; + /**< modulus + * Prime modulus of the modexp transform operation in octet-string + * network byte order format. + */ + + rte_crypto_param exponent; + /**< exponent + * Private exponent of the modexp transform operation in + * octet-string network byte order format. + */ +}; + +/** + * Asymmetric modular inverse transform operation + * + * Structure describing modulus inverse xform params + * + */ +struct rte_crypto_modinv_xform { + rte_crypto_param modulus; + /**< + * Pointer to the prime modulus data for modular + * inverse operation in octet-string network byte + * order format. + */ +}; + +/** + * Asymmetric DH transform data + * + * Structure describing deffie-hellman xform params + * + */ +struct rte_crypto_dh_xform { + enum rte_crypto_asym_op_type type; + /**< Setup xform for key generate or shared secret compute */ + + rte_crypto_param p; + /**< p : Prime modulus data + * DH prime modulous data in octet-string network byte order format. + * + */ + + rte_crypto_param g; + /**< g : Generator + * DH group generator data in octet-string network byte order + * format. + * + */ +}; + +/** + * Asymmetric Digital Signature transform operation + * + * Structure describing DSA xform params + * + */ +struct rte_crypto_dsa_xform { + rte_crypto_param p; + /**< p - Prime modulus + * Prime modulus data for DSA operation in Octet-string network byte + * order format. 
+ */ + rte_crypto_param q; + /**< q : Order of the subgroup. + * Order of the subgroup data in Octet-string network byte order + * format. + * (p-1) % q = 0 + */ + rte_crypto_param g; + /**< g: Generator of the subgroup + * Generator data in Octet-string network byte order format. + */ + rte_crypto_param x; + /**< x: Private key of the signer in octet-string network + * byte order format. + * Used when app has pre-defined private key. + * Valid only when xform chain is DSA ONLY. + * if xform chain is DH private key generate + DSA, then DSA sign + * compute will use internally generated key. + */ +}; + +/** + * Operations params for modular operations: + * exponentiation and invert + * + */ +struct rte_crypto_mod_op_param { + rte_crypto_param base; + /**< + * Pointer to base of modular exponentiation/inversion data in + * Octet-string network byte order format. + */ +}; + +/** + * Asymmetric crypto transform data + * + * Structure describing asym xforms. + */ +struct rte_crypto_asym_xform { + struct rte_crypto_asym_xform *next; + /**< Pointer to next xform to set up xform chain.*/ + enum rte_crypto_asym_xform_type xform_type; + /**< Asymmetric crypto transform */ + + __extension__ + union { + struct rte_crypto_rsa_xform rsa; + /**< RSA xform parameters */ + + struct rte_crypto_modex_xform modex; + /**< Modular Exponentiation xform parameters */ + + struct rte_crypto_modinv_xform modinv; + /**< Modulus Inverse xform parameters */ + + struct rte_crypto_dh_xform dh; + /**< DH xform parameters */ + + struct rte_crypto_dsa_xform dsa; + /**< DSA xform parameters */ + }; +}; + +struct rte_cryptodev_asym_session; + +/** + * RSA operation params + * + */ +struct rte_crypto_rsa_op_param { + enum rte_crypto_asym_op_type op_type; + /**< Type of RSA operation for transform */; + + rte_crypto_param message; + /**< + * Pointer to data + * - to be encrypted for RSA public encrypt. + * - to be decrypted for RSA private decrypt. + * - to be signed for RSA sign generation. + * - to be authenticated for RSA sign verification. + */ + + rte_crypto_param sign; + /**< + * Pointer to RSA signature data. If operation is RSA + * sign @ref RTE_CRYPTO_ASYM_OP_SIGN, buffer will be + * over-written with generated signature. + * + * Length of the signature data will be equal to the + * RSA prime modulus length. + */ + + enum rte_crypto_rsa_padding_type pad; + /**< RSA padding scheme to be used for transform */ + + enum rte_crypto_auth_algorithm md; + /**< Hash algorithm to be used for data hash if padding + * scheme is either OAEP or PSS. Valid hash algorithms + * are: + * MD5, SHA1, SHA224, SHA256, SHA384, SHA512 + */ + + enum rte_crypto_auth_algorithm mgf1md; + /**< + * Hash algorithm to be used for mask generation if + * padding scheme is either OAEP or PSS. If padding + * scheme is unspecified data hash algorithm is used + * for mask generation. Valid hash algorithms are: + * MD5, SHA1, SHA224, SHA256, SHA384, SHA512 + */ +}; + +/** + * Diffie-Hellman Operations params. + * @note: + */ +struct rte_crypto_dh_op_param { + rte_crypto_param pub_key; + /**< + * Output generated public key when xform type is + * DH PUB_KEY_GENERATION. + * Input peer public key when xform type is DH + * SHARED_SECRET_COMPUTATION + * pub_key is in octet-string network byte order format. + * + */ + + rte_crypto_param priv_key; + /**< + * Output generated private key if xform type is + * DH PRIVATE_KEY_GENERATION + * Input when xform type is DH SHARED_SECRET_COMPUTATION. + * priv_key is in octet-string network byte order format. 
+ * + */ + + rte_crypto_param shared_secret; + /**< + * Output with calculated shared secret + * when dh xform set up with op type = SHARED_SECRET_COMPUTATION. + * shared_secret is an octet-string network byte order format. + * + */ +}; + +/** + * DSA Operations params + * + */ +struct rte_crypto_dsa_op_param { + enum rte_crypto_asym_op_type op_type; + /**< Signature Generation or Verification */ + rte_crypto_param message; + /**< input message to be signed or verified */ + rte_crypto_param r; + /**< dsa sign component 'r' value + * + * output if op_type = sign generate, + * input if op_type = sign verify + */ + rte_crypto_param s; + /**< dsa sign component 's' value + * + * output if op_type = sign generate, + * input if op_type = sign verify + */ + rte_crypto_param y; + /**< y : Public key of the signer. + * Public key data of the signer in Octet-string network byte order + * format. + * y = g^x mod p + */ +}; + +/** + * Asymmetric Cryptographic Operation. + * + * Structure describing asymmetric crypto operation params. + * + */ +struct rte_crypto_asym_op { + struct rte_cryptodev_asym_session *session; + /**< Handle for the initialised session context */ + + __extension__ + union { + struct rte_crypto_rsa_op_param rsa; + struct rte_crypto_mod_op_param modex; + struct rte_crypto_mod_op_param modinv; + struct rte_crypto_dh_op_param dh; + struct rte_crypto_dsa_op_param dsa; + }; +} __rte_cache_aligned; + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CRYPTO_ASYM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto_sym.h b/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto_sym.h new file mode 100644 index 00000000..eb5afc5e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/rte_crypto_sym.h @@ -0,0 +1,740 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2017 Intel Corporation + */ + +#ifndef _RTE_CRYPTO_SYM_H_ +#define _RTE_CRYPTO_SYM_H_ + +/** + * @file rte_crypto_sym.h + * + * RTE Definitions for Symmetric Cryptography + * + * Defines symmetric cipher and authentication algorithms and modes, as well + * as supported symmetric crypto operation combinations. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include +#include + + +/** Symmetric Cipher Algorithms */ +enum rte_crypto_cipher_algorithm { + RTE_CRYPTO_CIPHER_NULL = 1, + /**< NULL cipher algorithm. No mode applies to the NULL algorithm. */ + + RTE_CRYPTO_CIPHER_3DES_CBC, + /**< Triple DES algorithm in CBC mode */ + RTE_CRYPTO_CIPHER_3DES_CTR, + /**< Triple DES algorithm in CTR mode */ + RTE_CRYPTO_CIPHER_3DES_ECB, + /**< Triple DES algorithm in ECB mode */ + + RTE_CRYPTO_CIPHER_AES_CBC, + /**< AES algorithm in CBC mode */ + RTE_CRYPTO_CIPHER_AES_CTR, + /**< AES algorithm in Counter mode */ + RTE_CRYPTO_CIPHER_AES_ECB, + /**< AES algorithm in ECB mode */ + RTE_CRYPTO_CIPHER_AES_F8, + /**< AES algorithm in F8 mode */ + RTE_CRYPTO_CIPHER_AES_XTS, + /**< AES algorithm in XTS mode */ + + RTE_CRYPTO_CIPHER_ARC4, + /**< (A)RC4 cipher algorithm */ + + RTE_CRYPTO_CIPHER_KASUMI_F8, + /**< KASUMI algorithm in F8 mode */ + + RTE_CRYPTO_CIPHER_SNOW3G_UEA2, + /**< SNOW 3G algorithm in UEA2 mode */ + + RTE_CRYPTO_CIPHER_ZUC_EEA3, + /**< ZUC algorithm in EEA3 mode */ + + RTE_CRYPTO_CIPHER_DES_CBC, + /**< DES algorithm in CBC mode */ + + RTE_CRYPTO_CIPHER_AES_DOCSISBPI, + /**< AES algorithm using modes required by + * DOCSIS Baseline Privacy Plus Spec. + * Chained mbufs are not supported in this mode, i.e. rte_mbuf.next + * for m_src and m_dst in the rte_crypto_sym_op must be NULL. 
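/*
 * Editor's illustrative sketch, not part of the imported sources: filling
 * the asymmetric structures defined above for a modular exponentiation
 * (base^exponent mod modulus). The buffers and lengths are placeholders; a
 * real application points them at octet-string data in network byte order
 * as the comments above require, and typically supplies the xform when
 * initialising an asymmetric session. Result retrieval after processing is
 * omitted from this sketch.
 */
static void
fill_modex(struct rte_crypto_asym_xform *xform, struct rte_crypto_op *op,
		uint8_t *mod, size_t mod_len,
		uint8_t *exp, size_t exp_len,
		uint8_t *base, size_t base_len)
{
	xform->next = NULL;
	xform->xform_type = RTE_CRYPTO_ASYM_XFORM_MODEX;
	xform->modex.modulus.data = mod;
	xform->modex.modulus.length = mod_len;
	xform->modex.exponent.data = exp;
	xform->modex.exponent.length = exp_len;

	/* the operation itself carries only the base parameter */
	op->asym->modex.base.data = base;
	op->asym->modex.base.length = base_len;
}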
+ */ + + RTE_CRYPTO_CIPHER_DES_DOCSISBPI, + /**< DES algorithm using modes required by + * DOCSIS Baseline Privacy Plus Spec. + * Chained mbufs are not supported in this mode, i.e. rte_mbuf.next + * for m_src and m_dst in the rte_crypto_sym_op must be NULL. + */ + + RTE_CRYPTO_CIPHER_LIST_END + +}; + +/** Cipher algorithm name strings */ +extern const char * +rte_crypto_cipher_algorithm_strings[]; + +/** Symmetric Cipher Direction */ +enum rte_crypto_cipher_operation { + RTE_CRYPTO_CIPHER_OP_ENCRYPT, + /**< Encrypt cipher operation */ + RTE_CRYPTO_CIPHER_OP_DECRYPT + /**< Decrypt cipher operation */ +}; + +/** Cipher operation name strings */ +extern const char * +rte_crypto_cipher_operation_strings[]; + +/** + * Symmetric Cipher Setup Data. + * + * This structure contains data relating to Cipher (Encryption and Decryption) + * use to create a session. + */ +struct rte_crypto_cipher_xform { + enum rte_crypto_cipher_operation op; + /**< This parameter determines if the cipher operation is an encrypt or + * a decrypt operation. For the RC4 algorithm and the F8/CTR modes, + * only encrypt operations are valid. + */ + enum rte_crypto_cipher_algorithm algo; + /**< Cipher algorithm */ + + struct { + uint8_t *data; /**< pointer to key data */ + uint16_t length;/**< key length in bytes */ + } key; + /**< Cipher key + * + * For the RTE_CRYPTO_CIPHER_AES_F8 mode of operation, key.data will + * point to a concatenation of the AES encryption key followed by a + * keymask. As per RFC3711, the keymask should be padded with trailing + * bytes to match the length of the encryption key used. + * + * For AES-XTS mode of operation, two keys must be provided and + * key.data must point to the two keys concatenated together (Key1 || + * Key2). The cipher key length will contain the total size of both + * keys. + * + * Cipher key length is in bytes. For AES it can be 128 bits (16 bytes), + * 192 bits (24 bytes) or 256 bits (32 bytes). + * + * For the RTE_CRYPTO_CIPHER_AES_F8 mode of operation, key.length + * should be set to the combined length of the encryption key and the + * keymask. Since the keymask and the encryption key are the same size, + * key.length should be set to 2 x the AES encryption key length. + * + * For the AES-XTS mode of operation: + * - Two keys must be provided and key.length refers to total length of + * the two keys. + * - Each key can be either 128 bits (16 bytes) or 256 bits (32 bytes). + * - Both keys must have the same size. + **/ + struct { + uint16_t offset; + /**< Starting point for Initialisation Vector or Counter, + * specified as number of bytes from start of crypto + * operation (rte_crypto_op). + * + * - For block ciphers in CBC or F8 mode, or for KASUMI + * in F8 mode, or for SNOW 3G in UEA2 mode, this is the + * Initialisation Vector (IV) value. + * + * - For block ciphers in CTR mode, this is the counter. + * + * - For GCM mode, this is either the IV (if the length + * is 96 bits) or J0 (for other sizes), where J0 is as + * defined by NIST SP800-38D. Regardless of the IV + * length, a full 16 bytes needs to be allocated. + * + * - For CCM mode, the first byte is reserved, and the + * nonce should be written starting at &iv[1] (to allow + * space for the implementation to write in the flags + * in the first byte). Note that a full 16 bytes should + * be allocated, even though the length field will + * have a value less than this. 
Note that the PMDs may + * modify the memory reserved (the first byte and the + * final padding) + * + * - For AES-XTS, this is the 128bit tweak, i, from + * IEEE Std 1619-2007. + * + * For optimum performance, the data pointed to SHOULD + * be 8-byte aligned. + */ + uint16_t length; + /**< Length of valid IV data. + * + * - For block ciphers in CBC or F8 mode, or for KASUMI + * in F8 mode, or for SNOW 3G in UEA2 mode, this is the + * length of the IV (which must be the same as the + * block length of the cipher). + * + * - For block ciphers in CTR mode, this is the length + * of the counter (which must be the same as the block + * length of the cipher). + * + * - For GCM mode, this is either 12 (for 96-bit IVs) + * or 16, in which case data points to J0. + * + * - For CCM mode, this is the length of the nonce, + * which can be in the range 7 to 13 inclusive. + */ + } iv; /**< Initialisation vector parameters */ +}; + +/** Symmetric Authentication / Hash Algorithms */ +enum rte_crypto_auth_algorithm { + RTE_CRYPTO_AUTH_NULL = 1, + /**< NULL hash algorithm. */ + + RTE_CRYPTO_AUTH_AES_CBC_MAC, + /**< AES-CBC-MAC algorithm. Only 128-bit keys are supported. */ + RTE_CRYPTO_AUTH_AES_CMAC, + /**< AES CMAC algorithm. */ + RTE_CRYPTO_AUTH_AES_GMAC, + /**< AES GMAC algorithm. */ + RTE_CRYPTO_AUTH_AES_XCBC_MAC, + /**< AES XCBC algorithm. */ + + RTE_CRYPTO_AUTH_KASUMI_F9, + /**< KASUMI algorithm in F9 mode. */ + + RTE_CRYPTO_AUTH_MD5, + /**< MD5 algorithm */ + RTE_CRYPTO_AUTH_MD5_HMAC, + /**< HMAC using MD5 algorithm */ + + RTE_CRYPTO_AUTH_SHA1, + /**< 128 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA1_HMAC, + /**< HMAC using 128 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA224, + /**< 224 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA224_HMAC, + /**< HMAC using 224 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA256, + /**< 256 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA256_HMAC, + /**< HMAC using 256 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA384, + /**< 384 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA384_HMAC, + /**< HMAC using 384 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA512, + /**< 512 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA512_HMAC, + /**< HMAC using 512 bit SHA algorithm. */ + + RTE_CRYPTO_AUTH_SNOW3G_UIA2, + /**< SNOW 3G algorithm in UIA2 mode. */ + + RTE_CRYPTO_AUTH_ZUC_EIA3, + /**< ZUC algorithm in EIA3 mode */ + + RTE_CRYPTO_AUTH_SHA3_224, + /**< 224 bit SHA3 algorithm. */ + RTE_CRYPTO_AUTH_SHA3_224_HMAC, + /**< HMAC using 224 bit SHA3 algorithm. */ + RTE_CRYPTO_AUTH_SHA3_256, + /**< 256 bit SHA3 algorithm. */ + RTE_CRYPTO_AUTH_SHA3_256_HMAC, + /**< HMAC using 256 bit SHA3 algorithm. */ + RTE_CRYPTO_AUTH_SHA3_384, + /**< 384 bit SHA3 algorithm. */ + RTE_CRYPTO_AUTH_SHA3_384_HMAC, + /**< HMAC using 384 bit SHA3 algorithm. */ + RTE_CRYPTO_AUTH_SHA3_512, + /**< 512 bit SHA3 algorithm. */ + RTE_CRYPTO_AUTH_SHA3_512_HMAC, + /**< HMAC using 512 bit SHA3 algorithm. */ + + RTE_CRYPTO_AUTH_LIST_END +}; + +/** Authentication algorithm name strings */ +extern const char * +rte_crypto_auth_algorithm_strings[]; + +/** Symmetric Authentication / Hash Operations */ +enum rte_crypto_auth_operation { + RTE_CRYPTO_AUTH_OP_VERIFY, /**< Verify authentication digest */ + RTE_CRYPTO_AUTH_OP_GENERATE /**< Generate authentication digest */ +}; + +/** Authentication operation name strings */ +extern const char * +rte_crypto_auth_operation_strings[]; + +/** + * Authentication / Hash transform data. + * + * This structure contains data relating to an authentication/hash crypto + * transforms. 
The fields op, algo and digest_length are common to all + * authentication transforms and MUST be set. + */ +struct rte_crypto_auth_xform { + enum rte_crypto_auth_operation op; + /**< Authentication operation type */ + enum rte_crypto_auth_algorithm algo; + /**< Authentication algorithm selection */ + + struct { + uint8_t *data; /**< pointer to key data */ + uint16_t length;/**< key length in bytes */ + } key; + /**< Authentication key data. + * The authentication key length MUST be less than or equal to the + * block size of the algorithm. It is the callers responsibility to + * ensure that the key length is compliant with the standard being used + * (for example RFC 2104, FIPS 198a). + */ + + struct { + uint16_t offset; + /**< Starting point for Initialisation Vector or Counter, + * specified as number of bytes from start of crypto + * operation (rte_crypto_op). + * + * - For SNOW 3G in UIA2 mode, for ZUC in EIA3 mode and + * for AES-GMAC, this is the authentication + * Initialisation Vector (IV) value. + * + * - For KASUMI in F9 mode and other authentication + * algorithms, this field is not used. + * + * For optimum performance, the data pointed to SHOULD + * be 8-byte aligned. + */ + uint16_t length; + /**< Length of valid IV data. + * + * - For SNOW3G in UIA2 mode, for ZUC in EIA3 mode and + * for AES-GMAC, this is the length of the IV. + * + * - For KASUMI in F9 mode and other authentication + * algorithms, this field is not used. + * + */ + } iv; /**< Initialisation vector parameters */ + + uint16_t digest_length; + /**< Length of the digest to be returned. If the verify option is set, + * this specifies the length of the digest to be compared for the + * session. + * + * It is the caller's responsibility to ensure that the + * digest length is compliant with the hash algorithm being used. + * If the value is less than the maximum length allowed by the hash, + * the result shall be truncated. + */ +}; + + +/** Symmetric AEAD Algorithms */ +enum rte_crypto_aead_algorithm { + RTE_CRYPTO_AEAD_AES_CCM = 1, + /**< AES algorithm in CCM mode. */ + RTE_CRYPTO_AEAD_AES_GCM, + /**< AES algorithm in GCM mode. */ + RTE_CRYPTO_AEAD_LIST_END +}; + +/** AEAD algorithm name strings */ +extern const char * +rte_crypto_aead_algorithm_strings[]; + +/** Symmetric AEAD Operations */ +enum rte_crypto_aead_operation { + RTE_CRYPTO_AEAD_OP_ENCRYPT, + /**< Encrypt and generate digest */ + RTE_CRYPTO_AEAD_OP_DECRYPT + /**< Verify digest and decrypt */ +}; + +/** Authentication operation name strings */ +extern const char * +rte_crypto_aead_operation_strings[]; + +struct rte_crypto_aead_xform { + enum rte_crypto_aead_operation op; + /**< AEAD operation type */ + enum rte_crypto_aead_algorithm algo; + /**< AEAD algorithm selection */ + + struct { + uint8_t *data; /**< pointer to key data */ + uint16_t length;/**< key length in bytes */ + } key; + + struct { + uint16_t offset; + /**< Starting point for Initialisation Vector or Counter, + * specified as number of bytes from start of crypto + * operation (rte_crypto_op). + * + * - For GCM mode, this is either the IV (if the length + * is 96 bits) or J0 (for other sizes), where J0 is as + * defined by NIST SP800-38D. Regardless of the IV + * length, a full 16 bytes needs to be allocated. + * + * - For CCM mode, the first byte is reserved, and the + * nonce should be written starting at &iv[1] (to allow + * space for the implementation to write in the flags + * in the first byte). 
Note that a full 16 bytes should + * be allocated, even though the length field will + * have a value less than this. + * + * For optimum performance, the data pointed to SHOULD + * be 8-byte aligned. + */ + uint16_t length; + /**< Length of valid IV data. + * + * - For GCM mode, this is either 12 (for 96-bit IVs) + * or 16, in which case data points to J0. + * + * - For CCM mode, this is the length of the nonce, + * which can be in the range 7 to 13 inclusive. + */ + } iv; /**< Initialisation vector parameters */ + + uint16_t digest_length; + + uint16_t aad_length; + /**< The length of the additional authenticated data (AAD) in bytes. + * For CCM mode, this is the length of the actual AAD, even though + * it is required to reserve 18 bytes before the AAD and padding + * at the end of it, so a multiple of 16 bytes is allocated. + */ +}; + +/** Crypto transformation types */ +enum rte_crypto_sym_xform_type { + RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED = 0, /**< No xform specified */ + RTE_CRYPTO_SYM_XFORM_AUTH, /**< Authentication xform */ + RTE_CRYPTO_SYM_XFORM_CIPHER, /**< Cipher xform */ + RTE_CRYPTO_SYM_XFORM_AEAD /**< AEAD xform */ +}; + +/** + * Symmetric crypto transform structure. + * + * This is used to specify the crypto transforms required, multiple transforms + * can be chained together to specify a chain transforms such as authentication + * then cipher, or cipher then authentication. Each transform structure can + * hold a single transform, the type field is used to specify which transform + * is contained within the union + */ +struct rte_crypto_sym_xform { + struct rte_crypto_sym_xform *next; + /**< next xform in chain */ + enum rte_crypto_sym_xform_type type + ; /**< xform type */ + RTE_STD_C11 + union { + struct rte_crypto_auth_xform auth; + /**< Authentication / hash xform */ + struct rte_crypto_cipher_xform cipher; + /**< Cipher xform */ + struct rte_crypto_aead_xform aead; + /**< AEAD xform */ + }; +}; + +struct rte_cryptodev_sym_session; + +/** + * Symmetric Cryptographic Operation. + * + * This structure contains data relating to performing symmetric cryptographic + * processing on a referenced mbuf data buffer. + * + * When a symmetric crypto operation is enqueued with the device for processing + * it must have a valid *rte_mbuf* structure attached, via m_src parameter, + * which contains the source data which the crypto operation is to be performed + * on. + * While the mbuf is in use by a crypto operation no part of the mbuf should be + * changed by the application as the device may read or write to any part of the + * mbuf. In the case of hardware crypto devices some or all of the mbuf + * may be DMAed in and out of the device, so writing over the original data, + * though only the part specified by the rte_crypto_sym_op for transformation + * will be changed. + * Out-of-place (OOP) operation, where the source mbuf is different to the + * destination mbuf, is a special case. Data will be copied from m_src to m_dst. + * The part copied includes all the parts of the source mbuf that will be + * operated on, based on the cipher.data.offset+cipher.data.length and + * auth.data.offset+auth.data.length values in the rte_crypto_sym_op. The part + * indicated by the cipher parameters will be transformed, any extra data around + * this indicated by the auth parameters will be copied unchanged from source to + * destination mbuf. + * Also in OOP operation the cipher.data.offset and auth.data.offset apply to + * both source and destination mbufs. 
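A hedged sketch of a complete AES-128-GCM transform wrapped in rte_crypto_sym_xform, reusing the IV_OFFSET convention sketched earlier (gcm_key is an assumed application key buffer):

struct rte_crypto_sym_xform aead_xf = {
	.next = NULL,
	.type = RTE_CRYPTO_SYM_XFORM_AEAD,
	.aead = {
		.op = RTE_CRYPTO_AEAD_OP_ENCRYPT,
		.algo = RTE_CRYPTO_AEAD_AES_GCM,
		.key = { .data = gcm_key, .length = 16 },
		.iv = { .offset = IV_OFFSET, .length = 12 },	/* 96-bit GCM IV */
		.digest_length = 16,
		.aad_length = 16,
	},
};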
As these offsets are relative to the + * data_off parameter in each mbuf this can result in the data written to the + * destination buffer being at a different alignment, relative to buffer start, + * to the data in the source buffer. + */ +struct rte_crypto_sym_op { + struct rte_mbuf *m_src; /**< source mbuf */ + struct rte_mbuf *m_dst; /**< destination mbuf */ + + RTE_STD_C11 + union { + struct rte_cryptodev_sym_session *session; + /**< Handle for the initialised session context */ + struct rte_crypto_sym_xform *xform; + /**< Session-less API crypto operation parameters */ + struct rte_security_session *sec_session; + /**< Handle for the initialised security session context */ + }; + + RTE_STD_C11 + union { + struct { + struct { + uint32_t offset; + /**< Starting point for AEAD processing, specified as + * number of bytes from start of packet in source + * buffer. + */ + uint32_t length; + /**< The message length, in bytes, of the source buffer + * on which the cryptographic operation will be + * computed. This must be a multiple of the block size + */ + } data; /**< Data offsets and length for AEAD */ + struct { + uint8_t *data; + /**< This points to the location where the digest result + * should be inserted (in the case of digest generation) + * or where the purported digest exists (in the case of + * digest verification). + * + * At session creation time, the client specified the + * digest result length with the digest_length member + * of the @ref rte_crypto_auth_xform structure. For + * physical crypto devices the caller must allocate at + * least digest_length of physically contiguous memory + * at this location. + * + * For digest generation, the digest result will + * overwrite any data at this location. + * + * @note + * For GCM (@ref RTE_CRYPTO_AEAD_AES_GCM), for + * "digest result" read "authentication tag T". + */ + rte_iova_t phys_addr; + /**< Physical address of digest */ + } digest; /**< Digest parameters */ + struct { + uint8_t *data; + /**< Pointer to Additional Authenticated Data (AAD) + * needed for authenticated cipher mechanisms (CCM and + * GCM) + * + * Specifically for CCM (@ref RTE_CRYPTO_AEAD_AES_CCM), + * the caller should setup this field as follows: + * + * - the additional authentication data itself should + * be written starting at an offset of 18 bytes into + * the array, leaving room for the first block (16 bytes) + * and the length encoding in the first two bytes of the + * second block. + * + * - the array should be big enough to hold the above + * fields, plus any padding to round this up to the + * nearest multiple of the block size (16 bytes). + * Padding will be added by the implementation. + * + * - Note that PMDs may modify the memory reserved + * (first 18 bytes and the final padding). + * + * Finally, for GCM (@ref RTE_CRYPTO_AEAD_AES_GCM), the + * caller should setup this field as follows: + * + * - the AAD is written in starting at byte 0 + * - the array must be big enough to hold the AAD, plus + * any space to round this up to the nearest multiple + * of the block size (16 bytes). + * + */ + rte_iova_t phys_addr; /**< physical address */ + } aad; + /**< Additional authentication parameters */ + } aead; + + struct { + struct { + struct { + uint32_t offset; + /**< Starting point for cipher processing, + * specified as number of bytes from start + * of data in the source buffer. + * The result of the cipher operation will be + * written back into the output buffer + * starting at this location. 
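An illustrative sketch of filling the AEAD branch of rte_crypto_sym_op described above (mbuf, plaintext_len, aad_buf and aad_iova are assumed application-side values; op->sym is the symmetric op carried inside the rte_crypto_op):

struct rte_crypto_sym_op *sop = op->sym;

sop->m_src = mbuf;			/* in-place operation */
sop->aead.data.offset = 0;
sop->aead.data.length = plaintext_len;

/* Tag written straight after the ciphertext in the same mbuf. */
sop->aead.digest.data = rte_pktmbuf_mtod_offset(mbuf, uint8_t *, plaintext_len);
sop->aead.digest.phys_addr = rte_pktmbuf_iova_offset(mbuf, plaintext_len);

/* AAD lives in a separate, physically contiguous application buffer. */
sop->aead.aad.data = aad_buf;
sop->aead.aad.phys_addr = aad_iova;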
+ * + * @note + * For SNOW 3G @ RTE_CRYPTO_CIPHER_SNOW3G_UEA2, + * KASUMI @ RTE_CRYPTO_CIPHER_KASUMI_F8 + * and ZUC @ RTE_CRYPTO_CIPHER_ZUC_EEA3, + * this field should be in bits. + */ + uint32_t length; + /**< The message length, in bytes, of the + * source buffer on which the cryptographic + * operation will be computed. + * This must be a multiple of the block size + * if a block cipher is being used. This is + * also the same as the result length. + * + * @note + * For SNOW 3G @ RTE_CRYPTO_AUTH_SNOW3G_UEA2, + * KASUMI @ RTE_CRYPTO_CIPHER_KASUMI_F8 + * and ZUC @ RTE_CRYPTO_CIPHER_ZUC_EEA3, + * this field should be in bits. + */ + } data; /**< Data offsets and length for ciphering */ + } cipher; + + struct { + struct { + uint32_t offset; + /**< Starting point for hash processing, + * specified as number of bytes from start of + * packet in source buffer. + * + * @note + * For SNOW 3G @ RTE_CRYPTO_AUTH_SNOW3G_UIA2, + * KASUMI @ RTE_CRYPTO_AUTH_KASUMI_F9 + * and ZUC @ RTE_CRYPTO_AUTH_ZUC_EIA3, + * this field should be in bits. + * + * @note + * For KASUMI @ RTE_CRYPTO_AUTH_KASUMI_F9, + * this offset should be such that + * data to authenticate starts at COUNT. + */ + uint32_t length; + /**< The message length, in bytes, of the source + * buffer that the hash will be computed on. + * + * @note + * For SNOW 3G @ RTE_CRYPTO_AUTH_SNOW3G_UIA2, + * KASUMI @ RTE_CRYPTO_AUTH_KASUMI_F9 + * and ZUC @ RTE_CRYPTO_AUTH_ZUC_EIA3, + * this field should be in bits. + * + * @note + * For KASUMI @ RTE_CRYPTO_AUTH_KASUMI_F9, + * the length should include the COUNT, + * FRESH, message, direction bit and padding + * (to be multiple of 8 bits). + */ + } data; + /**< Data offsets and length for authentication */ + + struct { + uint8_t *data; + /**< This points to the location where + * the digest result should be inserted + * (in the case of digest generation) + * or where the purported digest exists + * (in the case of digest verification). + * + * At session creation time, the client + * specified the digest result length with + * the digest_length member of the + * @ref rte_crypto_auth_xform structure. + * For physical crypto devices the caller + * must allocate at least digest_length of + * physically contiguous memory at this + * location. + * + * For digest generation, the digest result + * will overwrite any data at this location. + * + */ + rte_iova_t phys_addr; + /**< Physical address of digest */ + } digest; /**< Digest parameters */ + } auth; + }; + }; +}; + + +/** + * Reset the fields of a symmetric operation to their default values. + * + * @param op The crypto operation to be reset. + */ +static inline void +__rte_crypto_sym_op_reset(struct rte_crypto_sym_op *op) +{ + memset(op, 0, sizeof(*op)); +} + + +/** + * Allocate space for symmetric crypto xforms in the private data space of the + * crypto operation. This also defaults the crypto xform type to + * RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED and configures the chaining of the xforms + * in the crypto operation + * + * @return + * - On success returns pointer to first crypto xform in crypto operations chain + * - On failure returns NULL + */ +static inline struct rte_crypto_sym_xform * +__rte_crypto_sym_op_sym_xforms_alloc(struct rte_crypto_sym_op *sym_op, + void *priv_data, uint8_t nb_xforms) +{ + struct rte_crypto_sym_xform *xform; + + sym_op->xform = xform = (struct rte_crypto_sym_xform *)priv_data; + + do { + xform->type = RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED; + xform = xform->next = --nb_xforms > 0 ? 
xform + 1 : NULL; + } while (xform); + + return sym_op->xform; +} + + +/** + * Attach a session to a symmetric crypto operation + * + * @param sym_op crypto operation + * @param sess cryptodev session + */ +static inline int +__rte_crypto_sym_op_attach_sym_session(struct rte_crypto_sym_op *sym_op, + struct rte_cryptodev_sym_session *sess) +{ + sym_op->session = sess; + + return 0; +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CRYPTO_SYM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.c b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.c new file mode 100644 index 00000000..8ce97092 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.c @@ -0,0 +1,1618 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015-2017 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_crypto.h" +#include "rte_cryptodev.h" +#include "rte_cryptodev_pmd.h" + +static uint8_t nb_drivers; + +struct rte_cryptodev rte_crypto_devices[RTE_CRYPTO_MAX_DEVS]; + +struct rte_cryptodev *rte_cryptodevs = &rte_crypto_devices[0]; + +static struct rte_cryptodev_global cryptodev_globals = { + .devs = &rte_crypto_devices[0], + .data = { NULL }, + .nb_devs = 0, + .max_devs = RTE_CRYPTO_MAX_DEVS +}; + +struct rte_cryptodev_global *rte_cryptodev_globals = &cryptodev_globals; + +/* spinlock for crypto device callbacks */ +static rte_spinlock_t rte_cryptodev_cb_lock = RTE_SPINLOCK_INITIALIZER; + + +/** + * The user application callback description. + * + * It contains callback address to be registered by user application, + * the pointer to the parameters for callback, and the event type. + */ +struct rte_cryptodev_callback { + TAILQ_ENTRY(rte_cryptodev_callback) next; /**< Callbacks list */ + rte_cryptodev_cb_fn cb_fn; /**< Callback address */ + void *cb_arg; /**< Parameter for callback */ + enum rte_cryptodev_event_type event; /**< Interrupt event type */ + uint32_t active; /**< Callback is executing */ +}; + +/** + * The crypto cipher algorithm strings identifiers. + * It could be used in application command line. + */ +const char * +rte_crypto_cipher_algorithm_strings[] = { + [RTE_CRYPTO_CIPHER_3DES_CBC] = "3des-cbc", + [RTE_CRYPTO_CIPHER_3DES_ECB] = "3des-ecb", + [RTE_CRYPTO_CIPHER_3DES_CTR] = "3des-ctr", + + [RTE_CRYPTO_CIPHER_AES_CBC] = "aes-cbc", + [RTE_CRYPTO_CIPHER_AES_CTR] = "aes-ctr", + [RTE_CRYPTO_CIPHER_AES_DOCSISBPI] = "aes-docsisbpi", + [RTE_CRYPTO_CIPHER_AES_ECB] = "aes-ecb", + [RTE_CRYPTO_CIPHER_AES_F8] = "aes-f8", + [RTE_CRYPTO_CIPHER_AES_XTS] = "aes-xts", + + [RTE_CRYPTO_CIPHER_ARC4] = "arc4", + + [RTE_CRYPTO_CIPHER_DES_CBC] = "des-cbc", + [RTE_CRYPTO_CIPHER_DES_DOCSISBPI] = "des-docsisbpi", + + [RTE_CRYPTO_CIPHER_NULL] = "null", + + [RTE_CRYPTO_CIPHER_KASUMI_F8] = "kasumi-f8", + [RTE_CRYPTO_CIPHER_SNOW3G_UEA2] = "snow3g-uea2", + [RTE_CRYPTO_CIPHER_ZUC_EEA3] = "zuc-eea3" +}; + +/** + * The crypto cipher operation strings identifiers. + * It could be used in application command line. + */ +const char * +rte_crypto_cipher_operation_strings[] = { + [RTE_CRYPTO_CIPHER_OP_ENCRYPT] = "encrypt", + [RTE_CRYPTO_CIPHER_OP_DECRYPT] = "decrypt" +}; + +/** + * The crypto auth algorithm strings identifiers. + * It could be used in application command line. 
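A sketch of the session-less path built on the xform-allocation helper above (sym_op and priv are assumed to point at a symmetric op and at spare private space inside the same rte_crypto_op):

struct rte_crypto_sym_xform *xf =
	__rte_crypto_sym_op_sym_xforms_alloc(sym_op, priv, 2);

if (xf != NULL) {
	xf->type = RTE_CRYPTO_SYM_XFORM_CIPHER;		/* first transform */
	xf->next->type = RTE_CRYPTO_SYM_XFORM_AUTH;	/* chained second */
}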
+ */ +const char * +rte_crypto_auth_algorithm_strings[] = { + [RTE_CRYPTO_AUTH_AES_CBC_MAC] = "aes-cbc-mac", + [RTE_CRYPTO_AUTH_AES_CMAC] = "aes-cmac", + [RTE_CRYPTO_AUTH_AES_GMAC] = "aes-gmac", + [RTE_CRYPTO_AUTH_AES_XCBC_MAC] = "aes-xcbc-mac", + + [RTE_CRYPTO_AUTH_MD5] = "md5", + [RTE_CRYPTO_AUTH_MD5_HMAC] = "md5-hmac", + + [RTE_CRYPTO_AUTH_NULL] = "null", + + [RTE_CRYPTO_AUTH_SHA1] = "sha1", + [RTE_CRYPTO_AUTH_SHA1_HMAC] = "sha1-hmac", + + [RTE_CRYPTO_AUTH_SHA224] = "sha2-224", + [RTE_CRYPTO_AUTH_SHA224_HMAC] = "sha2-224-hmac", + [RTE_CRYPTO_AUTH_SHA256] = "sha2-256", + [RTE_CRYPTO_AUTH_SHA256_HMAC] = "sha2-256-hmac", + [RTE_CRYPTO_AUTH_SHA384] = "sha2-384", + [RTE_CRYPTO_AUTH_SHA384_HMAC] = "sha2-384-hmac", + [RTE_CRYPTO_AUTH_SHA512] = "sha2-512", + [RTE_CRYPTO_AUTH_SHA512_HMAC] = "sha2-512-hmac", + + [RTE_CRYPTO_AUTH_KASUMI_F9] = "kasumi-f9", + [RTE_CRYPTO_AUTH_SNOW3G_UIA2] = "snow3g-uia2", + [RTE_CRYPTO_AUTH_ZUC_EIA3] = "zuc-eia3" +}; + +/** + * The crypto AEAD algorithm strings identifiers. + * It could be used in application command line. + */ +const char * +rte_crypto_aead_algorithm_strings[] = { + [RTE_CRYPTO_AEAD_AES_CCM] = "aes-ccm", + [RTE_CRYPTO_AEAD_AES_GCM] = "aes-gcm", +}; + +/** + * The crypto AEAD operation strings identifiers. + * It could be used in application command line. + */ +const char * +rte_crypto_aead_operation_strings[] = { + [RTE_CRYPTO_AEAD_OP_ENCRYPT] = "encrypt", + [RTE_CRYPTO_AEAD_OP_DECRYPT] = "decrypt" +}; + +/** + * Asymmetric crypto transform operation strings identifiers. + */ +const char *rte_crypto_asym_xform_strings[] = { + [RTE_CRYPTO_ASYM_XFORM_NONE] = "none", + [RTE_CRYPTO_ASYM_XFORM_RSA] = "rsa", + [RTE_CRYPTO_ASYM_XFORM_MODEX] = "modexp", + [RTE_CRYPTO_ASYM_XFORM_MODINV] = "modinv", + [RTE_CRYPTO_ASYM_XFORM_DH] = "dh", + [RTE_CRYPTO_ASYM_XFORM_DSA] = "dsa", +}; + +/** + * Asymmetric crypto operation strings identifiers. 
+ */ +const char *rte_crypto_asym_op_strings[] = { + [RTE_CRYPTO_ASYM_OP_ENCRYPT] = "encrypt", + [RTE_CRYPTO_ASYM_OP_DECRYPT] = "decrypt", + [RTE_CRYPTO_ASYM_OP_SIGN] = "sign", + [RTE_CRYPTO_ASYM_OP_VERIFY] = "verify", + [RTE_CRYPTO_ASYM_OP_PRIVATE_KEY_GENERATE] = "priv_key_generate", + [RTE_CRYPTO_ASYM_OP_PUBLIC_KEY_GENERATE] = "pub_key_generate", + [RTE_CRYPTO_ASYM_OP_SHARED_SECRET_COMPUTE] = "sharedsecret_compute", +}; + +int +rte_cryptodev_get_cipher_algo_enum(enum rte_crypto_cipher_algorithm *algo_enum, + const char *algo_string) +{ + unsigned int i; + + for (i = 1; i < RTE_DIM(rte_crypto_cipher_algorithm_strings); i++) { + if (strcmp(algo_string, rte_crypto_cipher_algorithm_strings[i]) == 0) { + *algo_enum = (enum rte_crypto_cipher_algorithm) i; + return 0; + } + } + + /* Invalid string */ + return -1; +} + +int +rte_cryptodev_get_auth_algo_enum(enum rte_crypto_auth_algorithm *algo_enum, + const char *algo_string) +{ + unsigned int i; + + for (i = 1; i < RTE_DIM(rte_crypto_auth_algorithm_strings); i++) { + if (strcmp(algo_string, rte_crypto_auth_algorithm_strings[i]) == 0) { + *algo_enum = (enum rte_crypto_auth_algorithm) i; + return 0; + } + } + + /* Invalid string */ + return -1; +} + +int +rte_cryptodev_get_aead_algo_enum(enum rte_crypto_aead_algorithm *algo_enum, + const char *algo_string) +{ + unsigned int i; + + for (i = 1; i < RTE_DIM(rte_crypto_aead_algorithm_strings); i++) { + if (strcmp(algo_string, rte_crypto_aead_algorithm_strings[i]) == 0) { + *algo_enum = (enum rte_crypto_aead_algorithm) i; + return 0; + } + } + + /* Invalid string */ + return -1; +} + +int __rte_experimental +rte_cryptodev_asym_get_xform_enum(enum rte_crypto_asym_xform_type *xform_enum, + const char *xform_string) +{ + unsigned int i; + + for (i = 1; i < RTE_DIM(rte_crypto_asym_xform_strings); i++) { + if (strcmp(xform_string, + rte_crypto_asym_xform_strings[i]) == 0) { + *xform_enum = (enum rte_crypto_asym_xform_type) i; + return 0; + } + } + + /* Invalid string */ + return -1; +} + +/** + * The crypto auth operation strings identifiers. + * It could be used in application command line. 
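A small sketch of how the string-to-enum helpers above are typically driven from command-line style input (error handling is illustrative only):

enum rte_crypto_cipher_algorithm cipher_algo;
enum rte_crypto_auth_algorithm auth_algo;

if (rte_cryptodev_get_cipher_algo_enum(&cipher_algo, "aes-cbc") < 0 ||
    rte_cryptodev_get_auth_algo_enum(&auth_algo, "sha2-256-hmac") < 0)
	printf("unrecognised algorithm string\n");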
+ */ +const char * +rte_crypto_auth_operation_strings[] = { + [RTE_CRYPTO_AUTH_OP_VERIFY] = "verify", + [RTE_CRYPTO_AUTH_OP_GENERATE] = "generate" +}; + +const struct rte_cryptodev_symmetric_capability * +rte_cryptodev_sym_capability_get(uint8_t dev_id, + const struct rte_cryptodev_sym_capability_idx *idx) +{ + const struct rte_cryptodev_capabilities *capability; + struct rte_cryptodev_info dev_info; + int i = 0; + + rte_cryptodev_info_get(dev_id, &dev_info); + + while ((capability = &dev_info.capabilities[i++])->op != + RTE_CRYPTO_OP_TYPE_UNDEFINED) { + if (capability->op != RTE_CRYPTO_OP_TYPE_SYMMETRIC) + continue; + + if (capability->sym.xform_type != idx->type) + continue; + + if (idx->type == RTE_CRYPTO_SYM_XFORM_AUTH && + capability->sym.auth.algo == idx->algo.auth) + return &capability->sym; + + if (idx->type == RTE_CRYPTO_SYM_XFORM_CIPHER && + capability->sym.cipher.algo == idx->algo.cipher) + return &capability->sym; + + if (idx->type == RTE_CRYPTO_SYM_XFORM_AEAD && + capability->sym.aead.algo == idx->algo.aead) + return &capability->sym; + } + + return NULL; + +} + +static int +param_range_check(uint16_t size, const struct rte_crypto_param_range *range) +{ + unsigned int next_size; + + /* Check lower/upper bounds */ + if (size < range->min) + return -1; + + if (size > range->max) + return -1; + + /* If range is actually only one value, size is correct */ + if (range->increment == 0) + return 0; + + /* Check if value is one of the supported sizes */ + for (next_size = range->min; next_size <= range->max; + next_size += range->increment) + if (size == next_size) + return 0; + + return -1; +} + +const struct rte_cryptodev_asymmetric_xform_capability * __rte_experimental +rte_cryptodev_asym_capability_get(uint8_t dev_id, + const struct rte_cryptodev_asym_capability_idx *idx) +{ + const struct rte_cryptodev_capabilities *capability; + struct rte_cryptodev_info dev_info; + unsigned int i = 0; + + memset(&dev_info, 0, sizeof(struct rte_cryptodev_info)); + rte_cryptodev_info_get(dev_id, &dev_info); + + while ((capability = &dev_info.capabilities[i++])->op != + RTE_CRYPTO_OP_TYPE_UNDEFINED) { + if (capability->op != RTE_CRYPTO_OP_TYPE_ASYMMETRIC) + continue; + + if (capability->asym.xform_capa.xform_type == idx->type) + return &capability->asym.xform_capa; + } + return NULL; +}; + +int +rte_cryptodev_sym_capability_check_cipher( + const struct rte_cryptodev_symmetric_capability *capability, + uint16_t key_size, uint16_t iv_size) +{ + if (param_range_check(key_size, &capability->cipher.key_size) != 0) + return -1; + + if (param_range_check(iv_size, &capability->cipher.iv_size) != 0) + return -1; + + return 0; +} + +int +rte_cryptodev_sym_capability_check_auth( + const struct rte_cryptodev_symmetric_capability *capability, + uint16_t key_size, uint16_t digest_size, uint16_t iv_size) +{ + if (param_range_check(key_size, &capability->auth.key_size) != 0) + return -1; + + if (param_range_check(digest_size, &capability->auth.digest_size) != 0) + return -1; + + if (param_range_check(iv_size, &capability->auth.iv_size) != 0) + return -1; + + return 0; +} + +int +rte_cryptodev_sym_capability_check_aead( + const struct rte_cryptodev_symmetric_capability *capability, + uint16_t key_size, uint16_t digest_size, uint16_t aad_size, + uint16_t iv_size) +{ + if (param_range_check(key_size, &capability->aead.key_size) != 0) + return -1; + + if (param_range_check(digest_size, &capability->aead.digest_size) != 0) + return -1; + + if (param_range_check(aad_size, &capability->aead.aad_size) != 0) + return -1; 
+ + if (param_range_check(iv_size, &capability->aead.iv_size) != 0) + return -1; + + return 0; +} +int __rte_experimental +rte_cryptodev_asym_xform_capability_check_optype( + const struct rte_cryptodev_asymmetric_xform_capability *capability, + enum rte_crypto_asym_op_type op_type) +{ + if (capability->op_types & (1 << op_type)) + return 1; + + return 0; +} + +int __rte_experimental +rte_cryptodev_asym_xform_capability_check_modlen( + const struct rte_cryptodev_asymmetric_xform_capability *capability, + uint16_t modlen) +{ + /* no need to check for limits, if min or max = 0 */ + if (capability->modlen.min != 0) { + if (modlen < capability->modlen.min) + return -1; + } + + if (capability->modlen.max != 0) { + if (modlen > capability->modlen.max) + return -1; + } + + /* in any case, check if given modlen is module increment */ + if (capability->modlen.increment != 0) { + if (modlen % (capability->modlen.increment)) + return -1; + } + + return 0; +} + + +const char * +rte_cryptodev_get_feature_name(uint64_t flag) +{ + switch (flag) { + case RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO: + return "SYMMETRIC_CRYPTO"; + case RTE_CRYPTODEV_FF_ASYMMETRIC_CRYPTO: + return "ASYMMETRIC_CRYPTO"; + case RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING: + return "SYM_OPERATION_CHAINING"; + case RTE_CRYPTODEV_FF_CPU_SSE: + return "CPU_SSE"; + case RTE_CRYPTODEV_FF_CPU_AVX: + return "CPU_AVX"; + case RTE_CRYPTODEV_FF_CPU_AVX2: + return "CPU_AVX2"; + case RTE_CRYPTODEV_FF_CPU_AVX512: + return "CPU_AVX512"; + case RTE_CRYPTODEV_FF_CPU_AESNI: + return "CPU_AESNI"; + case RTE_CRYPTODEV_FF_HW_ACCELERATED: + return "HW_ACCELERATED"; + case RTE_CRYPTODEV_FF_IN_PLACE_SGL: + return "IN_PLACE_SGL"; + case RTE_CRYPTODEV_FF_OOP_SGL_IN_SGL_OUT: + return "OOP_SGL_IN_SGL_OUT"; + case RTE_CRYPTODEV_FF_OOP_SGL_IN_LB_OUT: + return "OOP_SGL_IN_LB_OUT"; + case RTE_CRYPTODEV_FF_OOP_LB_IN_SGL_OUT: + return "OOP_LB_IN_SGL_OUT"; + case RTE_CRYPTODEV_FF_OOP_LB_IN_LB_OUT: + return "OOP_LB_IN_LB_OUT"; + case RTE_CRYPTODEV_FF_CPU_NEON: + return "CPU_NEON"; + case RTE_CRYPTODEV_FF_CPU_ARM_CE: + return "CPU_ARM_CE"; + case RTE_CRYPTODEV_FF_SECURITY: + return "SECURITY_PROTOCOL"; + default: + return NULL; + } +} + +struct rte_cryptodev * +rte_cryptodev_pmd_get_dev(uint8_t dev_id) +{ + return &rte_cryptodev_globals->devs[dev_id]; +} + +struct rte_cryptodev * +rte_cryptodev_pmd_get_named_dev(const char *name) +{ + struct rte_cryptodev *dev; + unsigned int i; + + if (name == NULL) + return NULL; + + for (i = 0; i < rte_cryptodev_globals->max_devs; i++) { + dev = &rte_cryptodev_globals->devs[i]; + + if ((dev->attached == RTE_CRYPTODEV_ATTACHED) && + (strcmp(dev->data->name, name) == 0)) + return dev; + } + + return NULL; +} + +unsigned int +rte_cryptodev_pmd_is_valid_dev(uint8_t dev_id) +{ + struct rte_cryptodev *dev = NULL; + + if (dev_id >= rte_cryptodev_globals->nb_devs) + return 0; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + if (dev->attached != RTE_CRYPTODEV_ATTACHED) + return 0; + else + return 1; +} + + +int +rte_cryptodev_get_dev_id(const char *name) +{ + unsigned i; + + if (name == NULL) + return -1; + + for (i = 0; i < rte_cryptodev_globals->nb_devs; i++) + if ((strcmp(rte_cryptodev_globals->devs[i].data->name, name) + == 0) && + (rte_cryptodev_globals->devs[i].attached == + RTE_CRYPTODEV_ATTACHED)) + return i; + + return -1; +} + +uint8_t +rte_cryptodev_count(void) +{ + return rte_cryptodev_globals->nb_devs; +} + +uint8_t +rte_cryptodev_device_count_by_driver(uint8_t driver_id) +{ + uint8_t i, dev_count = 0; + + for (i = 0; i < 
rte_cryptodev_globals->max_devs; i++) + if (rte_cryptodev_globals->devs[i].driver_id == driver_id && + rte_cryptodev_globals->devs[i].attached == + RTE_CRYPTODEV_ATTACHED) + dev_count++; + + return dev_count; +} + +uint8_t +rte_cryptodev_devices_get(const char *driver_name, uint8_t *devices, + uint8_t nb_devices) +{ + uint8_t i, count = 0; + struct rte_cryptodev *devs = rte_cryptodev_globals->devs; + uint8_t max_devs = rte_cryptodev_globals->max_devs; + + for (i = 0; i < max_devs && count < nb_devices; i++) { + + if (devs[i].attached == RTE_CRYPTODEV_ATTACHED) { + int cmp; + + cmp = strncmp(devs[i].device->driver->name, + driver_name, + strlen(driver_name)); + + if (cmp == 0) + devices[count++] = devs[i].data->dev_id; + } + } + + return count; +} + +void * +rte_cryptodev_get_sec_ctx(uint8_t dev_id) +{ + if (rte_crypto_devices[dev_id].feature_flags & + RTE_CRYPTODEV_FF_SECURITY) + return rte_crypto_devices[dev_id].security_ctx; + + return NULL; +} + +int +rte_cryptodev_socket_id(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) + return -1; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + return dev->data->socket_id; +} + +static inline int +rte_cryptodev_data_alloc(uint8_t dev_id, struct rte_cryptodev_data **data, + int socket_id) +{ + char mz_name[RTE_CRYPTODEV_NAME_MAX_LEN]; + const struct rte_memzone *mz; + int n; + + /* generate memzone name */ + n = snprintf(mz_name, sizeof(mz_name), "rte_cryptodev_data_%u", dev_id); + if (n >= (int)sizeof(mz_name)) + return -EINVAL; + + mz = rte_memzone_lookup(mz_name); + if (mz == NULL) { + mz = rte_memzone_reserve(mz_name, + sizeof(struct rte_cryptodev_data), + socket_id, 0); + if (mz != NULL) { + memset(mz->addr, 0, sizeof(struct rte_cryptodev_data)); + } else if (rte_errno == EEXIST) { + /* + * The memzone was created by another process between the failed + * lookup and the subsequent reserve. So try to look it up again. 
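A sketch combining the enumeration helpers above; "crypto_aesni_mb" is only an example driver name, and the dev-id array is sized by RTE_CRYPTO_MAX_DEVS:

uint8_t ids[RTE_CRYPTO_MAX_DEVS];
uint8_t n = rte_cryptodev_devices_get("crypto_aesni_mb", ids, RTE_DIM(ids));
uint8_t i;

for (i = 0; i < n; i++)
	printf("dev %u \"%s\" on socket %d\n", ids[i],
	       rte_cryptodev_name_get(ids[i]),
	       rte_cryptodev_socket_id(ids[i]));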
+ */ + mz = rte_memzone_lookup(mz_name); + } + } + + if (mz == NULL) + return -ENOMEM; + + *data = mz->addr; + + return 0; +} + +static uint8_t +rte_cryptodev_find_free_device_index(void) +{ + uint8_t dev_id; + + for (dev_id = 0; dev_id < RTE_CRYPTO_MAX_DEVS; dev_id++) { + if (rte_crypto_devices[dev_id].attached == + RTE_CRYPTODEV_DETACHED) + return dev_id; + } + return RTE_CRYPTO_MAX_DEVS; +} + +struct rte_cryptodev * +rte_cryptodev_pmd_allocate(const char *name, int socket_id) +{ + struct rte_cryptodev *cryptodev; + uint8_t dev_id; + + if (rte_cryptodev_pmd_get_named_dev(name) != NULL) { + CDEV_LOG_ERR("Crypto device with name %s already " + "allocated!", name); + return NULL; + } + + dev_id = rte_cryptodev_find_free_device_index(); + if (dev_id == RTE_CRYPTO_MAX_DEVS) { + CDEV_LOG_ERR("Reached maximum number of crypto devices"); + return NULL; + } + + cryptodev = rte_cryptodev_pmd_get_dev(dev_id); + + if (cryptodev->data == NULL) { + struct rte_cryptodev_data *cryptodev_data = + cryptodev_globals.data[dev_id]; + + int retval = rte_cryptodev_data_alloc(dev_id, &cryptodev_data, + socket_id); + + if (retval < 0 || cryptodev_data == NULL) + return NULL; + + cryptodev->data = cryptodev_data; + + snprintf(cryptodev->data->name, RTE_CRYPTODEV_NAME_MAX_LEN, + "%s", name); + + cryptodev->data->dev_id = dev_id; + cryptodev->data->socket_id = socket_id; + cryptodev->data->dev_started = 0; + + /* init user callbacks */ + TAILQ_INIT(&(cryptodev->link_intr_cbs)); + + cryptodev->attached = RTE_CRYPTODEV_ATTACHED; + + cryptodev_globals.nb_devs++; + } + + return cryptodev; +} + +int +rte_cryptodev_pmd_release_device(struct rte_cryptodev *cryptodev) +{ + int ret; + + if (cryptodev == NULL) + return -EINVAL; + + /* Close device only if device operations have been set */ + if (cryptodev->dev_ops) { + ret = rte_cryptodev_close(cryptodev->data->dev_id); + if (ret < 0) + return ret; + } + + cryptodev->attached = RTE_CRYPTODEV_DETACHED; + cryptodev_globals.nb_devs--; + return 0; +} + +uint16_t +rte_cryptodev_queue_pair_count(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + + dev = &rte_crypto_devices[dev_id]; + return dev->data->nb_queue_pairs; +} + +static int +rte_cryptodev_queue_pairs_config(struct rte_cryptodev *dev, uint16_t nb_qpairs, + int socket_id) +{ + struct rte_cryptodev_info dev_info; + void **qp; + unsigned i; + + if ((dev == NULL) || (nb_qpairs < 1)) { + CDEV_LOG_ERR("invalid param: dev %p, nb_queues %u", + dev, nb_qpairs); + return -EINVAL; + } + + CDEV_LOG_DEBUG("Setup %d queues pairs on device %u", + nb_qpairs, dev->data->dev_id); + + memset(&dev_info, 0, sizeof(struct rte_cryptodev_info)); + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP); + (*dev->dev_ops->dev_infos_get)(dev, &dev_info); + + if (nb_qpairs > (dev_info.max_nb_queue_pairs)) { + CDEV_LOG_ERR("Invalid num queue_pairs (%u) for dev %u", + nb_qpairs, dev->data->dev_id); + return -EINVAL; + } + + if (dev->data->queue_pairs == NULL) { /* first time configuration */ + dev->data->queue_pairs = rte_zmalloc_socket( + "cryptodev->queue_pairs", + sizeof(dev->data->queue_pairs[0]) * nb_qpairs, + RTE_CACHE_LINE_SIZE, socket_id); + + if (dev->data->queue_pairs == NULL) { + dev->data->nb_queue_pairs = 0; + CDEV_LOG_ERR("failed to get memory for qp meta data, " + "nb_queues %u", + nb_qpairs); + return -(ENOMEM); + } + } else { /* re-configure */ + int ret; + uint16_t old_nb_queues = dev->data->nb_queue_pairs; + + qp = dev->data->queue_pairs; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_pair_release, + -ENOTSUP); + + for 
(i = nb_qpairs; i < old_nb_queues; i++) { + ret = (*dev->dev_ops->queue_pair_release)(dev, i); + if (ret < 0) + return ret; + } + + qp = rte_realloc(qp, sizeof(qp[0]) * nb_qpairs, + RTE_CACHE_LINE_SIZE); + if (qp == NULL) { + CDEV_LOG_ERR("failed to realloc qp meta data," + " nb_queues %u", nb_qpairs); + return -(ENOMEM); + } + + if (nb_qpairs > old_nb_queues) { + uint16_t new_qs = nb_qpairs - old_nb_queues; + + memset(qp + old_nb_queues, 0, + sizeof(qp[0]) * new_qs); + } + + dev->data->queue_pairs = qp; + + } + dev->data->nb_queue_pairs = nb_qpairs; + return 0; +} + +int +rte_cryptodev_configure(uint8_t dev_id, struct rte_cryptodev_config *config) +{ + struct rte_cryptodev *dev; + int diag; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); + return -EINVAL; + } + + dev = &rte_crypto_devices[dev_id]; + + if (dev->data->dev_started) { + CDEV_LOG_ERR( + "device %d must be stopped to allow configuration", dev_id); + return -EBUSY; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP); + + /* Setup new number of queue pairs and reconfigure device. */ + diag = rte_cryptodev_queue_pairs_config(dev, config->nb_queue_pairs, + config->socket_id); + if (diag != 0) { + CDEV_LOG_ERR("dev%d rte_crypto_dev_queue_pairs_config = %d", + dev_id, diag); + return diag; + } + + return (*dev->dev_ops->dev_configure)(dev, config); +} + + +int +rte_cryptodev_start(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + int diag; + + CDEV_LOG_DEBUG("Start dev_id=%" PRIu8, dev_id); + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); + return -EINVAL; + } + + dev = &rte_crypto_devices[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_start, -ENOTSUP); + + if (dev->data->dev_started != 0) { + CDEV_LOG_ERR("Device with dev_id=%" PRIu8 " already started", + dev_id); + return 0; + } + + diag = (*dev->dev_ops->dev_start)(dev); + if (diag == 0) + dev->data->dev_started = 1; + else + return diag; + + return 0; +} + +void +rte_cryptodev_stop(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); + return; + } + + dev = &rte_crypto_devices[dev_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_stop); + + if (dev->data->dev_started == 0) { + CDEV_LOG_ERR("Device with dev_id=%" PRIu8 " already stopped", + dev_id); + return; + } + + (*dev->dev_ops->dev_stop)(dev); + dev->data->dev_started = 0; +} + +int +rte_cryptodev_close(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + int retval; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); + return -1; + } + + dev = &rte_crypto_devices[dev_id]; + + /* Device must be stopped before it can be closed */ + if (dev->data->dev_started == 1) { + CDEV_LOG_ERR("Device %u must be stopped before closing", + dev_id); + return -EBUSY; + } + + /* We can't close the device if there are outstanding sessions in use */ + if (dev->data->session_pool != NULL) { + if (!rte_mempool_full(dev->data->session_pool)) { + CDEV_LOG_ERR("dev_id=%u close failed, session mempool " + "has sessions still in use, free " + "all sessions before calling close", + (unsigned)dev_id); + return -EBUSY; + } + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP); + retval = (*dev->dev_ops->dev_close)(dev); + + if (retval < 0) + return retval; + + return 0; +} + +int +rte_cryptodev_queue_pair_setup(uint8_t dev_id, uint16_t queue_pair_id, + const struct 
rte_cryptodev_qp_conf *qp_conf, int socket_id, + struct rte_mempool *session_pool) + +{ + struct rte_cryptodev *dev; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); + return -EINVAL; + } + + dev = &rte_crypto_devices[dev_id]; + if (queue_pair_id >= dev->data->nb_queue_pairs) { + CDEV_LOG_ERR("Invalid queue_pair_id=%d", queue_pair_id); + return -EINVAL; + } + + if (dev->data->dev_started) { + CDEV_LOG_ERR( + "device %d must be stopped to allow configuration", dev_id); + return -EBUSY; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_pair_setup, -ENOTSUP); + + return (*dev->dev_ops->queue_pair_setup)(dev, queue_pair_id, qp_conf, + socket_id, session_pool); +} + + +int +rte_cryptodev_stats_get(uint8_t dev_id, struct rte_cryptodev_stats *stats) +{ + struct rte_cryptodev *dev; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%d", dev_id); + return -ENODEV; + } + + if (stats == NULL) { + CDEV_LOG_ERR("Invalid stats ptr"); + return -EINVAL; + } + + dev = &rte_crypto_devices[dev_id]; + memset(stats, 0, sizeof(*stats)); + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP); + (*dev->dev_ops->stats_get)(dev, stats); + return 0; +} + +void +rte_cryptodev_stats_reset(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); + return; + } + + dev = &rte_crypto_devices[dev_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->stats_reset); + (*dev->dev_ops->stats_reset)(dev); +} + + +void +rte_cryptodev_info_get(uint8_t dev_id, struct rte_cryptodev_info *dev_info) +{ + struct rte_cryptodev *dev; + + if (dev_id >= cryptodev_globals.nb_devs) { + CDEV_LOG_ERR("Invalid dev_id=%d", dev_id); + return; + } + + dev = &rte_crypto_devices[dev_id]; + + memset(dev_info, 0, sizeof(struct rte_cryptodev_info)); + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_infos_get); + (*dev->dev_ops->dev_infos_get)(dev, dev_info); + + dev_info->driver_name = dev->device->driver->name; + dev_info->device = dev->device; +} + + +int +rte_cryptodev_callback_register(uint8_t dev_id, + enum rte_cryptodev_event_type event, + rte_cryptodev_cb_fn cb_fn, void *cb_arg) +{ + struct rte_cryptodev *dev; + struct rte_cryptodev_callback *user_cb; + + if (!cb_fn) + return -EINVAL; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); + return -EINVAL; + } + + dev = &rte_crypto_devices[dev_id]; + rte_spinlock_lock(&rte_cryptodev_cb_lock); + + TAILQ_FOREACH(user_cb, &(dev->link_intr_cbs), next) { + if (user_cb->cb_fn == cb_fn && + user_cb->cb_arg == cb_arg && + user_cb->event == event) { + break; + } + } + + /* create a new callback. */ + if (user_cb == NULL) { + user_cb = rte_zmalloc("INTR_USER_CALLBACK", + sizeof(struct rte_cryptodev_callback), 0); + if (user_cb != NULL) { + user_cb->cb_fn = cb_fn; + user_cb->cb_arg = cb_arg; + user_cb->event = event; + TAILQ_INSERT_TAIL(&(dev->link_intr_cbs), user_cb, next); + } + } + + rte_spinlock_unlock(&rte_cryptodev_cb_lock); + return (user_cb == NULL) ? 
-ENOMEM : 0; +} + +int +rte_cryptodev_callback_unregister(uint8_t dev_id, + enum rte_cryptodev_event_type event, + rte_cryptodev_cb_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_cryptodev *dev; + struct rte_cryptodev_callback *cb, *next; + + if (!cb_fn) + return -EINVAL; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { + CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); + return -EINVAL; + } + + dev = &rte_crypto_devices[dev_id]; + rte_spinlock_lock(&rte_cryptodev_cb_lock); + + ret = 0; + for (cb = TAILQ_FIRST(&dev->link_intr_cbs); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn != cb_fn || cb->event != event || + (cb->cb_arg != (void *)-1 && + cb->cb_arg != cb_arg)) + continue; + + /* + * if this callback is not executing right now, + * then remove it. + */ + if (cb->active == 0) { + TAILQ_REMOVE(&(dev->link_intr_cbs), cb, next); + rte_free(cb); + } else { + ret = -EAGAIN; + } + } + + rte_spinlock_unlock(&rte_cryptodev_cb_lock); + return ret; +} + +void +rte_cryptodev_pmd_callback_process(struct rte_cryptodev *dev, + enum rte_cryptodev_event_type event) +{ + struct rte_cryptodev_callback *cb_lst; + struct rte_cryptodev_callback dev_cb; + + rte_spinlock_lock(&rte_cryptodev_cb_lock); + TAILQ_FOREACH(cb_lst, &(dev->link_intr_cbs), next) { + if (cb_lst->cb_fn == NULL || cb_lst->event != event) + continue; + dev_cb = *cb_lst; + cb_lst->active = 1; + rte_spinlock_unlock(&rte_cryptodev_cb_lock); + dev_cb.cb_fn(dev->data->dev_id, dev_cb.event, + dev_cb.cb_arg); + rte_spinlock_lock(&rte_cryptodev_cb_lock); + cb_lst->active = 0; + } + rte_spinlock_unlock(&rte_cryptodev_cb_lock); +} + + +int +rte_cryptodev_sym_session_init(uint8_t dev_id, + struct rte_cryptodev_sym_session *sess, + struct rte_crypto_sym_xform *xforms, + struct rte_mempool *mp) +{ + struct rte_cryptodev *dev; + uint8_t index; + int ret; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (sess == NULL || xforms == NULL || dev == NULL) + return -EINVAL; + + index = dev->driver_id; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->sym_session_configure, -ENOTSUP); + + if (sess->sess_private_data[index] == NULL) { + ret = dev->dev_ops->sym_session_configure(dev, xforms, + sess, mp); + if (ret < 0) { + CDEV_LOG_ERR( + "dev_id %d failed to configure session details", + dev_id); + return ret; + } + } + + return 0; +} + +int __rte_experimental +rte_cryptodev_asym_session_init(uint8_t dev_id, + struct rte_cryptodev_asym_session *sess, + struct rte_crypto_asym_xform *xforms, + struct rte_mempool *mp) +{ + struct rte_cryptodev *dev; + uint8_t index; + int ret; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (sess == NULL || xforms == NULL || dev == NULL) + return -EINVAL; + + index = dev->driver_id; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->asym_session_configure, + -ENOTSUP); + + if (sess->sess_private_data[index] == NULL) { + ret = dev->dev_ops->asym_session_configure(dev, + xforms, + sess, mp); + if (ret < 0) { + CDEV_LOG_ERR( + "dev_id %d failed to configure session details", + dev_id); + return ret; + } + } + + return 0; +} + +struct rte_cryptodev_sym_session * +rte_cryptodev_sym_session_create(struct rte_mempool *mp) +{ + struct rte_cryptodev_sym_session *sess; + + /* Allocate a session structure from the session pool */ + if (rte_mempool_get(mp, (void **)&sess)) { + CDEV_LOG_ERR("couldn't get object from session mempool"); + return NULL; + } + + /* Clear device session pointer. 
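Putting the two session calls above together, a hedged sketch (session_mp and session_priv_mp are mempools the application is assumed to have created, and aead_xf is a prepared transform):

struct rte_cryptodev_sym_session *sess =
	rte_cryptodev_sym_session_create(session_mp);

if (sess == NULL ||
    rte_cryptodev_sym_session_init(dev_id, sess, &aead_xf, session_priv_mp) < 0)
	printf("failed to set up a session on dev %u\n", dev_id);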
+ * Include the flag indicating presence of user data + */ + memset(sess, 0, (sizeof(void *) * nb_drivers) + sizeof(uint8_t)); + + return sess; +} + +struct rte_cryptodev_asym_session * __rte_experimental +rte_cryptodev_asym_session_create(struct rte_mempool *mp) +{ + struct rte_cryptodev_asym_session *sess; + + /* Allocate a session structure from the session pool */ + if (rte_mempool_get(mp, (void **)&sess)) { + CDEV_LOG_ERR("couldn't get object from session mempool"); + return NULL; + } + + /* Clear device session pointer. + * Include the flag indicating presence of private data + */ + memset(sess, 0, (sizeof(void *) * nb_drivers) + sizeof(uint8_t)); + + return sess; +} + +int +rte_cryptodev_sym_session_clear(uint8_t dev_id, + struct rte_cryptodev_sym_session *sess) +{ + struct rte_cryptodev *dev; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (dev == NULL || sess == NULL) + return -EINVAL; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->sym_session_clear, -ENOTSUP); + + dev->dev_ops->sym_session_clear(dev, sess); + + return 0; +} + +int __rte_experimental +rte_cryptodev_asym_session_clear(uint8_t dev_id, + struct rte_cryptodev_asym_session *sess) +{ + struct rte_cryptodev *dev; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (dev == NULL || sess == NULL) + return -EINVAL; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->asym_session_clear, -ENOTSUP); + + dev->dev_ops->asym_session_clear(dev, sess); + + return 0; +} + +int +rte_cryptodev_sym_session_free(struct rte_cryptodev_sym_session *sess) +{ + uint8_t i; + void *sess_priv; + struct rte_mempool *sess_mp; + + if (sess == NULL) + return -EINVAL; + + /* Check that all device private data has been freed */ + for (i = 0; i < nb_drivers; i++) { + sess_priv = get_sym_session_private_data(sess, i); + if (sess_priv != NULL) + return -EBUSY; + } + + /* Return session to mempool */ + sess_mp = rte_mempool_from_obj(sess); + rte_mempool_put(sess_mp, sess); + + return 0; +} + +int __rte_experimental +rte_cryptodev_asym_session_free(struct rte_cryptodev_asym_session *sess) +{ + uint8_t i; + void *sess_priv; + struct rte_mempool *sess_mp; + + if (sess == NULL) + return -EINVAL; + + /* Check that all device private data has been freed */ + for (i = 0; i < nb_drivers; i++) { + sess_priv = get_asym_session_private_data(sess, i); + if (sess_priv != NULL) + return -EBUSY; + } + + /* Return session to mempool */ + sess_mp = rte_mempool_from_obj(sess); + rte_mempool_put(sess_mp, sess); + + return 0; +} + + +unsigned int +rte_cryptodev_sym_get_header_session_size(void) +{ + /* + * Header contains pointers to the private data + * of all registered drivers, and a flag which + * indicates presence of user data + */ + return ((sizeof(void *) * nb_drivers) + sizeof(uint8_t)); +} + +unsigned int __rte_experimental +rte_cryptodev_asym_get_header_session_size(void) +{ + /* + * Header contains pointers to the private data + * of all registered drivers, and a flag which + * indicates presence of private data + */ + return ((sizeof(void *) * nb_drivers) + sizeof(uint8_t)); +} + +unsigned int +rte_cryptodev_sym_get_private_session_size(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + unsigned int header_size = sizeof(void *) * nb_drivers; + unsigned int priv_sess_size; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) + return 0; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (*dev->dev_ops->sym_session_get_size == NULL) + return 0; + + priv_sess_size = (*dev->dev_ops->sym_session_get_size)(dev); + + /* + * If size is less than session header size, + * return the 
latter, as this guarantees that + * sessionless operations will work + */ + if (priv_sess_size < header_size) + return header_size; + + return priv_sess_size; + +} + +unsigned int __rte_experimental +rte_cryptodev_asym_get_private_session_size(uint8_t dev_id) +{ + struct rte_cryptodev *dev; + unsigned int header_size = sizeof(void *) * nb_drivers; + unsigned int priv_sess_size; + + if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) + return 0; + + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (*dev->dev_ops->asym_session_get_size == NULL) + return 0; + + priv_sess_size = (*dev->dev_ops->asym_session_get_size)(dev); + if (priv_sess_size < header_size) + return header_size; + + return priv_sess_size; + +} + +int __rte_experimental +rte_cryptodev_sym_session_set_user_data( + struct rte_cryptodev_sym_session *sess, + void *data, + uint16_t size) +{ + uint16_t off_set = sizeof(void *) * nb_drivers; + uint8_t *user_data_present = (uint8_t *)sess + off_set; + + if (sess == NULL) + return -EINVAL; + + *user_data_present = 1; + off_set += sizeof(uint8_t); + rte_memcpy((uint8_t *)sess + off_set, data, size); + return 0; +} + +void * __rte_experimental +rte_cryptodev_sym_session_get_user_data( + struct rte_cryptodev_sym_session *sess) +{ + uint16_t off_set = sizeof(void *) * nb_drivers; + uint8_t *user_data_present = (uint8_t *)sess + off_set; + + if (sess == NULL || !*user_data_present) + return NULL; + + off_set += sizeof(uint8_t); + return (uint8_t *)sess + off_set; +} + +/** Initialise rte_crypto_op mempool element */ +static void +rte_crypto_op_init(struct rte_mempool *mempool, + void *opaque_arg, + void *_op_data, + __rte_unused unsigned i) +{ + struct rte_crypto_op *op = _op_data; + enum rte_crypto_op_type type = *(enum rte_crypto_op_type *)opaque_arg; + + memset(_op_data, 0, mempool->elt_size); + + __rte_crypto_op_reset(op, type); + + op->phys_addr = rte_mem_virt2iova(_op_data); + op->mempool = mempool; +} + + +struct rte_mempool * +rte_crypto_op_pool_create(const char *name, enum rte_crypto_op_type type, + unsigned nb_elts, unsigned cache_size, uint16_t priv_size, + int socket_id) +{ + struct rte_crypto_op_pool_private *priv; + + unsigned elt_size = sizeof(struct rte_crypto_op) + + priv_size; + + if (type == RTE_CRYPTO_OP_TYPE_SYMMETRIC) { + elt_size += sizeof(struct rte_crypto_sym_op); + } else if (type == RTE_CRYPTO_OP_TYPE_ASYMMETRIC) { + elt_size += sizeof(struct rte_crypto_asym_op); + } else { + CDEV_LOG_ERR("Invalid op_type\n"); + return NULL; + } + + /* lookup mempool in case already allocated */ + struct rte_mempool *mp = rte_mempool_lookup(name); + + if (mp != NULL) { + priv = (struct rte_crypto_op_pool_private *) + rte_mempool_get_priv(mp); + + if (mp->elt_size != elt_size || + mp->cache_size < cache_size || + mp->size < nb_elts || + priv->priv_size < priv_size) { + mp = NULL; + CDEV_LOG_ERR("Mempool %s already exists but with " + "incompatible parameters", name); + return NULL; + } + return mp; + } + + mp = rte_mempool_create( + name, + nb_elts, + elt_size, + cache_size, + sizeof(struct rte_crypto_op_pool_private), + NULL, + NULL, + rte_crypto_op_init, + &type, + socket_id, + 0); + + if (mp == NULL) { + CDEV_LOG_ERR("Failed to create mempool %s", name); + return NULL; + } + + priv = (struct rte_crypto_op_pool_private *) + rte_mempool_get_priv(mp); + + priv->priv_size = priv_size; + priv->type = type; + + return mp; +} + +int +rte_cryptodev_pmd_create_dev_name(char *name, const char *dev_name_prefix) +{ + struct rte_cryptodev *dev = NULL; + uint32_t i = 0; + + if (name == NULL) + 
return -EINVAL; + + for (i = 0; i < RTE_CRYPTO_MAX_DEVS; i++) { + int ret = snprintf(name, RTE_CRYPTODEV_NAME_MAX_LEN, + "%s_%u", dev_name_prefix, i); + + if (ret < 0) + return ret; + + dev = rte_cryptodev_pmd_get_named_dev(name); + if (!dev) + return 0; + } + + return -1; +} + +TAILQ_HEAD(cryptodev_driver_list, cryptodev_driver); + +static struct cryptodev_driver_list cryptodev_driver_list = + TAILQ_HEAD_INITIALIZER(cryptodev_driver_list); + +int +rte_cryptodev_driver_id_get(const char *name) +{ + struct cryptodev_driver *driver; + const char *driver_name; + + if (name == NULL) { + RTE_LOG(DEBUG, CRYPTODEV, "name pointer NULL"); + return -1; + } + + TAILQ_FOREACH(driver, &cryptodev_driver_list, next) { + driver_name = driver->driver->name; + if (strncmp(driver_name, name, strlen(driver_name)) == 0) + return driver->id; + } + return -1; +} + +const char * +rte_cryptodev_name_get(uint8_t dev_id) +{ + struct rte_cryptodev *dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (dev == NULL) + return NULL; + + return dev->data->name; +} + +const char * +rte_cryptodev_driver_name_get(uint8_t driver_id) +{ + struct cryptodev_driver *driver; + + TAILQ_FOREACH(driver, &cryptodev_driver_list, next) + if (driver->id == driver_id) + return driver->driver->name; + return NULL; +} + +uint8_t +rte_cryptodev_allocate_driver(struct cryptodev_driver *crypto_drv, + const struct rte_driver *drv) +{ + crypto_drv->driver = drv; + crypto_drv->id = nb_drivers; + + TAILQ_INSERT_TAIL(&cryptodev_driver_list, crypto_drv, next); + + return nb_drivers++; +} diff --git a/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.h b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.h new file mode 100644 index 00000000..4099823f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev.h @@ -0,0 +1,1197 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015-2017 Intel Corporation. + */ + +#ifndef _RTE_CRYPTODEV_H_ +#define _RTE_CRYPTODEV_H_ + +/** + * @file rte_cryptodev.h + * + * RTE Cryptographic Device APIs + * + * Defines RTE Crypto Device APIs for the provisioning of cipher and + * authentication operations. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "rte_kvargs.h" +#include "rte_crypto.h" +#include "rte_dev.h" +#include +#include + +extern const char **rte_cyptodev_names; + +/* Logging Macros */ + +#define CDEV_LOG_ERR(...) \ + RTE_LOG(ERR, CRYPTODEV, \ + RTE_FMT("%s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + __func__, __LINE__, RTE_FMT_TAIL(__VA_ARGS__,))) + +#define CDEV_LOG_INFO(...) \ + RTE_LOG(INFO, CRYPTODEV, \ + RTE_FMT(RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + RTE_FMT_TAIL(__VA_ARGS__,))) + +#define CDEV_LOG_DEBUG(...) \ + RTE_LOG(DEBUG, CRYPTODEV, \ + RTE_FMT("%s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + __func__, __LINE__, RTE_FMT_TAIL(__VA_ARGS__,))) + +#define CDEV_PMD_TRACE(...) \ + RTE_LOG(DEBUG, CRYPTODEV, \ + RTE_FMT("[%s] %s: " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + dev, __func__, RTE_FMT_TAIL(__VA_ARGS__,))) + +/** + * A macro that points to an offset from the start + * of the crypto operation structure (rte_crypto_op) + * + * The returned pointer is cast to type t. + * + * @param c + * The crypto operation. + * @param o + * The offset from the start of the crypto operation. + * @param t + * The type to cast the result into. 
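A sketch of the driver-id helpers above, using an assumed example driver name:

int drv = rte_cryptodev_driver_id_get("crypto_aesni_mb");

if (drv >= 0)
	printf("driver %d is \"%s\"\n", drv,
	       rte_cryptodev_driver_name_get((uint8_t)drv));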
+ */ +#define rte_crypto_op_ctod_offset(c, t, o) \ + ((t)((char *)(c) + (o))) + +/** + * A macro that returns the physical address that points + * to an offset from the start of the crypto operation + * (rte_crypto_op) + * + * @param c + * The crypto operation. + * @param o + * The offset from the start of the crypto operation + * to calculate address from. + */ +#define rte_crypto_op_ctophys_offset(c, o) \ + (rte_iova_t)((c)->phys_addr + (o)) + +/** + * Crypto parameters range description + */ +struct rte_crypto_param_range { + uint16_t min; /**< minimum size */ + uint16_t max; /**< maximum size */ + uint16_t increment; + /**< if a range of sizes are supported, + * this parameter is used to indicate + * increments in byte size that are supported + * between the minimum and maximum + */ +}; + +/** + * Symmetric Crypto Capability + */ +struct rte_cryptodev_symmetric_capability { + enum rte_crypto_sym_xform_type xform_type; + /**< Transform type : Authentication / Cipher / AEAD */ + RTE_STD_C11 + union { + struct { + enum rte_crypto_auth_algorithm algo; + /**< authentication algorithm */ + uint16_t block_size; + /**< algorithm block size */ + struct rte_crypto_param_range key_size; + /**< auth key size range */ + struct rte_crypto_param_range digest_size; + /**< digest size range */ + struct rte_crypto_param_range aad_size; + /**< Additional authentication data size range */ + struct rte_crypto_param_range iv_size; + /**< Initialisation vector data size range */ + } auth; + /**< Symmetric Authentication transform capabilities */ + struct { + enum rte_crypto_cipher_algorithm algo; + /**< cipher algorithm */ + uint16_t block_size; + /**< algorithm block size */ + struct rte_crypto_param_range key_size; + /**< cipher key size range */ + struct rte_crypto_param_range iv_size; + /**< Initialisation vector data size range */ + } cipher; + /**< Symmetric Cipher transform capabilities */ + struct { + enum rte_crypto_aead_algorithm algo; + /**< AEAD algorithm */ + uint16_t block_size; + /**< algorithm block size */ + struct rte_crypto_param_range key_size; + /**< AEAD key size range */ + struct rte_crypto_param_range digest_size; + /**< digest size range */ + struct rte_crypto_param_range aad_size; + /**< Additional authentication data size range */ + struct rte_crypto_param_range iv_size; + /**< Initialisation vector data size range */ + } aead; + }; +}; + +/** + * Asymmetric Xform Crypto Capability + * + */ +struct rte_cryptodev_asymmetric_xform_capability { + enum rte_crypto_asym_xform_type xform_type; + /**< Transform type: RSA/MODEXP/DH/DSA/MODINV */ + + uint32_t op_types; + /**< bitmask for supported rte_crypto_asym_op_type */ + + __extension__ + union { + struct rte_crypto_param_range modlen; + /**< Range of modulus length supported by modulus based xform. 
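The range convention above reads as: a size is supported when it lies in [min, max] and, if increment is non-zero, equals min plus a whole number of increments. A sketch mirroring the library's internal check:

static int
size_in_range(uint16_t sz, const struct rte_crypto_param_range *r)
{
	if (sz < r->min || sz > r->max)
		return 0;
	if (r->increment == 0)		/* range collapses to one value */
		return 1;
	return ((sz - r->min) % r->increment) == 0;
}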
+ * Value 0 mean implementation default + */ + }; +}; + +/** + * Asymmetric Crypto Capability + * + */ +struct rte_cryptodev_asymmetric_capability { + struct rte_cryptodev_asymmetric_xform_capability xform_capa; +}; + + +/** Structure used to capture a capability of a crypto device */ +struct rte_cryptodev_capabilities { + enum rte_crypto_op_type op; + /**< Operation type */ + + RTE_STD_C11 + union { + struct rte_cryptodev_symmetric_capability sym; + /**< Symmetric operation capability parameters */ + struct rte_cryptodev_asymmetric_capability asym; + /**< Asymmetric operation capability parameters */ + }; +}; + +/** Structure used to describe crypto algorithms */ +struct rte_cryptodev_sym_capability_idx { + enum rte_crypto_sym_xform_type type; + union { + enum rte_crypto_cipher_algorithm cipher; + enum rte_crypto_auth_algorithm auth; + enum rte_crypto_aead_algorithm aead; + } algo; +}; + +/** + * Structure used to describe asymmetric crypto xforms + * Each xform maps to one asym algorithm. + * + */ +struct rte_cryptodev_asym_capability_idx { + enum rte_crypto_asym_xform_type type; + /**< Asymmetric xform (algo) type */ +}; + +/** + * Provide capabilities available for defined device and algorithm + * + * @param dev_id The identifier of the device. + * @param idx Description of crypto algorithms. + * + * @return + * - Return description of the symmetric crypto capability if exist. + * - Return NULL if the capability not exist. + */ +const struct rte_cryptodev_symmetric_capability * +rte_cryptodev_sym_capability_get(uint8_t dev_id, + const struct rte_cryptodev_sym_capability_idx *idx); + +/** + * Provide capabilities available for defined device and xform + * + * @param dev_id The identifier of the device. + * @param idx Description of asym crypto xform. + * + * @return + * - Return description of the asymmetric crypto capability if exist. + * - Return NULL if the capability not exist. + */ +const struct rte_cryptodev_asymmetric_xform_capability * __rte_experimental +rte_cryptodev_asym_capability_get(uint8_t dev_id, + const struct rte_cryptodev_asym_capability_idx *idx); + +/** + * Check if key size and initial vector are supported + * in crypto cipher capability + * + * @param capability Description of the symmetric crypto capability. + * @param key_size Cipher key size. + * @param iv_size Cipher initial vector size. + * + * @return + * - Return 0 if the parameters are in range of the capability. + * - Return -1 if the parameters are out of range of the capability. + */ +int +rte_cryptodev_sym_capability_check_cipher( + const struct rte_cryptodev_symmetric_capability *capability, + uint16_t key_size, uint16_t iv_size); + +/** + * Check if key size and initial vector are supported + * in crypto auth capability + * + * @param capability Description of the symmetric crypto capability. + * @param key_size Auth key size. + * @param digest_size Auth digest size. + * @param iv_size Auth initial vector size. + * + * @return + * - Return 0 if the parameters are in range of the capability. + * - Return -1 if the parameters are out of range of the capability. + */ +int +rte_cryptodev_sym_capability_check_auth( + const struct rte_cryptodev_symmetric_capability *capability, + uint16_t key_size, uint16_t digest_size, uint16_t iv_size); + +/** + * Check if key, digest, AAD and initial vector sizes are supported + * in crypto AEAD capability + * + * @param capability Description of the symmetric crypto capability. + * @param key_size AEAD key size. + * @param digest_size AEAD digest size. 
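A hedged sketch of the capability query/check flow declared above (dev_id is an assumed, already-probed device):

struct rte_cryptodev_sym_capability_idx idx = {
	.type = RTE_CRYPTO_SYM_XFORM_CIPHER,
	.algo.cipher = RTE_CRYPTO_CIPHER_AES_CBC,
};
const struct rte_cryptodev_symmetric_capability *cap =
	rte_cryptodev_sym_capability_get(dev_id, &idx);

if (cap == NULL ||
    rte_cryptodev_sym_capability_check_cipher(cap, 16, 16) != 0)
	printf("AES-128-CBC with a 16-byte IV is not supported on dev %u\n",
	       dev_id);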
+ * @param aad_size AEAD AAD size. + * @param iv_size AEAD IV size. + * + * @return + * - Return 0 if the parameters are in range of the capability. + * - Return -1 if the parameters are out of range of the capability. + */ +int +rte_cryptodev_sym_capability_check_aead( + const struct rte_cryptodev_symmetric_capability *capability, + uint16_t key_size, uint16_t digest_size, uint16_t aad_size, + uint16_t iv_size); + +/** + * Check if op type is supported + * + * @param capability Description of the asymmetric crypto capability. + * @param op_type op type + * + * @return + * - Return 1 if the op type is supported + * - Return 0 if unsupported + */ +int __rte_experimental +rte_cryptodev_asym_xform_capability_check_optype( + const struct rte_cryptodev_asymmetric_xform_capability *capability, + enum rte_crypto_asym_op_type op_type); + +/** + * Check if modulus length is in supported range + * + * @param capability Description of the asymmetric crypto capability. + * @param modlen modulus length. + * + * @return + * - Return 0 if the parameters are in range of the capability. + * - Return -1 if the parameters are out of range of the capability. + */ +int __rte_experimental +rte_cryptodev_asym_xform_capability_check_modlen( + const struct rte_cryptodev_asymmetric_xform_capability *capability, + uint16_t modlen); + +/** + * Provide the cipher algorithm enum, given an algorithm string + * + * @param algo_enum A pointer to the cipher algorithm + * enum to be filled + * @param algo_string Authentication algo string + * + * @return + * - Return -1 if string is not valid + * - Return 0 is the string is valid + */ +int +rte_cryptodev_get_cipher_algo_enum(enum rte_crypto_cipher_algorithm *algo_enum, + const char *algo_string); + +/** + * Provide the authentication algorithm enum, given an algorithm string + * + * @param algo_enum A pointer to the authentication algorithm + * enum to be filled + * @param algo_string Authentication algo string + * + * @return + * - Return -1 if string is not valid + * - Return 0 is the string is valid + */ +int +rte_cryptodev_get_auth_algo_enum(enum rte_crypto_auth_algorithm *algo_enum, + const char *algo_string); + +/** + * Provide the AEAD algorithm enum, given an algorithm string + * + * @param algo_enum A pointer to the AEAD algorithm + * enum to be filled + * @param algo_string AEAD algorithm string + * + * @return + * - Return -1 if string is not valid + * - Return 0 is the string is valid + */ +int +rte_cryptodev_get_aead_algo_enum(enum rte_crypto_aead_algorithm *algo_enum, + const char *algo_string); + +/** + * Provide the Asymmetric xform enum, given an xform string + * + * @param xform_enum A pointer to the xform type + * enum to be filled + * @param xform_string xform string + * + * @return + * - Return -1 if string is not valid + * - Return 0 if the string is valid + */ +int __rte_experimental +rte_cryptodev_asym_get_xform_enum(enum rte_crypto_asym_xform_type *xform_enum, + const char *xform_string); + + +/** Macro used at end of crypto PMD list */ +#define RTE_CRYPTODEV_END_OF_CAPABILITIES_LIST() \ + { RTE_CRYPTO_OP_TYPE_UNDEFINED } + + +/** + * Crypto device supported feature flags + * + * Note: + * New features flags should be added to the end of the list + * + * Keep these flags synchronised with rte_cryptodev_get_feature_name() + */ +#define RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO (1ULL << 0) +/**< Symmetric crypto operations are supported */ +#define RTE_CRYPTODEV_FF_ASYMMETRIC_CRYPTO (1ULL << 1) +/**< Asymmetric crypto operations are supported */ +#define 
RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING (1ULL << 2) +/**< Chaining symmetric crypto operations are supported */ +#define RTE_CRYPTODEV_FF_CPU_SSE (1ULL << 3) +/**< Utilises CPU SIMD SSE instructions */ +#define RTE_CRYPTODEV_FF_CPU_AVX (1ULL << 4) +/**< Utilises CPU SIMD AVX instructions */ +#define RTE_CRYPTODEV_FF_CPU_AVX2 (1ULL << 5) +/**< Utilises CPU SIMD AVX2 instructions */ +#define RTE_CRYPTODEV_FF_CPU_AESNI (1ULL << 6) +/**< Utilises CPU AES-NI instructions */ +#define RTE_CRYPTODEV_FF_HW_ACCELERATED (1ULL << 7) +/**< Operations are off-loaded to an + * external hardware accelerator + */ +#define RTE_CRYPTODEV_FF_CPU_AVX512 (1ULL << 8) +/**< Utilises CPU SIMD AVX512 instructions */ +#define RTE_CRYPTODEV_FF_IN_PLACE_SGL (1ULL << 9) +/**< In-place Scatter-gather (SGL) buffers, with multiple segments, + * are supported + */ +#define RTE_CRYPTODEV_FF_OOP_SGL_IN_SGL_OUT (1ULL << 10) +/**< Out-of-place Scatter-gather (SGL) buffers are + * supported in input and output + */ +#define RTE_CRYPTODEV_FF_OOP_SGL_IN_LB_OUT (1ULL << 11) +/**< Out-of-place Scatter-gather (SGL) buffers are supported + * in input, combined with linear buffers (LB), with a + * single segment in output + */ +#define RTE_CRYPTODEV_FF_OOP_LB_IN_SGL_OUT (1ULL << 12) +/**< Out-of-place Scatter-gather (SGL) buffers are supported + * in output, combined with linear buffers (LB) in input + */ +#define RTE_CRYPTODEV_FF_OOP_LB_IN_LB_OUT (1ULL << 13) +/**< Out-of-place linear buffers (LB) are supported in input and output */ +#define RTE_CRYPTODEV_FF_CPU_NEON (1ULL << 14) +/**< Utilises CPU NEON instructions */ +#define RTE_CRYPTODEV_FF_CPU_ARM_CE (1ULL << 15) +/**< Utilises ARM CPU Cryptographic Extensions */ +#define RTE_CRYPTODEV_FF_SECURITY (1ULL << 16) +/**< Support Security Protocol Processing */ + + +/** + * Get the name of a crypto device feature flag + * + * @param flag The mask describing the flag. + * + * @return + * The name of this flag, or NULL if it's not a valid feature flag. + */ + +extern const char * +rte_cryptodev_get_feature_name(uint64_t flag); + +/** Crypto device information */ +struct rte_cryptodev_info { + const char *driver_name; /**< Driver name. */ + uint8_t driver_id; /**< Driver identifier */ + struct rte_device *device; /**< Generic device information. */ + + uint64_t feature_flags; + /**< Feature flags exposes HW/SW features for the given device */ + + const struct rte_cryptodev_capabilities *capabilities; + /**< Array of devices supported capabilities */ + + unsigned max_nb_queue_pairs; + /**< Maximum number of queues pairs supported by device. */ + + uint16_t min_mbuf_headroom_req; + /**< Minimum mbuf headroom required by device */ + + uint16_t min_mbuf_tailroom_req; + /**< Minimum mbuf tailroom required by device */ + + struct { + unsigned max_nb_sessions; + /**< Maximum number of sessions supported by device. + * If 0, the device does not have any limitation in + * number of sessions that can be used. + */ + } sym; +}; + +#define RTE_CRYPTODEV_DETACHED (0) +#define RTE_CRYPTODEV_ATTACHED (1) + +/** Definitions of Crypto device event types */ +enum rte_cryptodev_event_type { + RTE_CRYPTODEV_EVENT_UNKNOWN, /**< unknown event type */ + RTE_CRYPTODEV_EVENT_ERROR, /**< error interrupt event */ + RTE_CRYPTODEV_EVENT_MAX /**< max value of this enum */ +}; + +/** Crypto device queue pair configuration structure. 
*/ +struct rte_cryptodev_qp_conf { + uint32_t nb_descriptors; /**< Number of descriptors per queue pair */ +}; + +/** + * Typedef for application callback function to be registered by application + * software for notification of device events + * + * @param dev_id Crypto device identifier + * @param event Crypto device event to register for notification of. + * @param cb_arg User specified parameter to be passed as to passed to + * users callback function. + */ +typedef void (*rte_cryptodev_cb_fn)(uint8_t dev_id, + enum rte_cryptodev_event_type event, void *cb_arg); + + +/** Crypto Device statistics */ +struct rte_cryptodev_stats { + uint64_t enqueued_count; + /**< Count of all operations enqueued */ + uint64_t dequeued_count; + /**< Count of all operations dequeued */ + + uint64_t enqueue_err_count; + /**< Total error count on operations enqueued */ + uint64_t dequeue_err_count; + /**< Total error count on operations dequeued */ +}; + +#define RTE_CRYPTODEV_NAME_MAX_LEN (64) +/**< Max length of name of crypto PMD */ + +/** + * Get the device identifier for the named crypto device. + * + * @param name device name to select the device structure. + * + * @return + * - Returns crypto device identifier on success. + * - Return -1 on failure to find named crypto device. + */ +extern int +rte_cryptodev_get_dev_id(const char *name); + +/** + * Get the crypto device name given a device identifier. + * + * @param dev_id + * The identifier of the device + * + * @return + * - Returns crypto device name. + * - Returns NULL if crypto device is not present. + */ +extern const char * +rte_cryptodev_name_get(uint8_t dev_id); + +/** + * Get the total number of crypto devices that have been successfully + * initialised. + * + * @return + * - The total number of usable crypto devices. + */ +extern uint8_t +rte_cryptodev_count(void); + +/** + * Get number of crypto device defined type. + * + * @param driver_id driver identifier. + * + * @return + * Returns number of crypto device. + */ +extern uint8_t +rte_cryptodev_device_count_by_driver(uint8_t driver_id); + +/** + * Get number and identifiers of attached crypto devices that + * use the same crypto driver. + * + * @param driver_name driver name. + * @param devices output devices identifiers. + * @param nb_devices maximal number of devices. + * + * @return + * Returns number of attached crypto device. + */ +uint8_t +rte_cryptodev_devices_get(const char *driver_name, uint8_t *devices, + uint8_t nb_devices); +/* + * Return the NUMA socket to which a device is connected + * + * @param dev_id + * The identifier of the device + * @return + * The NUMA socket id to which the device is connected or + * a default of zero if the socket could not be determined. + * -1 if returned is the dev_id value is out of range. + */ +extern int +rte_cryptodev_socket_id(uint8_t dev_id); + +/** Crypto device configuration structure */ +struct rte_cryptodev_config { + int socket_id; /**< Socket to allocate resources on */ + uint16_t nb_queue_pairs; + /**< Number of queue pairs to configure on device */ +}; + +/** + * Configure a device. + * + * This function must be invoked first before any other function in the + * API. This function can also be re-invoked when a device is in the + * stopped state. + * + * @param dev_id The identifier of the device to configure. + * @param config The crypto device configuration structure. + * + * @return + * - 0: Success, device configured. + * - <0: Error code returned by the driver configuration function. 
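+ *
+ * An illustrative sketch of the usual bring-up order - configure, set up
+ * the queue pairs, then start; the single queue pair, the descriptor count
+ * and the session_pool mempool are assumptions, and error handling is
+ * trimmed for brevity:
+ *
+ *     struct rte_cryptodev_config conf = {
+ *             .socket_id = rte_cryptodev_socket_id(dev_id),
+ *             .nb_queue_pairs = 1,
+ *     };
+ *     struct rte_cryptodev_qp_conf qp_conf = { .nb_descriptors = 2048 };
+ *
+ *     if (rte_cryptodev_configure(dev_id, &conf) < 0)
+ *             return -1;
+ *     if (rte_cryptodev_queue_pair_setup(dev_id, 0, &qp_conf,
+ *                     conf.socket_id, session_pool) < 0)
+ *             return -1;
+ *     if (rte_cryptodev_start(dev_id) < 0)
+ *             return -1;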
+ */ +extern int +rte_cryptodev_configure(uint8_t dev_id, struct rte_cryptodev_config *config); + +/** + * Start a device. + * + * The device start step is the last one and consists of setting the configured + * offload features and starting the transmit and the receive units of the + * device. + * On success, all basic functions exported by the API (link status, + * receive/transmit, and so on) can be invoked. + * + * @param dev_id + * The identifier of the device. + * @return + * - 0: Success, device started. + * - <0: Error code of the driver device start function. + */ +extern int +rte_cryptodev_start(uint8_t dev_id); + +/** + * Stop a device. The device can be restarted with a call to + * rte_cryptodev_start(). + * + * @param dev_id The identifier of the device. + */ +extern void +rte_cryptodev_stop(uint8_t dev_id); + +/** + * Close a device. The device cannot be restarted! + * + * @param dev_id The identifier of the device. + * + * @return + * - 0 on successfully closing device + * - <0 on failure to close device + */ +extern int +rte_cryptodev_close(uint8_t dev_id); + +/** + * Allocate and set up a receive queue pair for a device. + * + * + * @param dev_id The identifier of the device. + * @param queue_pair_id The index of the queue pair to set up. The + * value must be in the range [0, nb_queue_pair + * - 1] previously supplied to + * rte_cryptodev_configure(). + * @param qp_conf The pointer to the configuration data to be + * used for the queue pair. NULL value is + * allowed, in which case default configuration + * will be used. + * @param socket_id The *socket_id* argument is the socket + * identifier in case of NUMA. The value can be + * *SOCKET_ID_ANY* if there is no NUMA constraint + * for the DMA memory allocated for the receive + * queue pair. + * @param session_pool Pointer to device session mempool, used + * for session-less operations. + * + * @return + * - 0: Success, queue pair correctly set up. + * - <0: Queue pair configuration failed + */ +extern int +rte_cryptodev_queue_pair_setup(uint8_t dev_id, uint16_t queue_pair_id, + const struct rte_cryptodev_qp_conf *qp_conf, int socket_id, + struct rte_mempool *session_pool); + +/** + * Get the number of queue pairs on a specific crypto device + * + * @param dev_id Crypto device identifier. + * @return + * - The number of configured queue pairs. + */ +extern uint16_t +rte_cryptodev_queue_pair_count(uint8_t dev_id); + + +/** + * Retrieve the general I/O statistics of a device. + * + * @param dev_id The identifier of the device. + * @param stats A pointer to a structure of type + * *rte_cryptodev_stats* to be filled with the + * values of device counters. + * @return + * - Zero if successful. + * - Non-zero otherwise. + */ +extern int +rte_cryptodev_stats_get(uint8_t dev_id, struct rte_cryptodev_stats *stats); + +/** + * Reset the general I/O statistics of a device. + * + * @param dev_id The identifier of the device. + */ +extern void +rte_cryptodev_stats_reset(uint8_t dev_id); + +/** + * Retrieve the contextual information of a device. + * + * @param dev_id The identifier of the device. + * @param dev_info A pointer to a structure of type + * *rte_cryptodev_info* to be filled with the + * contextual information of the device. + * + * @note The capabilities field of dev_info is set to point to the first + * element of an array of struct rte_cryptodev_capabilities. The element after + * the last valid element has its op field set to + * RTE_CRYPTO_OP_TYPE_UNDEFINED.
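+ *
+ * A hedged sketch (illustrative values only) of walking that capability
+ * array and validating one cipher; dev_id, AES-CBC and the 16 byte
+ * key/IV sizes are examples, not requirements:
+ *
+ *     struct rte_cryptodev_info info;
+ *     const struct rte_cryptodev_capabilities *cap;
+ *
+ *     rte_cryptodev_info_get(dev_id, &info);
+ *     for (cap = info.capabilities;
+ *                     cap->op != RTE_CRYPTO_OP_TYPE_UNDEFINED; cap++) {
+ *             if (cap->op == RTE_CRYPTO_OP_TYPE_SYMMETRIC &&
+ *                 cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_CIPHER &&
+ *                 cap->sym.cipher.algo == RTE_CRYPTO_CIPHER_AES_CBC &&
+ *                 rte_cryptodev_sym_capability_check_cipher(&cap->sym,
+ *                             16, 16) == 0)  // 16 byte key and IV in range
+ *                     break;
+ *     }
+ *
+ * The same answer can be obtained without the loop by filling a
+ * struct rte_cryptodev_sym_capability_idx and calling
+ * rte_cryptodev_sym_capability_get().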
+ */ +extern void +rte_cryptodev_info_get(uint8_t dev_id, struct rte_cryptodev_info *dev_info); + + +/** + * Register a callback function for specific device id. + * + * @param dev_id Device id. + * @param event Event interested. + * @param cb_fn User supplied callback function to be called. + * @param cb_arg Pointer to the parameters for the registered + * callback. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +extern int +rte_cryptodev_callback_register(uint8_t dev_id, + enum rte_cryptodev_event_type event, + rte_cryptodev_cb_fn cb_fn, void *cb_arg); + +/** + * Unregister a callback function for specific device id. + * + * @param dev_id The device identifier. + * @param event Event interested. + * @param cb_fn User supplied callback function to be called. + * @param cb_arg Pointer to the parameters for the registered + * callback. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +extern int +rte_cryptodev_callback_unregister(uint8_t dev_id, + enum rte_cryptodev_event_type event, + rte_cryptodev_cb_fn cb_fn, void *cb_arg); + + +typedef uint16_t (*dequeue_pkt_burst_t)(void *qp, + struct rte_crypto_op **ops, uint16_t nb_ops); +/**< Dequeue processed packets from queue pair of a device. */ + +typedef uint16_t (*enqueue_pkt_burst_t)(void *qp, + struct rte_crypto_op **ops, uint16_t nb_ops); +/**< Enqueue packets for processing on queue pair of a device. */ + + + + +struct rte_cryptodev_callback; + +/** Structure to keep track of registered callbacks */ +TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback); + +/** The data structure associated with each crypto device. */ +struct rte_cryptodev { + dequeue_pkt_burst_t dequeue_burst; + /**< Pointer to PMD receive function. */ + enqueue_pkt_burst_t enqueue_burst; + /**< Pointer to PMD transmit function. */ + + struct rte_cryptodev_data *data; + /**< Pointer to device data */ + struct rte_cryptodev_ops *dev_ops; + /**< Functions exported by PMD */ + uint64_t feature_flags; + /**< Feature flags exposes HW/SW features for the given device */ + struct rte_device *device; + /**< Backing device */ + + uint8_t driver_id; + /**< Crypto driver identifier*/ + + struct rte_cryptodev_cb_list link_intr_cbs; + /**< User application callback for interrupts if present */ + + void *security_ctx; + /**< Context for security ops */ + + __extension__ + uint8_t attached : 1; + /**< Flag indicating the device is attached */ +} __rte_cache_aligned; + +void * +rte_cryptodev_get_sec_ctx(uint8_t dev_id); + +/** + * + * The data part, with no function pointers, associated with each device. + * + * This structure is safe to place in shared memory to be common among + * different processes in a multi-process configuration. + */ +struct rte_cryptodev_data { + uint8_t dev_id; + /**< Device ID for this instance */ + uint8_t socket_id; + /**< Socket ID where memory is allocated */ + char name[RTE_CRYPTODEV_NAME_MAX_LEN]; + /**< Unique identifier name */ + + __extension__ + uint8_t dev_started : 1; + /**< Device state: STARTED(1)/STOPPED(0) */ + + struct rte_mempool *session_pool; + /**< Session memory pool */ + void **queue_pairs; + /**< Array of pointers to queue pairs. */ + uint16_t nb_queue_pairs; + /**< Number of device queue pairs. */ + + void *dev_private; + /**< PMD-specific private data */ +} __rte_cache_aligned; + +extern struct rte_cryptodev *rte_cryptodevs; +/** + * + * Dequeue a burst of processed crypto operations from a queue on the crypto + * device. 
The dequeued operation are stored in *rte_crypto_op* structures + * whose pointers are supplied in the *ops* array. + * + * The rte_cryptodev_dequeue_burst() function returns the number of ops + * actually dequeued, which is the number of *rte_crypto_op* data structures + * effectively supplied into the *ops* array. + * + * A return value equal to *nb_ops* indicates that the queue contained + * at least *nb_ops* operations, and this is likely to signify that other + * processed operations remain in the devices output queue. Applications + * implementing a "retrieve as many processed operations as possible" policy + * can check this specific case and keep invoking the + * rte_cryptodev_dequeue_burst() function until a value less than + * *nb_ops* is returned. + * + * The rte_cryptodev_dequeue_burst() function does not provide any error + * notification to avoid the corresponding overhead. + * + * @param dev_id The symmetric crypto device identifier + * @param qp_id The index of the queue pair from which to + * retrieve processed packets. The value must be + * in the range [0, nb_queue_pair - 1] previously + * supplied to rte_cryptodev_configure(). + * @param ops The address of an array of pointers to + * *rte_crypto_op* structures that must be + * large enough to store *nb_ops* pointers in it. + * @param nb_ops The maximum number of operations to dequeue. + * + * @return + * - The number of operations actually dequeued, which is the number + * of pointers to *rte_crypto_op* structures effectively supplied to the + * *ops* array. + */ +static inline uint16_t +rte_cryptodev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, + struct rte_crypto_op **ops, uint16_t nb_ops) +{ + struct rte_cryptodev *dev = &rte_cryptodevs[dev_id]; + + nb_ops = (*dev->dequeue_burst) + (dev->data->queue_pairs[qp_id], ops, nb_ops); + + return nb_ops; +} + +/** + * Enqueue a burst of operations for processing on a crypto device. + * + * The rte_cryptodev_enqueue_burst() function is invoked to place + * crypto operations on the queue *qp_id* of the device designated by + * its *dev_id*. + * + * The *nb_ops* parameter is the number of operations to process which are + * supplied in the *ops* array of *rte_crypto_op* structures. + * + * The rte_cryptodev_enqueue_burst() function returns the number of + * operations it actually enqueued for processing. A return value equal to + * *nb_ops* means that all packets have been enqueued. + * + * @param dev_id The identifier of the device. + * @param qp_id The index of the queue pair which packets are + * to be enqueued for processing. The value + * must be in the range [0, nb_queue_pairs - 1] + * previously supplied to + * *rte_cryptodev_configure*. + * @param ops The address of an array of *nb_ops* pointers + * to *rte_crypto_op* structures which contain + * the crypto operations to be processed. + * @param nb_ops The number of operations to process. + * + * @return + * The number of operations actually enqueued on the crypto device. The return + * value can be less than the value of the *nb_ops* parameter when the + * crypto devices queue is full or if invalid parameters are specified in + * a *rte_crypto_op*. + */ +static inline uint16_t +rte_cryptodev_enqueue_burst(uint8_t dev_id, uint16_t qp_id, + struct rte_crypto_op **ops, uint16_t nb_ops) +{ + struct rte_cryptodev *dev = &rte_cryptodevs[dev_id]; + + return (*dev->enqueue_burst)( + dev->data->queue_pairs[qp_id], ops, nb_ops); +} + + +/** Cryptodev symmetric crypto session + * Each session is derived from a fixed xform chain. 
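+ *
+ * As an aside, a hedged sketch of the burst enqueue/dequeue loop
+ * documented above; queue pair 0, the burst size and the way ops[] is
+ * filled are assumptions, and real applications add their own retry and
+ * error policies:
+ *
+ *     struct rte_crypto_op *ops[32];
+ *     uint16_t nb = 32, sent, got = 0;
+ *
+ *     // ... ops[0..nb-1] point at prepared rte_crypto_op structures ...
+ *     sent = rte_cryptodev_enqueue_burst(dev_id, 0, ops, nb);
+ *     while (got < sent)
+ *             got += rte_cryptodev_dequeue_burst(dev_id, 0,
+ *                             &ops[got], sent - got);
+ *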
Therefore each session + * has a fixed algo, key, op-type, digest_len etc. + */ +struct rte_cryptodev_sym_session { + __extension__ void *sess_private_data[0]; + /**< Private symmetric session material */ +}; + +/** Cryptodev asymmetric crypto session */ +struct rte_cryptodev_asym_session { + __extension__ void *sess_private_data[0]; + /**< Private asymmetric session material */ +}; + +/** + * Create symmetric crypto session header (generic with no private data) + * + * @param mempool Symmetric session mempool to allocate session + * objects from + * @return + * - On success return pointer to sym-session + * - On failure returns NULL + */ +struct rte_cryptodev_sym_session * +rte_cryptodev_sym_session_create(struct rte_mempool *mempool); + +/** + * Create asymmetric crypto session header (generic with no private data) + * + * @param mempool mempool to allocate asymmetric session + * objects from + * @return + * - On success return pointer to asym-session + * - On failure returns NULL + */ +struct rte_cryptodev_asym_session * __rte_experimental +rte_cryptodev_asym_session_create(struct rte_mempool *mempool); + +/** + * Frees symmetric crypto session header, after checking that all + * the device private data has been freed, returning it + * to its original mempool. + * + * @param sess Session header to be freed. + * + * @return + * - 0 if successful. + * - -EINVAL if session is NULL. + * - -EBUSY if not all device private data has been freed. + */ +int +rte_cryptodev_sym_session_free(struct rte_cryptodev_sym_session *sess); + +/** + * Frees asymmetric crypto session header, after checking that all + * the device private data has been freed, returning it + * to its original mempool. + * + * @param sess Session header to be freed. + * + * @return + * - 0 if successful. + * - -EINVAL if session is NULL. + * - -EBUSY if not all device private data has been freed. + */ +int __rte_experimental +rte_cryptodev_asym_session_free(struct rte_cryptodev_asym_session *sess); + +/** + * Fill out private data for the device id, based on its device type. + * + * @param dev_id ID of device that we want the session to be used on + * @param sess Session where the private data will be attached to + * @param xforms Symmetric crypto transform operations to apply on flow + * processed with this session + * @param mempool Mempool where the private data is allocated. + * + * @return + * - On success, zero. + * - -EINVAL if input parameters are invalid. + * - -ENOTSUP if crypto device does not support the crypto transform or + * does not support symmetric operations. + * - -ENOMEM if the private session could not be allocated. + */ +int +rte_cryptodev_sym_session_init(uint8_t dev_id, + struct rte_cryptodev_sym_session *sess, + struct rte_crypto_sym_xform *xforms, + struct rte_mempool *mempool); + +/** + * Initialize asymmetric session on a device with specific asymmetric xform + * + * @param dev_id ID of device that we want the session to be used on + * @param sess Session to be set up on a device + * @param xforms Asymmetric crypto transform operations to apply on flow + * processed with this session + * @param mempool Mempool to be used for internal allocation. + * + * @return + * - On success, zero. + * - -EINVAL if input parameters are invalid. + * - -ENOTSUP if crypto device does not support the crypto transform. + * - -ENOMEM if the private session could not be allocated. 
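+ *
+ * A minimal sketch of the create/init/clear/free lifecycle shared by the
+ * symmetric and asymmetric session APIs, shown here with the symmetric
+ * calls; sess_pool and the previously built xform chain are assumptions,
+ * not upstream code:
+ *
+ *     struct rte_cryptodev_sym_session *sess;
+ *
+ *     sess = rte_cryptodev_sym_session_create(sess_pool);
+ *     if (sess == NULL)
+ *             return -1;
+ *     if (rte_cryptodev_sym_session_init(dev_id, sess,
+ *                     &xform, sess_pool) < 0) {
+ *             rte_cryptodev_sym_session_free(sess);
+ *             return -1;
+ *     }
+ *     // ... attach sess to rte_crypto_op structures and enqueue them ...
+ *     rte_cryptodev_sym_session_clear(dev_id, sess);
+ *     rte_cryptodev_sym_session_free(sess);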
+ */ +int __rte_experimental +rte_cryptodev_asym_session_init(uint8_t dev_id, + struct rte_cryptodev_asym_session *sess, + struct rte_crypto_asym_xform *xforms, + struct rte_mempool *mempool); + +/** + * Frees private data for the device id, based on its device type, + * returning it to its mempool. It is the application's responsibility + * to ensure that private session data is not cleared while there are + * still in-flight operations using it. + * + * @param dev_id ID of device that uses the session. + * @param sess Session containing the reference to the private data + * + * @return + * - 0 if successful. + * - -EINVAL if device is invalid or session is NULL. + * - -ENOTSUP if crypto device does not support symmetric operations. + */ +int +rte_cryptodev_sym_session_clear(uint8_t dev_id, + struct rte_cryptodev_sym_session *sess); + +/** + * Frees resources held by asymmetric session during rte_cryptodev_session_init + * + * @param dev_id ID of device that uses the asymmetric session. + * @param sess Asymmetric session setup on device using + * rte_cryptodev_session_init + * @return + * - 0 if successful. + * - -EINVAL if device is invalid or session is NULL. + */ +int __rte_experimental +rte_cryptodev_asym_session_clear(uint8_t dev_id, + struct rte_cryptodev_asym_session *sess); + +/** + * Get the size of the header session, for all registered drivers. + * + * @return + * Size of the symmetric eader session. + */ +unsigned int +rte_cryptodev_sym_get_header_session_size(void); + +/** + * Get the size of the asymmetric session header, for all registered drivers. + * + * @return + * Size of the asymmetric header session. + */ +unsigned int __rte_experimental +rte_cryptodev_asym_get_header_session_size(void); + +/** + * Get the size of the private symmetric session data + * for a device. + * + * @param dev_id The device identifier. + * + * @return + * - Size of the private data, if successful + * - 0 if device is invalid or does not have private + * symmetric session + */ +unsigned int +rte_cryptodev_sym_get_private_session_size(uint8_t dev_id); + +/** + * Get the size of the private data for asymmetric session + * on device + * + * @param dev_id The device identifier. + * + * @return + * - Size of the asymmetric private data, if successful + * - 0 if device is invalid or does not have private session + */ +unsigned int __rte_experimental +rte_cryptodev_asym_get_private_session_size(uint8_t dev_id); + +/** + * Provide driver identifier. + * + * @param name + * The pointer to a driver name. + * @return + * The driver type identifier or -1 if no driver found + */ +int rte_cryptodev_driver_id_get(const char *name); + +/** + * Provide driver name. + * + * @param driver_id + * The driver identifier. + * @return + * The driver name or null if no driver found + */ +const char *rte_cryptodev_driver_name_get(uint8_t driver_id); + +/** + * Store user data in a session. + * + * @param sess Session pointer allocated by + * *rte_cryptodev_sym_session_create*. + * @param data Pointer to the user data. + * @param size Size of the user data. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_cryptodev_sym_session_set_user_data( + struct rte_cryptodev_sym_session *sess, + void *data, + uint16_t size); + +/** + * Get user data stored in a session. + * + * @param sess Session pointer allocated by + * *rte_cryptodev_sym_session_create*. + * + * @return + * - On success return pointer to user data. + * - On failure returns NULL. 
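+ *
+ * A usage sketch of the pair of user-data helpers above; the stored value
+ * and its lifetime handling are assumptions, so treat this strictly as an
+ * illustration:
+ *
+ *     uint64_t flow_id = 7;
+ *
+ *     if (rte_cryptodev_sym_session_set_user_data(sess,
+ *                     &flow_id, sizeof(flow_id)) == 0) {
+ *             void *p = rte_cryptodev_sym_session_get_user_data(sess);
+ *             // p refers to the user data associated with sess
+ *     }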
+ */ +void * __rte_experimental +rte_cryptodev_sym_session_get_user_data( + struct rte_cryptodev_sym_session *sess); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CRYPTODEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.c b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.c new file mode 100644 index 00000000..8952616e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.c @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include + +#include "rte_cryptodev_pmd.h" + +/** + * Parse name from argument + */ +static int +rte_cryptodev_pmd_parse_name_arg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + struct rte_cryptodev_pmd_init_params *params = extra_args; + int n; + + n = snprintf(params->name, RTE_CRYPTODEV_NAME_MAX_LEN, "%s", value); + if (n >= RTE_CRYPTODEV_NAME_MAX_LEN) + return -EINVAL; + + return 0; +} + +/** + * Parse unsigned integer from argument + */ +static int +rte_cryptodev_pmd_parse_uint_arg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + int i; + char *end; + errno = 0; + + i = strtol(value, &end, 10); + if (*end != 0 || errno != 0 || i < 0) + return -EINVAL; + + *((uint32_t *)extra_args) = i; + return 0; +} + +int +rte_cryptodev_pmd_parse_input_args( + struct rte_cryptodev_pmd_init_params *params, + const char *args) +{ + struct rte_kvargs *kvlist = NULL; + int ret = 0; + + if (params == NULL) + return -EINVAL; + + if (args) { + kvlist = rte_kvargs_parse(args, cryptodev_pmd_valid_params); + if (kvlist == NULL) + return -EINVAL; + + ret = rte_kvargs_process(kvlist, + RTE_CRYPTODEV_PMD_MAX_NB_QP_ARG, + &rte_cryptodev_pmd_parse_uint_arg, + ¶ms->max_nb_queue_pairs); + if (ret < 0) + goto free_kvlist; + + ret = rte_kvargs_process(kvlist, + RTE_CRYPTODEV_PMD_SOCKET_ID_ARG, + &rte_cryptodev_pmd_parse_uint_arg, + ¶ms->socket_id); + if (ret < 0) + goto free_kvlist; + + ret = rte_kvargs_process(kvlist, + RTE_CRYPTODEV_PMD_NAME_ARG, + &rte_cryptodev_pmd_parse_name_arg, + params); + if (ret < 0) + goto free_kvlist; + } + +free_kvlist: + rte_kvargs_free(kvlist); + return ret; +} + +struct rte_cryptodev * +rte_cryptodev_pmd_create(const char *name, + struct rte_device *device, + struct rte_cryptodev_pmd_init_params *params) +{ + struct rte_cryptodev *cryptodev; + + if (params->name[0] != '\0') { + CDEV_LOG_INFO("[%s] User specified device name = %s\n", + device->driver->name, params->name); + name = params->name; + } + + CDEV_LOG_INFO("[%s] - Creating cryptodev %s\n", + device->driver->name, name); + + CDEV_LOG_INFO("[%s] - Initialisation parameters - name: %s," + "socket id: %d, max queue pairs: %u", + device->driver->name, name, + params->socket_id, params->max_nb_queue_pairs); + + /* allocate device structure */ + cryptodev = rte_cryptodev_pmd_allocate(name, params->socket_id); + if (cryptodev == NULL) { + CDEV_LOG_ERR("[%s] Failed to allocate crypto device for %s", + device->driver->name, name); + return NULL; + } + + /* allocate private device structure */ + if (cryptodev->data->dev_private == NULL) { + cryptodev->data->dev_private = + rte_zmalloc_socket("cryptodev device private", + params->private_data_size, + RTE_CACHE_LINE_SIZE, + params->socket_id); + + if (cryptodev->data->dev_private == NULL) { + CDEV_LOG_ERR("[%s] Cannot allocate memory for " + "cryptodev %s private data", + device->driver->name, name); + + rte_cryptodev_pmd_release_device(cryptodev); + return NULL; + } + } + + cryptodev->device = device; + + /* 
initialise user call-back tail queue */ + TAILQ_INIT(&(cryptodev->link_intr_cbs)); + + return cryptodev; +} + +int +rte_cryptodev_pmd_destroy(struct rte_cryptodev *cryptodev) +{ + int retval; + + CDEV_LOG_INFO("[%s] Closing crypto device %s", + cryptodev->device->driver->name, + cryptodev->device->name); + + /* free crypto device */ + retval = rte_cryptodev_pmd_release_device(cryptodev); + if (retval) + return retval; + + rte_free(cryptodev->data->dev_private); + cryptodev->data->dev_private = NULL; + + + cryptodev->device = NULL; + cryptodev->data = NULL; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.h b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.h new file mode 100644 index 00000000..6ff49d64 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_pmd.h @@ -0,0 +1,509 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015-2016 Intel Corporation. + */ + +#ifndef _RTE_CRYPTODEV_PMD_H_ +#define _RTE_CRYPTODEV_PMD_H_ + +/** @file + * RTE Crypto PMD APIs + * + * @note + * These API are from crypto PMD only and user applications should not call + * them directly. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "rte_crypto.h" +#include "rte_cryptodev.h" + + +#define RTE_CRYPTODEV_PMD_DEFAULT_MAX_NB_QUEUE_PAIRS 8 + +#define RTE_CRYPTODEV_PMD_NAME_ARG ("name") +#define RTE_CRYPTODEV_PMD_MAX_NB_QP_ARG ("max_nb_queue_pairs") +#define RTE_CRYPTODEV_PMD_SOCKET_ID_ARG ("socket_id") + + +static const char * const cryptodev_pmd_valid_params[] = { + RTE_CRYPTODEV_PMD_NAME_ARG, + RTE_CRYPTODEV_PMD_MAX_NB_QP_ARG, + RTE_CRYPTODEV_PMD_SOCKET_ID_ARG +}; + +/** + * @internal + * Initialisation parameters for crypto devices + */ +struct rte_cryptodev_pmd_init_params { + char name[RTE_CRYPTODEV_NAME_MAX_LEN]; + size_t private_data_size; + int socket_id; + unsigned int max_nb_queue_pairs; +}; + +/** Global structure used for maintaining state of allocated crypto devices */ +struct rte_cryptodev_global { + struct rte_cryptodev *devs; /**< Device information array */ + struct rte_cryptodev_data *data[RTE_CRYPTO_MAX_DEVS]; + /**< Device private data */ + uint8_t nb_devs; /**< Number of devices found */ + uint8_t max_devs; /**< Max number of devices */ +}; + +/* Cryptodev driver, containing the driver ID */ +struct cryptodev_driver { + TAILQ_ENTRY(cryptodev_driver) next; /**< Next in list. */ + const struct rte_driver *driver; + uint8_t id; +}; + +/** pointer to global crypto devices data structure. */ +extern struct rte_cryptodev_global *rte_cryptodev_globals; + +/** + * Get the rte_cryptodev structure device pointer for the device. Assumes a + * valid device index. + * + * @param dev_id Device ID value to select the device structure. + * + * @return + * - The rte_cryptodev structure pointer for the given device ID. + */ +struct rte_cryptodev * +rte_cryptodev_pmd_get_dev(uint8_t dev_id); + +/** + * Get the rte_cryptodev structure device pointer for the named device. + * + * @param name device name to select the device structure. + * + * @return + * - The rte_cryptodev structure pointer for the given device ID. + */ +struct rte_cryptodev * +rte_cryptodev_pmd_get_named_dev(const char *name); + +/** + * Validate if the crypto device index is valid attached crypto device. + * + * @param dev_id Crypto device index. + * + * @return + * - If the device index is valid (1) or not (0). 
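+ *
+ * A hedged sketch of how rte_cryptodev_pmd_parse_input_args() and
+ * rte_cryptodev_pmd_create() are typically combined in a vdev probe
+ * routine; the driver, the mycrypto_private structure, the rte_bus_vdev.h
+ * accessors and the omitted dev_ops wiring are assumptions:
+ *
+ *     static int
+ *     mycrypto_probe(struct rte_vdev_device *vdev)
+ *     {
+ *             struct rte_cryptodev_pmd_init_params init_params = {
+ *                     "",
+ *                     sizeof(struct mycrypto_private),
+ *                     rte_socket_id(),
+ *                     RTE_CRYPTODEV_PMD_DEFAULT_MAX_NB_QUEUE_PAIRS
+ *             };
+ *             struct rte_cryptodev *dev;
+ *
+ *             rte_cryptodev_pmd_parse_input_args(&init_params,
+ *                             rte_vdev_device_args(vdev));
+ *             dev = rte_cryptodev_pmd_create(rte_vdev_device_name(vdev),
+ *                             &vdev->device, &init_params);
+ *             if (dev == NULL)
+ *                     return -1;
+ *             // set dev->dev_ops, enqueue/dequeue burst handlers, etc.
+ *             return 0;
+ *     }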
+ */ +unsigned int +rte_cryptodev_pmd_is_valid_dev(uint8_t dev_id); + +/** + * The pool of rte_cryptodev structures. + */ +extern struct rte_cryptodev *rte_cryptodevs; + + +/** + * Definitions of all functions exported by a driver through the + * the generic structure of type *crypto_dev_ops* supplied in the + * *rte_cryptodev* structure associated with a device. + */ + +/** + * Function used to configure device. + * + * @param dev Crypto device pointer + * config Crypto device configurations + * + * @return Returns 0 on success + */ +typedef int (*cryptodev_configure_t)(struct rte_cryptodev *dev, + struct rte_cryptodev_config *config); + +/** + * Function used to start a configured device. + * + * @param dev Crypto device pointer + * + * @return Returns 0 on success + */ +typedef int (*cryptodev_start_t)(struct rte_cryptodev *dev); + +/** + * Function used to stop a configured device. + * + * @param dev Crypto device pointer + */ +typedef void (*cryptodev_stop_t)(struct rte_cryptodev *dev); + +/** + * Function used to close a configured device. + * + * @param dev Crypto device pointer + * @return + * - 0 on success. + * - EAGAIN if can't close as device is busy + */ +typedef int (*cryptodev_close_t)(struct rte_cryptodev *dev); + + +/** + * Function used to get statistics of a device. + * + * @param dev Crypto device pointer + * @param stats Pointer to crypto device stats structure to populate + */ +typedef void (*cryptodev_stats_get_t)(struct rte_cryptodev *dev, + struct rte_cryptodev_stats *stats); + + +/** + * Function used to reset statistics of a device. + * + * @param dev Crypto device pointer + */ +typedef void (*cryptodev_stats_reset_t)(struct rte_cryptodev *dev); + + +/** + * Function used to get specific information of a device. + * + * @param dev Crypto device pointer + */ +typedef void (*cryptodev_info_get_t)(struct rte_cryptodev *dev, + struct rte_cryptodev_info *dev_info); + +/** + * Setup a queue pair for a device. + * + * @param dev Crypto device pointer + * @param qp_id Queue Pair Index + * @param qp_conf Queue configuration structure + * @param socket_id Socket Index + * @param session_pool Pointer to device session mempool + * + * @return Returns 0 on success. + */ +typedef int (*cryptodev_queue_pair_setup_t)(struct rte_cryptodev *dev, + uint16_t qp_id, const struct rte_cryptodev_qp_conf *qp_conf, + int socket_id, struct rte_mempool *session_pool); + +/** + * Release memory resources allocated by given queue pair. + * + * @param dev Crypto device pointer + * @param qp_id Queue Pair Index + * + * @return + * - 0 on success. + * - EAGAIN if can't close as device is busy + */ +typedef int (*cryptodev_queue_pair_release_t)(struct rte_cryptodev *dev, + uint16_t qp_id); + +/** + * Get number of available queue pairs of a device. + * + * @param dev Crypto device pointer + * + * @return Returns number of queue pairs on success. + */ +typedef uint32_t (*cryptodev_queue_pair_count_t)(struct rte_cryptodev *dev); + +/** + * Create a session mempool to allocate sessions from + * + * @param dev Crypto device pointer + * @param nb_objs number of sessions objects in mempool + * @param obj_cache l-core object cache size, see *rte_ring_create* + * @param socket_id Socket Id to allocate mempool on. 
+ * + * @return + * - On success returns a pointer to a rte_mempool + * - On failure returns a NULL pointer + */ +typedef int (*cryptodev_sym_create_session_pool_t)( + struct rte_cryptodev *dev, unsigned nb_objs, + unsigned obj_cache_size, int socket_id); + + +/** + * Get the size of a cryptodev session + * + * @param dev Crypto device pointer + * + * @return + * - On success returns the size of the session structure for device + * - On failure returns 0 + */ +typedef unsigned (*cryptodev_sym_get_session_private_size_t)( + struct rte_cryptodev *dev); +/** + * Get the size of a asymmetric cryptodev session + * + * @param dev Crypto device pointer + * + * @return + * - On success returns the size of the session structure for device + * - On failure returns 0 + */ +typedef unsigned int (*cryptodev_asym_get_session_private_size_t)( + struct rte_cryptodev *dev); + +/** + * Configure a Crypto session on a device. + * + * @param dev Crypto device pointer + * @param xform Single or chain of crypto xforms + * @param priv_sess Pointer to cryptodev's private session structure + * @param mp Mempool where the private session is allocated + * + * @return + * - Returns 0 if private session structure have been created successfully. + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if crypto device does not support the crypto transform. + * - Returns -ENOMEM if the private session could not be allocated. + */ +typedef int (*cryptodev_sym_configure_session_t)(struct rte_cryptodev *dev, + struct rte_crypto_sym_xform *xform, + struct rte_cryptodev_sym_session *session, + struct rte_mempool *mp); +/** + * Configure a Crypto asymmetric session on a device. + * + * @param dev Crypto device pointer + * @param xform Single or chain of crypto xforms + * @param priv_sess Pointer to cryptodev's private session structure + * @param mp Mempool where the private session is allocated + * + * @return + * - Returns 0 if private session structure have been created successfully. + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if crypto device does not support the crypto transform. + * - Returns -ENOMEM if the private session could not be allocated. + */ +typedef int (*cryptodev_asym_configure_session_t)(struct rte_cryptodev *dev, + struct rte_crypto_asym_xform *xform, + struct rte_cryptodev_asym_session *session, + struct rte_mempool *mp); +/** + * Free driver private session data. + * + * @param dev Crypto device pointer + * @param sess Cryptodev session structure + */ +typedef void (*cryptodev_sym_free_session_t)(struct rte_cryptodev *dev, + struct rte_cryptodev_sym_session *sess); +/** + * Free asymmetric session private data. + * + * @param dev Crypto device pointer + * @param sess Cryptodev session structure + */ +typedef void (*cryptodev_asym_free_session_t)(struct rte_cryptodev *dev, + struct rte_cryptodev_asym_session *sess); + +/** Crypto device operations function pointer table */ +struct rte_cryptodev_ops { + cryptodev_configure_t dev_configure; /**< Configure device. */ + cryptodev_start_t dev_start; /**< Start device. */ + cryptodev_stop_t dev_stop; /**< Stop device. */ + cryptodev_close_t dev_close; /**< Close device. */ + + cryptodev_info_get_t dev_infos_get; /**< Get device info. */ + + cryptodev_stats_get_t stats_get; + /**< Get device statistics. */ + cryptodev_stats_reset_t stats_reset; + /**< Reset device statistics. */ + + cryptodev_queue_pair_setup_t queue_pair_setup; + /**< Set up a device queue pair. 
*/ + cryptodev_queue_pair_release_t queue_pair_release; + /**< Release a queue pair. */ + cryptodev_queue_pair_count_t queue_pair_count; + /**< Get count of the queue pairs. */ + + cryptodev_sym_get_session_private_size_t sym_session_get_size; + /**< Return private session. */ + cryptodev_asym_get_session_private_size_t asym_session_get_size; + /**< Return asym session private size. */ + cryptodev_sym_configure_session_t sym_session_configure; + /**< Configure a Crypto session. */ + cryptodev_asym_configure_session_t asym_session_configure; + /**< Configure asymmetric Crypto session. */ + cryptodev_sym_free_session_t sym_session_clear; + /**< Clear a Crypto sessions private data. */ + cryptodev_asym_free_session_t asym_session_clear; + /**< Clear a Crypto sessions private data. */ +}; + + +/** + * Function for internal use by dummy drivers primarily, e.g. ring-based + * driver. + * Allocates a new cryptodev slot for an crypto device and returns the pointer + * to that slot for the driver to use. + * + * @param name Unique identifier name for each device + * @param socket_id Socket to allocate resources on. + * @return + * - Slot in the rte_dev_devices array for a new device; + */ +struct rte_cryptodev * +rte_cryptodev_pmd_allocate(const char *name, int socket_id); + +/** + * Function for internal use by dummy drivers primarily, e.g. ring-based + * driver. + * Release the specified cryptodev device. + * + * @param cryptodev + * The *cryptodev* pointer is the address of the *rte_cryptodev* structure. + * @return + * - 0 on success, negative on error + */ +extern int +rte_cryptodev_pmd_release_device(struct rte_cryptodev *cryptodev); + + +/** + * @internal + * + * PMD assist function to parse initialisation arguments for crypto driver + * when creating a new crypto PMD device instance. + * + * PMD driver should set default values for that PMD before calling function, + * these default values will be over-written with successfully parsed values + * from args string. + * + * @param params parsed PMD initialisation parameters + * @param args input argument string to parse + * + * @return + * - 0 on success + * - errno on failure + */ +int +rte_cryptodev_pmd_parse_input_args( + struct rte_cryptodev_pmd_init_params *params, + const char *args); + +/** + * @internal + * + * PMD assist function to provide boiler plate code for crypto driver to create + * and allocate resources for a new crypto PMD device instance. + * + * @param name crypto device name. + * @param device base device instance + * @param params PMD initialisation parameters + * + * @return + * - crypto device instance on success + * - NULL on creation failure + */ +struct rte_cryptodev * +rte_cryptodev_pmd_create(const char *name, + struct rte_device *device, + struct rte_cryptodev_pmd_init_params *params); + +/** + * @internal + * + * PMD assist function to provide boiler plate code for crypto driver to + * destroy and free resources associated with a crypto PMD device instance. + * + * @param cryptodev crypto device handle. + * + * @return + * - 0 on success + * - errno on failure + */ +int +rte_cryptodev_pmd_destroy(struct rte_cryptodev *cryptodev); + +/** + * Executes all the user application registered callbacks for the specific + * device. + * * + * @param dev Pointer to cryptodev struct + * @param event Crypto device interrupt event type. 
+ * + * @return + * void + */ +void rte_cryptodev_pmd_callback_process(struct rte_cryptodev *dev, + enum rte_cryptodev_event_type event); + +/** + * @internal + * Create unique device name + */ +int +rte_cryptodev_pmd_create_dev_name(char *name, const char *dev_name_prefix); + +/** + * @internal + * Allocate Cryptodev driver. + * + * @param crypto_drv + * Pointer to cryptodev_driver. + * @param drv + * Pointer to rte_driver. + * + * @return + * The driver type identifier + */ +uint8_t rte_cryptodev_allocate_driver(struct cryptodev_driver *crypto_drv, + const struct rte_driver *drv); + + +#define RTE_PMD_REGISTER_CRYPTO_DRIVER(crypto_drv, drv, driver_id)\ +RTE_INIT(init_ ##driver_id)\ +{\ + driver_id = rte_cryptodev_allocate_driver(&crypto_drv, &(drv));\ +} + +static inline void * +get_sym_session_private_data(const struct rte_cryptodev_sym_session *sess, + uint8_t driver_id) { + return sess->sess_private_data[driver_id]; +} + +static inline void +set_sym_session_private_data(struct rte_cryptodev_sym_session *sess, + uint8_t driver_id, void *private_data) +{ + sess->sess_private_data[driver_id] = private_data; +} + +static inline void * +get_asym_session_private_data(const struct rte_cryptodev_asym_session *sess, + uint8_t driver_id) { + return sess->sess_private_data[driver_id]; +} + +static inline void +set_asym_session_private_data(struct rte_cryptodev_asym_session *sess, + uint8_t driver_id, void *private_data) +{ + sess->sess_private_data[driver_id] = private_data; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CRYPTODEV_PMD_H_ */ diff --git a/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_version.map b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_version.map new file mode 100644 index 00000000..7ca00735 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_cryptodev/rte_cryptodev_version.map @@ -0,0 +1,108 @@ +DPDK_16.04 { + global: + + rte_cryptodevs; + rte_cryptodev_callback_register; + rte_cryptodev_callback_unregister; + rte_cryptodev_close; + rte_cryptodev_count; + rte_cryptodev_configure; + rte_cryptodev_get_dev_id; + rte_cryptodev_get_feature_name; + rte_cryptodev_info_get; + rte_cryptodev_pmd_allocate; + rte_cryptodev_pmd_callback_process; + rte_cryptodev_pmd_release_device; + rte_cryptodev_sym_session_create; + rte_cryptodev_sym_session_free; + rte_cryptodev_socket_id; + rte_cryptodev_start; + rte_cryptodev_stats_get; + rte_cryptodev_stats_reset; + rte_cryptodev_stop; + rte_cryptodev_queue_pair_count; + rte_cryptodev_queue_pair_setup; + rte_crypto_op_pool_create; + + local: *; +}; + +DPDK_17.02 { + global: + + rte_cryptodev_devices_get; + rte_cryptodev_pmd_create_dev_name; + rte_cryptodev_pmd_get_dev; + rte_cryptodev_pmd_get_named_dev; + rte_cryptodev_pmd_is_valid_dev; + rte_cryptodev_sym_capability_check_auth; + rte_cryptodev_sym_capability_check_cipher; + rte_cryptodev_sym_capability_get; + rte_crypto_auth_algorithm_strings; + rte_crypto_auth_operation_strings; + rte_crypto_cipher_algorithm_strings; + rte_crypto_cipher_operation_strings; + +} DPDK_16.04; + +DPDK_17.05 { + global: + + rte_cryptodev_get_auth_algo_enum; + rte_cryptodev_get_cipher_algo_enum; + +} DPDK_17.02; + +DPDK_17.08 { + global: + + rte_cryptodev_allocate_driver; + rte_cryptodev_device_count_by_driver; + rte_cryptodev_driver_id_get; + rte_cryptodev_driver_name_get; + rte_cryptodev_get_aead_algo_enum; + rte_cryptodev_sym_capability_check_aead; + rte_cryptodev_sym_session_init; + rte_cryptodev_sym_session_clear; + rte_crypto_aead_algorithm_strings; + rte_crypto_aead_operation_strings; + +} DPDK_17.05; + 
+DPDK_17.11 { + global: + + rte_cryptodev_get_sec_ctx; + rte_cryptodev_name_get; + rte_cryptodev_pmd_create; + rte_cryptodev_pmd_destroy; + rte_cryptodev_pmd_parse_input_args; + +} DPDK_17.08; + +DPDK_18.05 { + global: + + rte_cryptodev_sym_get_header_session_size; + rte_cryptodev_sym_get_private_session_size; + +} DPDK_17.11; + +EXPERIMENTAL { + global: + + rte_cryptodev_asym_capability_get; + rte_cryptodev_asym_get_header_session_size; + rte_cryptodev_asym_get_private_session_size; + rte_cryptodev_asym_get_xform_enum; + rte_cryptodev_asym_session_clear; + rte_cryptodev_asym_session_create; + rte_cryptodev_asym_session_free; + rte_cryptodev_asym_session_init; + rte_cryptodev_asym_xform_capability_check_modlen; + rte_cryptodev_asym_xform_capability_check_optype; + rte_cryptodev_sym_session_get_user_data; + rte_cryptodev_sym_session_set_user_data; + rte_crypto_asym_op_strings; + rte_crypto_asym_xform_strings; +}; diff --git a/src/spdk/dpdk/lib/librte_distributor/Makefile b/src/spdk/dpdk/lib/librte_distributor/Makefile new file mode 100644 index 00000000..0ef80dcf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_distributor.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal -lrte_mbuf -lrte_ethdev + +EXPORT_MAP := rte_distributor_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) := rte_distributor_v20.c +SRCS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += rte_distributor.c +ifeq ($(CONFIG_RTE_ARCH_X86),y) +SRCS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += rte_distributor_match_sse.c +else +SRCS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += rte_distributor_match_generic.c +endif + + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)-include := rte_distributor.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_distributor/meson.build b/src/spdk/dpdk/lib/librte_distributor/meson.build new file mode 100644 index 00000000..dba7e3b2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/meson.build @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_distributor.c', 'rte_distributor_v20.c') +if arch_subdir == 'x86' + sources += files('rte_distributor_match_sse.c') +else + sources += files('rte_distributor_match_generic.c') +endif +headers = files('rte_distributor.h') +deps += ['mbuf'] diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor.c b/src/spdk/dpdk/lib/librte_distributor/rte_distributor.c new file mode 100644 index 00000000..d5059837 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor.c @@ -0,0 +1,658 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_distributor_private.h" +#include "rte_distributor.h" +#include "rte_distributor_v20.h" +#include "rte_distributor_v1705.h" + +TAILQ_HEAD(rte_dist_burst_list, rte_distributor); + +static struct rte_tailq_elem rte_dist_burst_tailq = { + .name = "RTE_DIST_BURST", +}; +EAL_REGISTER_TAILQ(rte_dist_burst_tailq) + +/**** APIs called by workers ****/ + +/**** Burst Packet APIs called by workers ****/ + +void +rte_distributor_request_pkt_v1705(struct rte_distributor *d, + unsigned int worker_id, 
struct rte_mbuf **oldpkt, + unsigned int count) +{ + struct rte_distributor_buffer *buf = &(d->bufs[worker_id]); + unsigned int i; + + volatile int64_t *retptr64; + + if (unlikely(d->alg_type == RTE_DIST_ALG_SINGLE)) { + rte_distributor_request_pkt_v20(d->d_v20, + worker_id, oldpkt[0]); + return; + } + + retptr64 = &(buf->retptr64[0]); + /* Spin while handshake bits are set (scheduler clears it) */ + while (unlikely(*retptr64 & RTE_DISTRIB_GET_BUF)) { + rte_pause(); + uint64_t t = rte_rdtsc()+100; + + while (rte_rdtsc() < t) + rte_pause(); + } + + /* + * OK, if we've got here, then the scheduler has just cleared the + * handshake bits. Populate the retptrs with returning packets. + */ + + for (i = count; i < RTE_DIST_BURST_SIZE; i++) + buf->retptr64[i] = 0; + + /* Set Return bit for each packet returned */ + for (i = count; i-- > 0; ) + buf->retptr64[i] = + (((int64_t)(uintptr_t)(oldpkt[i])) << + RTE_DISTRIB_FLAG_BITS) | RTE_DISTRIB_RETURN_BUF; + + /* + * Finally, set the GET_BUF to signal to distributor that cache + * line is ready for processing + */ + *retptr64 |= RTE_DISTRIB_GET_BUF; +} +BIND_DEFAULT_SYMBOL(rte_distributor_request_pkt, _v1705, 17.05); +MAP_STATIC_SYMBOL(void rte_distributor_request_pkt(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **oldpkt, + unsigned int count), + rte_distributor_request_pkt_v1705); + +int +rte_distributor_poll_pkt_v1705(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **pkts) +{ + struct rte_distributor_buffer *buf = &d->bufs[worker_id]; + uint64_t ret; + int count = 0; + unsigned int i; + + if (unlikely(d->alg_type == RTE_DIST_ALG_SINGLE)) { + pkts[0] = rte_distributor_poll_pkt_v20(d->d_v20, worker_id); + return (pkts[0]) ? 1 : 0; + } + + /* If bit is set, return */ + if (buf->bufptr64[0] & RTE_DISTRIB_GET_BUF) + return -1; + + /* since bufptr64 is signed, this should be an arithmetic shift */ + for (i = 0; i < RTE_DIST_BURST_SIZE; i++) { + if (likely(buf->bufptr64[i] & RTE_DISTRIB_VALID_BUF)) { + ret = buf->bufptr64[i] >> RTE_DISTRIB_FLAG_BITS; + pkts[count++] = (struct rte_mbuf *)((uintptr_t)(ret)); + } + } + + /* + * so now we've got the contents of the cacheline into an array of + * mbuf pointers, so toggle the bit so scheduler can start working + * on the next cacheline while we're working. + */ + buf->bufptr64[0] |= RTE_DISTRIB_GET_BUF; + + return count; +} +BIND_DEFAULT_SYMBOL(rte_distributor_poll_pkt, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_distributor_poll_pkt(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **pkts), + rte_distributor_poll_pkt_v1705); + +int +rte_distributor_get_pkt_v1705(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **pkts, + struct rte_mbuf **oldpkt, unsigned int return_count) +{ + int count; + + if (unlikely(d->alg_type == RTE_DIST_ALG_SINGLE)) { + if (return_count <= 1) { + pkts[0] = rte_distributor_get_pkt_v20(d->d_v20, + worker_id, oldpkt[0]); + return (pkts[0]) ? 
1 : 0; + } else + return -EINVAL; + } + + rte_distributor_request_pkt(d, worker_id, oldpkt, return_count); + + count = rte_distributor_poll_pkt(d, worker_id, pkts); + while (count == -1) { + uint64_t t = rte_rdtsc() + 100; + + while (rte_rdtsc() < t) + rte_pause(); + + count = rte_distributor_poll_pkt(d, worker_id, pkts); + } + return count; +} +BIND_DEFAULT_SYMBOL(rte_distributor_get_pkt, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_distributor_get_pkt(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **pkts, + struct rte_mbuf **oldpkt, unsigned int return_count), + rte_distributor_get_pkt_v1705); + +int +rte_distributor_return_pkt_v1705(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **oldpkt, int num) +{ + struct rte_distributor_buffer *buf = &d->bufs[worker_id]; + unsigned int i; + + if (unlikely(d->alg_type == RTE_DIST_ALG_SINGLE)) { + if (num == 1) + return rte_distributor_return_pkt_v20(d->d_v20, + worker_id, oldpkt[0]); + else + return -EINVAL; + } + + for (i = 0; i < RTE_DIST_BURST_SIZE; i++) + /* Switch off the return bit first */ + buf->retptr64[i] &= ~RTE_DISTRIB_RETURN_BUF; + + for (i = num; i-- > 0; ) + buf->retptr64[i] = (((int64_t)(uintptr_t)oldpkt[i]) << + RTE_DISTRIB_FLAG_BITS) | RTE_DISTRIB_RETURN_BUF; + + /* set the GET_BUF but even if we got no returns */ + buf->retptr64[0] |= RTE_DISTRIB_GET_BUF; + + return 0; +} +BIND_DEFAULT_SYMBOL(rte_distributor_return_pkt, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_distributor_return_pkt(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **oldpkt, int num), + rte_distributor_return_pkt_v1705); + +/**** APIs called on distributor core ***/ + +/* stores a packet returned from a worker inside the returns array */ +static inline void +store_return(uintptr_t oldbuf, struct rte_distributor *d, + unsigned int *ret_start, unsigned int *ret_count) +{ + if (!oldbuf) + return; + /* store returns in a circular buffer */ + d->returns.mbufs[(*ret_start + *ret_count) & RTE_DISTRIB_RETURNS_MASK] + = (void *)oldbuf; + *ret_start += (*ret_count == RTE_DISTRIB_RETURNS_MASK); + *ret_count += (*ret_count != RTE_DISTRIB_RETURNS_MASK); +} + +/* + * Match then flow_ids (tags) of the incoming packets to the flow_ids + * of the inflight packets (both inflight on the workers and in each worker + * backlog). This will then allow us to pin those packets to the relevant + * workers to give us our atomic flow pinning. + */ +void +find_match_scalar(struct rte_distributor *d, + uint16_t *data_ptr, + uint16_t *output_ptr) +{ + struct rte_distributor_backlog *bl; + uint16_t i, j, w; + + /* + * Function overview: + * 1. Loop through all worker ID's + * 2. Compare the current inflights to the incoming tags + * 3. Compare the current backlog to the incoming tags + * 4. Add any matches to the output + */ + + for (j = 0 ; j < RTE_DIST_BURST_SIZE; j++) + output_ptr[j] = 0; + + for (i = 0; i < d->num_workers; i++) { + bl = &d->backlog[i]; + + for (j = 0; j < RTE_DIST_BURST_SIZE ; j++) + for (w = 0; w < RTE_DIST_BURST_SIZE; w++) + if (d->in_flight_tags[i][j] == data_ptr[w]) { + output_ptr[j] = i+1; + break; + } + for (j = 0; j < RTE_DIST_BURST_SIZE; j++) + for (w = 0; w < RTE_DIST_BURST_SIZE; w++) + if (bl->tags[j] == data_ptr[w]) { + output_ptr[j] = i+1; + break; + } + } + + /* + * At this stage, the output contains 8 16-bit values, with + * each non-zero value containing the worker ID on which the + * corresponding flow is pinned to. 
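+ *
+ * For illustration: the values are stored as worker index + 1, so an
+ * output entry of 3 pins the corresponding packet's flow to the worker
+ * with index 2, while an entry of 0 means the flow is not currently in
+ * flight or backlogged and the caller may assign it to any worker.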
+ */ +} + + +/* + * When the handshake bits indicate that there are packets coming + * back from the worker, this function is called to copy and store + * the valid returned pointers (store_return). + */ +static unsigned int +handle_returns(struct rte_distributor *d, unsigned int wkr) +{ + struct rte_distributor_buffer *buf = &(d->bufs[wkr]); + uintptr_t oldbuf; + unsigned int ret_start = d->returns.start, + ret_count = d->returns.count; + unsigned int count = 0; + unsigned int i; + + if (buf->retptr64[0] & RTE_DISTRIB_GET_BUF) { + for (i = 0; i < RTE_DIST_BURST_SIZE; i++) { + if (buf->retptr64[i] & RTE_DISTRIB_RETURN_BUF) { + oldbuf = ((uintptr_t)(buf->retptr64[i] >> + RTE_DISTRIB_FLAG_BITS)); + /* store returns in a circular buffer */ + store_return(oldbuf, d, &ret_start, &ret_count); + count++; + buf->retptr64[i] &= ~RTE_DISTRIB_RETURN_BUF; + } + } + d->returns.start = ret_start; + d->returns.count = ret_count; + /* Clear for the worker to populate with more returns */ + buf->retptr64[0] = 0; + } + return count; +} + +/* + * This function releases a burst (cache line) to a worker. + * It is called from the process function when a cacheline is + * full to make room for more packets for that worker, or when + * all packets have been assigned to bursts and need to be flushed + * to the workers. + * It also needs to wait for any outstanding packets from the worker + * before sending out new packets. + */ +static unsigned int +release(struct rte_distributor *d, unsigned int wkr) +{ + struct rte_distributor_buffer *buf = &(d->bufs[wkr]); + unsigned int i; + + while (!(d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF)) + rte_pause(); + + handle_returns(d, wkr); + + buf->count = 0; + + for (i = 0; i < d->backlog[wkr].count; i++) { + d->bufs[wkr].bufptr64[i] = d->backlog[wkr].pkts[i] | + RTE_DISTRIB_GET_BUF | RTE_DISTRIB_VALID_BUF; + d->in_flight_tags[wkr][i] = d->backlog[wkr].tags[i]; + } + buf->count = i; + for ( ; i < RTE_DIST_BURST_SIZE ; i++) { + buf->bufptr64[i] = RTE_DISTRIB_GET_BUF; + d->in_flight_tags[wkr][i] = 0; + } + + d->backlog[wkr].count = 0; + + /* Clear the GET bit */ + buf->bufptr64[0] &= ~RTE_DISTRIB_GET_BUF; + return buf->count; + +} + + +/* process a set of packets to distribute them to workers */ +int +rte_distributor_process_v1705(struct rte_distributor *d, + struct rte_mbuf **mbufs, unsigned int num_mbufs) +{ + unsigned int next_idx = 0; + static unsigned int wkr; + struct rte_mbuf *next_mb = NULL; + int64_t next_value = 0; + uint16_t new_tag = 0; + uint16_t flows[RTE_DIST_BURST_SIZE] __rte_cache_aligned; + unsigned int i, j, w, wid; + + if (d->alg_type == RTE_DIST_ALG_SINGLE) { + /* Call the old API */ + return rte_distributor_process_v20(d->d_v20, mbufs, num_mbufs); + } + + if (unlikely(num_mbufs == 0)) { + /* Flush out all non-full cache-lines to workers. 
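+ * A call with num_mbufs == 0 therefore acts as a partial flush: every
+ * worker whose GET_BUF handshake bit is set is released and its returned
+ * packets are collected.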
*/ + for (wid = 0 ; wid < d->num_workers; wid++) { + if (d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF) { + release(d, wid); + handle_returns(d, wid); + } + } + return 0; + } + + while (next_idx < num_mbufs) { + uint16_t matches[RTE_DIST_BURST_SIZE]; + unsigned int pkts; + + if (d->bufs[wkr].bufptr64[0] & RTE_DISTRIB_GET_BUF) + d->bufs[wkr].count = 0; + + if ((num_mbufs - next_idx) < RTE_DIST_BURST_SIZE) + pkts = num_mbufs - next_idx; + else + pkts = RTE_DIST_BURST_SIZE; + + for (i = 0; i < pkts; i++) { + if (mbufs[next_idx + i]) { + /* flows have to be non-zero */ + flows[i] = mbufs[next_idx + i]->hash.usr | 1; + } else + flows[i] = 0; + } + for (; i < RTE_DIST_BURST_SIZE; i++) + flows[i] = 0; + + switch (d->dist_match_fn) { + case RTE_DIST_MATCH_VECTOR: + find_match_vec(d, &flows[0], &matches[0]); + break; + default: + find_match_scalar(d, &flows[0], &matches[0]); + } + + /* + * Matches array now contain the intended worker ID (+1) of + * the incoming packets. Any zeroes need to be assigned + * workers. + */ + + for (j = 0; j < pkts; j++) { + + next_mb = mbufs[next_idx++]; + next_value = (((int64_t)(uintptr_t)next_mb) << + RTE_DISTRIB_FLAG_BITS); + /* + * User is advocated to set tag value for each + * mbuf before calling rte_distributor_process. + * User defined tags are used to identify flows, + * or sessions. + */ + /* flows MUST be non-zero */ + new_tag = (uint16_t)(next_mb->hash.usr) | 1; + + /* + * Uncommenting the next line will cause the find_match + * function to be optimized out, making this function + * do parallel (non-atomic) distribution + */ + /* matches[j] = 0; */ + + if (matches[j]) { + struct rte_distributor_backlog *bl = + &d->backlog[matches[j]-1]; + if (unlikely(bl->count == + RTE_DIST_BURST_SIZE)) { + release(d, matches[j]-1); + } + + /* Add to worker that already has flow */ + unsigned int idx = bl->count++; + + bl->tags[idx] = new_tag; + bl->pkts[idx] = next_value; + + } else { + struct rte_distributor_backlog *bl = + &d->backlog[wkr]; + if (unlikely(bl->count == + RTE_DIST_BURST_SIZE)) { + release(d, wkr); + } + + /* Add to current worker worker */ + unsigned int idx = bl->count++; + + bl->tags[idx] = new_tag; + bl->pkts[idx] = next_value; + /* + * Now that we've just added an unpinned flow + * to a worker, we need to ensure that all + * other packets with that same flow will go + * to the same worker in this burst. + */ + for (w = j; w < pkts; w++) + if (flows[w] == new_tag) + matches[w] = wkr+1; + } + } + wkr++; + if (wkr >= d->num_workers) + wkr = 0; + } + + /* Flush out all non-full cache-lines to workers. */ + for (wid = 0 ; wid < d->num_workers; wid++) + if ((d->bufs[wid].bufptr64[0] & RTE_DISTRIB_GET_BUF)) + release(d, wid); + + return num_mbufs; +} +BIND_DEFAULT_SYMBOL(rte_distributor_process, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_distributor_process(struct rte_distributor *d, + struct rte_mbuf **mbufs, unsigned int num_mbufs), + rte_distributor_process_v1705); + +/* return to the caller, packets returned from workers */ +int +rte_distributor_returned_pkts_v1705(struct rte_distributor *d, + struct rte_mbuf **mbufs, unsigned int max_mbufs) +{ + struct rte_distributor_returned_pkts *returns = &d->returns; + unsigned int retval = (max_mbufs < returns->count) ? 
+ max_mbufs : returns->count; + unsigned int i; + + if (d->alg_type == RTE_DIST_ALG_SINGLE) { + /* Call the old API */ + return rte_distributor_returned_pkts_v20(d->d_v20, + mbufs, max_mbufs); + } + + for (i = 0; i < retval; i++) { + unsigned int idx = (returns->start + i) & + RTE_DISTRIB_RETURNS_MASK; + + mbufs[i] = returns->mbufs[idx]; + } + returns->start += i; + returns->count -= i; + + return retval; +} +BIND_DEFAULT_SYMBOL(rte_distributor_returned_pkts, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_distributor_returned_pkts(struct rte_distributor *d, + struct rte_mbuf **mbufs, unsigned int max_mbufs), + rte_distributor_returned_pkts_v1705); + +/* + * Return the number of packets in-flight in a distributor, i.e. packets + * being worked on or queued up in a backlog. + */ +static inline unsigned int +total_outstanding(const struct rte_distributor *d) +{ + unsigned int wkr, total_outstanding = 0; + + for (wkr = 0; wkr < d->num_workers; wkr++) + total_outstanding += d->backlog[wkr].count; + + return total_outstanding; +} + +/* + * Flush the distributor, so that there are no outstanding packets in flight or + * queued up. + */ +int +rte_distributor_flush_v1705(struct rte_distributor *d) +{ + unsigned int flushed; + unsigned int wkr; + + if (d->alg_type == RTE_DIST_ALG_SINGLE) { + /* Call the old API */ + return rte_distributor_flush_v20(d->d_v20); + } + + flushed = total_outstanding(d); + + while (total_outstanding(d) > 0) + rte_distributor_process(d, NULL, 0); + + /* + * Send empty burst to all workers to allow them to exit + * gracefully, should they need to. + */ + rte_distributor_process(d, NULL, 0); + + for (wkr = 0; wkr < d->num_workers; wkr++) + handle_returns(d, wkr); + + return flushed; +} +BIND_DEFAULT_SYMBOL(rte_distributor_flush, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_distributor_flush(struct rte_distributor *d), + rte_distributor_flush_v1705); + +/* clears the internal returns array in the distributor */ +void +rte_distributor_clear_returns_v1705(struct rte_distributor *d) +{ + unsigned int wkr; + + if (d->alg_type == RTE_DIST_ALG_SINGLE) { + /* Call the old API */ + rte_distributor_clear_returns_v20(d->d_v20); + return; + } + + /* throw away returns, so workers can exit */ + for (wkr = 0; wkr < d->num_workers; wkr++) + d->bufs[wkr].retptr64[0] = 0; +} +BIND_DEFAULT_SYMBOL(rte_distributor_clear_returns, _v1705, 17.05); +MAP_STATIC_SYMBOL(void rte_distributor_clear_returns(struct rte_distributor *d), + rte_distributor_clear_returns_v1705); + +/* creates a distributor instance */ +struct rte_distributor * +rte_distributor_create_v1705(const char *name, + unsigned int socket_id, + unsigned int num_workers, + unsigned int alg_type) +{ + struct rte_distributor *d; + struct rte_dist_burst_list *dist_burst_list; + char mz_name[RTE_MEMZONE_NAMESIZE]; + const struct rte_memzone *mz; + unsigned int i; + + /* TODO Reorganise function properly around RTE_DIST_ALG_SINGLE/BURST */ + + /* compilation-time checks */ + RTE_BUILD_BUG_ON((sizeof(*d) & RTE_CACHE_LINE_MASK) != 0); + RTE_BUILD_BUG_ON((RTE_DISTRIB_MAX_WORKERS & 7) != 0); + + if (alg_type == RTE_DIST_ALG_SINGLE) { + d = malloc(sizeof(struct rte_distributor)); + if (d == NULL) { + rte_errno = ENOMEM; + return NULL; + } + d->d_v20 = rte_distributor_create_v20(name, + socket_id, num_workers); + if (d->d_v20 == NULL) { + free(d); + /* rte_errno will have been set */ + return NULL; + } + d->alg_type = alg_type; + return d; + } + + if (name == NULL || num_workers >= RTE_DISTRIB_MAX_WORKERS) { + rte_errno = EINVAL; + return NULL; + } + + 
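+ /*
+ * Burst-mode setup: reserve a cache-line aligned memzone for the instance,
+ * select the flow-matching implementation (vector on x86, scalar
+ * otherwise) and register the distributor on the global tailq.
+ */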
snprintf(mz_name, sizeof(mz_name), RTE_DISTRIB_PREFIX"%s", name); + mz = rte_memzone_reserve(mz_name, sizeof(*d), socket_id, NO_FLAGS); + if (mz == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + d = mz->addr; + snprintf(d->name, sizeof(d->name), "%s", name); + d->num_workers = num_workers; + d->alg_type = alg_type; + + d->dist_match_fn = RTE_DIST_MATCH_SCALAR; +#if defined(RTE_ARCH_X86) + d->dist_match_fn = RTE_DIST_MATCH_VECTOR; +#endif + + /* + * Set up the backlog tags so they're pointing at the second cache + * line for performance during flow matching + */ + for (i = 0 ; i < num_workers ; i++) + d->backlog[i].tags = &d->in_flight_tags[i][RTE_DIST_BURST_SIZE]; + + dist_burst_list = RTE_TAILQ_CAST(rte_dist_burst_tailq.head, + rte_dist_burst_list); + + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_INSERT_TAIL(dist_burst_list, d, next); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return d; +} +BIND_DEFAULT_SYMBOL(rte_distributor_create, _v1705, 17.05); +MAP_STATIC_SYMBOL(struct rte_distributor *rte_distributor_create( + const char *name, unsigned int socket_id, + unsigned int num_workers, unsigned int alg_type), + rte_distributor_create_v1705); diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor.h b/src/spdk/dpdk/lib/librte_distributor/rte_distributor.h new file mode 100644 index 00000000..327c0c4a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_DISTRIBUTOR_H_ +#define _RTE_DISTRIBUTOR_H_ + +/** + * @file + * RTE distributor + * + * The distributor is a component which is designed to pass packets + * one-at-a-time to workers, with dynamic load balancing. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Type of distribution (burst/single) */ +enum rte_distributor_alg_type { + RTE_DIST_ALG_BURST = 0, + RTE_DIST_ALG_SINGLE, + RTE_DIST_NUM_ALG_TYPES +}; + +struct rte_distributor; +struct rte_mbuf; + +/** + * Function to create a new distributor instance + * + * Reserves the memory needed for the distributor operation and + * initializes the distributor to work with the configured number of workers. + * + * @param name + * The name to be given to the distributor instance. + * @param socket_id + * The NUMA node on which the memory is to be allocated + * @param num_workers + * The maximum number of workers that will request packets from this + * distributor + * @param alg_type + * Call the legacy API, or use the new burst API. legacy uses 32-bit + * flow ID, and works on a single packet at a time. Latest uses 15- + * bit flow ID and works on up to 8 packets at a time to workers. + * @return + * The newly created distributor instance + */ +struct rte_distributor * +rte_distributor_create(const char *name, unsigned int socket_id, + unsigned int num_workers, + unsigned int alg_type); + +/* *** APIS to be called on the distributor lcore *** */ +/* + * The following APIs are the public APIs which are designed for use on a + * single lcore which acts as the distributor lcore for a given distributor + * instance. These functions cannot be called on multiple cores simultaneously + * without using locking to protect access to the internals of the distributor. + * + * NOTE: a given lcore cannot act as both a distributor lcore and a worker lcore + * for the same distributor instance, otherwise deadlock will result. + */ + +/** + * Process a set of packets by distributing them among workers that request + * packets. 
The distributor will ensure that no two packets that have the + * same flow id, or tag, in the mbuf will be processed on different cores at + * the same time. + * + * The user is advocated to set tag for each mbuf before calling this function. + * If user doesn't set the tag, the tag value can be various values depending on + * driver implementation and configuration. + * + * This is not multi-thread safe and should only be called on a single lcore. + * + * @param d + * The distributor instance to be used + * @param mbufs + * The mbufs to be distributed + * @param num_mbufs + * The number of mbufs in the mbufs array + * @return + * The number of mbufs processed. + */ +int +rte_distributor_process(struct rte_distributor *d, + struct rte_mbuf **mbufs, unsigned int num_mbufs); + +/** + * Get a set of mbufs that have been returned to the distributor by workers + * + * This should only be called on the same lcore as rte_distributor_process() + * + * @param d + * The distributor instance to be used + * @param mbufs + * The mbufs pointer array to be filled in + * @param max_mbufs + * The size of the mbufs array + * @return + * The number of mbufs returned in the mbufs array. + */ +int +rte_distributor_returned_pkts(struct rte_distributor *d, + struct rte_mbuf **mbufs, unsigned int max_mbufs); + +/** + * Flush the distributor component, so that there are no in-flight or + * backlogged packets awaiting processing + * + * This should only be called on the same lcore as rte_distributor_process() + * + * @param d + * The distributor instance to be used + * @return + * The number of queued/in-flight packets that were completed by this call. + */ +int +rte_distributor_flush(struct rte_distributor *d); + +/** + * Clears the array of returned packets used as the source for the + * rte_distributor_returned_pkts() API call. + * + * This should only be called on the same lcore as rte_distributor_process() + * + * @param d + * The distributor instance to be used + */ +void +rte_distributor_clear_returns(struct rte_distributor *d); + +/* *** APIS to be called on the worker lcores *** */ +/* + * The following APIs are the public APIs which are designed for use on + * multiple lcores which act as workers for a distributor. Each lcore should use + * a unique worker id when requesting packets. + * + * NOTE: a given lcore cannot act as both a distributor lcore and a worker lcore + * for the same distributor instance, otherwise deadlock will result. + */ + +/** + * API called by a worker to get new packets to process. Any previous packets + * given to the worker is assumed to have completed processing, and may be + * optionally returned to the distributor via the oldpkt parameter. + * + * @param d + * The distributor instance to be used + * @param worker_id + * The worker instance number to use - must be less that num_workers passed + * at distributor creation time. 
+ * @param pkts + * The mbufs pointer array to be filled in (up to 8 packets) + * @param oldpkt + * The previous packet, if any, being processed by the worker + * @param retcount + * The number of packets being returned + * + * @return + * The number of packets in the pkts array + */ +int +rte_distributor_get_pkt(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **pkts, + struct rte_mbuf **oldpkt, unsigned int retcount); + +/** + * API called by a worker to return a completed packet without requesting a + * new packet, for example, because a worker thread is shutting down + * + * @param d + * The distributor instance to be used + * @param worker_id + * The worker instance number to use - must be less that num_workers passed + * at distributor creation time. + * @param oldpkt + * The previous packets being processed by the worker + * @param num + * The number of packets in the oldpkt array + */ +int +rte_distributor_return_pkt(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **oldpkt, int num); + +/** + * API called by a worker to request a new packet to process. + * Any previous packet given to the worker is assumed to have completed + * processing, and may be optionally returned to the distributor via + * the oldpkt parameter. + * Unlike rte_distributor_get_pkt_burst(), this function does not wait for a + * new packet to be provided by the distributor. + * + * NOTE: after calling this function, rte_distributor_poll_pkt_burst() should + * be used to poll for the packet requested. The rte_distributor_get_pkt_burst() + * API should *not* be used to try and retrieve the new packet. + * + * @param d + * The distributor instance to be used + * @param worker_id + * The worker instance number to use - must be less that num_workers passed + * at distributor creation time. + * @param oldpkt + * The returning packets, if any, processed by the worker + * @param count + * The number of returning packets + */ +void +rte_distributor_request_pkt(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **oldpkt, + unsigned int count); + +/** + * API called by a worker to check for a new packet that was previously + * requested by a call to rte_distributor_request_pkt(). It does not wait + * for the new packet to be available, but returns NULL if the request has + * not yet been fulfilled by the distributor. + * + * @param d + * The distributor instance to be used + * @param worker_id + * The worker instance number to use - must be less that num_workers passed + * at distributor creation time. + * @param mbufs + * The array of mbufs being given to the worker + * + * @return + * The number of packets being given to the worker thread, zero if no + * packet is yet available. 
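+ *
+ * An illustrative request/poll sketch for a burst-mode (RTE_DIST_ALG_BURST)
+ * distributor with no packets to hand back; handle_burst() stands in for
+ * application-defined processing, and the negative-return retry follows the
+ * same convention used internally by rte_distributor_get_pkt():
+ *
+ *   struct rte_mbuf *pkts[8];
+ *   int n;
+ *
+ *   rte_distributor_request_pkt(d, worker_id, NULL, 0);
+ *   while ((n = rte_distributor_poll_pkt(d, worker_id, pkts)) < 0)
+ *       rte_pause();
+ *   handle_burst(pkts, n);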
+ */ +int +rte_distributor_poll_pkt(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **mbufs); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor_match_generic.c b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_match_generic.c new file mode 100644 index 00000000..6b681032 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_match_generic.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include "rte_distributor_private.h" +#include "rte_distributor.h" + +void +find_match_vec(struct rte_distributor *d, + uint16_t *data_ptr, + uint16_t *output_ptr) +{ + find_match_scalar(d, data_ptr, output_ptr); +} diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor_match_sse.c b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_match_sse.c new file mode 100644 index 00000000..f7d16523 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_match_sse.c @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include "rte_distributor_private.h" +#include "rte_distributor.h" +#include "smmintrin.h" +#include "nmmintrin.h" + + +void +find_match_vec(struct rte_distributor *d, + uint16_t *data_ptr, + uint16_t *output_ptr) +{ + /* Setup */ + __m128i incoming_fids; + __m128i inflight_fids; + __m128i preflight_fids; + __m128i wkr; + __m128i mask1; + __m128i mask2; + __m128i output; + struct rte_distributor_backlog *bl; + uint16_t i; + + /* + * Function overview: + * 2. Loop through all worker ID's + * 2a. Load the current inflights for that worker into an xmm reg + * 2b. Load the current backlog for that worker into an xmm reg + * 2c. use cmpestrm to intersect flow_ids with backlog and inflights + * 2d. Add any matches to the output + * 3. Write the output xmm (matching worker ids). + */ + + + output = _mm_set1_epi16(0); + incoming_fids = _mm_load_si128((__m128i *)data_ptr); + + for (i = 0; i < d->num_workers; i++) { + bl = &d->backlog[i]; + + inflight_fids = + _mm_load_si128((__m128i *)&(d->in_flight_tags[i])); + preflight_fids = + _mm_load_si128((__m128i *)(bl->tags)); + + /* + * Any incoming_fid that exists anywhere in inflight_fids will + * have 0xffff in same position of the mask as the incoming fid + * Example (shortened to bytes for brevity): + * incoming_fids 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 + * inflight_fids 0x03 0x05 0x07 0x00 0x00 0x00 0x00 0x00 + * mask 0x00 0x00 0xff 0x00 0xff 0x00 0xff 0x00 + */ + + mask1 = _mm_cmpestrm(inflight_fids, 8, incoming_fids, 8, + _SIDD_UWORD_OPS | + _SIDD_CMP_EQUAL_ANY | + _SIDD_UNIT_MASK); + mask2 = _mm_cmpestrm(preflight_fids, 8, incoming_fids, 8, + _SIDD_UWORD_OPS | + _SIDD_CMP_EQUAL_ANY | + _SIDD_UNIT_MASK); + + mask1 = _mm_or_si128(mask1, mask2); + /* + * Now mask contains 0xffff where there's a match. + * Next we need to store the worker_id in the relevant position + * in the output. + */ + + wkr = _mm_set1_epi16(i+1); + mask1 = _mm_and_si128(mask1, wkr); + output = _mm_or_si128(mask1, output); + } + + /* + * At this stage, the output 128-bit contains 8 16-bit values, with + * each non-zero value containing the worker ID on which the + * corresponding flow is pinned to. 
+ */ + _mm_store_si128((__m128i *)output_ptr, output); +} diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor_private.h b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_private.h new file mode 100644 index 00000000..fce68c95 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_private.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_DIST_PRIV_H_ +#define _RTE_DIST_PRIV_H_ + +/** + * @file + * RTE distributor + * + * The distributor is a component which is designed to pass packets + * one-at-a-time to workers, with dynamic load balancing. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NO_FLAGS 0 +#define RTE_DISTRIB_PREFIX "DT_" + +/* + * We will use the bottom four bits of pointer for flags, shifting out + * the top four bits to make room (since a 64-bit pointer actually only uses + * 48 bits). An arithmetic-right-shift will then appropriately restore the + * original pointer value with proper sign extension into the top bits. + */ +#define RTE_DISTRIB_FLAG_BITS 4 +#define RTE_DISTRIB_FLAGS_MASK (0x0F) +#define RTE_DISTRIB_NO_BUF 0 /**< empty flags: no buffer requested */ +#define RTE_DISTRIB_GET_BUF (1) /**< worker requests a buffer, returns old */ +#define RTE_DISTRIB_RETURN_BUF (2) /**< worker returns a buffer, no request */ +#define RTE_DISTRIB_VALID_BUF (4) /**< set if bufptr contains ptr */ + +#define RTE_DISTRIB_BACKLOG_SIZE 8 +#define RTE_DISTRIB_BACKLOG_MASK (RTE_DISTRIB_BACKLOG_SIZE - 1) + +#define RTE_DISTRIB_MAX_RETURNS 128 +#define RTE_DISTRIB_RETURNS_MASK (RTE_DISTRIB_MAX_RETURNS - 1) + +/** + * Maximum number of workers allowed. + * Be aware of increasing the limit, becaus it is limited by how we track + * in-flight tags. See in_flight_bitmask and rte_distributor_process + */ +#define RTE_DISTRIB_MAX_WORKERS 64 + +#define RTE_DISTRIBUTOR_NAMESIZE 32 /**< Length of name for instance */ + +/** + * Buffer structure used to pass the pointer data between cores. This is cache + * line aligned, but to improve performance and prevent adjacent cache-line + * prefetches of buffers for other workers, e.g. when worker 1's buffer is on + * the next cache line to worker 0, we pad this out to three cache lines. + * Only 64-bits of the memory is actually used though. + */ +union rte_distributor_buffer_v20 { + volatile int64_t bufptr64; + char pad[RTE_CACHE_LINE_SIZE*3]; +} __rte_cache_aligned; + +/* + * Transfer up to 8 mbufs at a time to/from workers, and + * flow matching algorithm optimized for 8 flow IDs at a time + */ +#define RTE_DIST_BURST_SIZE 8 + +struct rte_distributor_backlog { + unsigned int start; + unsigned int count; + int64_t pkts[RTE_DIST_BURST_SIZE] __rte_cache_aligned; + uint16_t *tags; /* will point to second cacheline of inflights */ +} __rte_cache_aligned; + + +struct rte_distributor_returned_pkts { + unsigned int start; + unsigned int count; + struct rte_mbuf *mbufs[RTE_DISTRIB_MAX_RETURNS]; +}; + +struct rte_distributor_v20 { + TAILQ_ENTRY(rte_distributor_v20) next; /**< Next in list. */ + + char name[RTE_DISTRIBUTOR_NAMESIZE]; /**< Name of the ring. */ + unsigned int num_workers; /**< Number of workers polling */ + + uint32_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS]; + /**< Tracks the tag being processed per core */ + uint64_t in_flight_bitmask; + /**< on/off bits for in-flight tags. + * Note that if RTE_DISTRIB_MAX_WORKERS is larger than 64 then + * the bitmask has to expand. 
+ */ + + struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS]; + + union rte_distributor_buffer_v20 bufs[RTE_DISTRIB_MAX_WORKERS]; + + struct rte_distributor_returned_pkts returns; +}; + +/* All different signature compare functions */ +enum rte_distributor_match_function { + RTE_DIST_MATCH_SCALAR = 0, + RTE_DIST_MATCH_VECTOR, + RTE_DIST_NUM_MATCH_FNS +}; + +/** + * Buffer structure used to pass the pointer data between cores. This is cache + * line aligned, but to improve performance and prevent adjacent cache-line + * prefetches of buffers for other workers, e.g. when worker 1's buffer is on + * the next cache line to worker 0, we pad this out to two cache lines. + * We can pass up to 8 mbufs at a time in one cacheline. + * There is a separate cacheline for returns in the burst API. + */ +struct rte_distributor_buffer { + volatile int64_t bufptr64[RTE_DIST_BURST_SIZE] + __rte_cache_aligned; /* <= outgoing to worker */ + + int64_t pad1 __rte_cache_aligned; /* <= one cache line */ + + volatile int64_t retptr64[RTE_DIST_BURST_SIZE] + __rte_cache_aligned; /* <= incoming from worker */ + + int64_t pad2 __rte_cache_aligned; /* <= one cache line */ + + int count __rte_cache_aligned; /* <= number of current mbufs */ +}; + +struct rte_distributor { + TAILQ_ENTRY(rte_distributor) next; /**< Next in list. */ + + char name[RTE_DISTRIBUTOR_NAMESIZE]; /**< Name of the ring. */ + unsigned int num_workers; /**< Number of workers polling */ + unsigned int alg_type; /**< Number of alg types */ + + /**> + * First cache line in the this array are the tags inflight + * on the worker core. Second cache line are the backlog + * that are going to go to the worker core. + */ + uint16_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS][RTE_DIST_BURST_SIZE*2] + __rte_cache_aligned; + + struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS] + __rte_cache_aligned; + + struct rte_distributor_buffer bufs[RTE_DISTRIB_MAX_WORKERS]; + + struct rte_distributor_returned_pkts returns; + + enum rte_distributor_match_function dist_match_fn; + + struct rte_distributor_v20 *d_v20; +}; + +void +find_match_scalar(struct rte_distributor *d, + uint16_t *data_ptr, + uint16_t *output_ptr); + +void +find_match_vec(struct rte_distributor *d, + uint16_t *data_ptr, + uint16_t *output_ptr); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v1705.h b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v1705.h new file mode 100644 index 00000000..df4d9e81 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v1705.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_DISTRIB_V1705_H_ +#define _RTE_DISTRIB_V1705_H_ + +/** + * @file + * RTE distributor + * + * The distributor is a component which is designed to pass packets + * one-at-a-time to workers, with dynamic load balancing. 
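+ *
+ * This header declares the _v1705 variants of the public API; they are bound
+ * to the unversioned names through the BIND_DEFAULT_SYMBOL and
+ * MAP_STATIC_SYMBOL macros in rte_distributor.c.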
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +struct rte_distributor * +rte_distributor_create_v1705(const char *name, unsigned int socket_id, + unsigned int num_workers, + unsigned int alg_type); + +int +rte_distributor_process_v1705(struct rte_distributor *d, + struct rte_mbuf **mbufs, unsigned int num_mbufs); + +int +rte_distributor_returned_pkts_v1705(struct rte_distributor *d, + struct rte_mbuf **mbufs, unsigned int max_mbufs); + +int +rte_distributor_flush_v1705(struct rte_distributor *d); + +void +rte_distributor_clear_returns_v1705(struct rte_distributor *d); + +int +rte_distributor_get_pkt_v1705(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **pkts, + struct rte_mbuf **oldpkt, unsigned int retcount); + +int +rte_distributor_return_pkt_v1705(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **oldpkt, int num); + +void +rte_distributor_request_pkt_v1705(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **oldpkt, + unsigned int count); + +int +rte_distributor_poll_pkt_v1705(struct rte_distributor *d, + unsigned int worker_id, struct rte_mbuf **mbufs); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v20.c b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v20.c new file mode 100644 index 00000000..9566b53f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v20.c @@ -0,0 +1,401 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_distributor_v20.h" +#include "rte_distributor_private.h" + +TAILQ_HEAD(rte_distributor_list, rte_distributor_v20); + +static struct rte_tailq_elem rte_distributor_tailq = { + .name = "RTE_DISTRIBUTOR", +}; +EAL_REGISTER_TAILQ(rte_distributor_tailq) + +/**** APIs called by workers ****/ + +void +rte_distributor_request_pkt_v20(struct rte_distributor_v20 *d, + unsigned worker_id, struct rte_mbuf *oldpkt) +{ + union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id]; + int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS) + | RTE_DISTRIB_GET_BUF; + while (unlikely(buf->bufptr64 & RTE_DISTRIB_FLAGS_MASK)) + rte_pause(); + buf->bufptr64 = req; +} +VERSION_SYMBOL(rte_distributor_request_pkt, _v20, 2.0); + +struct rte_mbuf * +rte_distributor_poll_pkt_v20(struct rte_distributor_v20 *d, + unsigned worker_id) +{ + union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id]; + if (buf->bufptr64 & RTE_DISTRIB_GET_BUF) + return NULL; + + /* since bufptr64 is signed, this should be an arithmetic shift */ + int64_t ret = buf->bufptr64 >> RTE_DISTRIB_FLAG_BITS; + return (struct rte_mbuf *)((uintptr_t)ret); +} +VERSION_SYMBOL(rte_distributor_poll_pkt, _v20, 2.0); + +struct rte_mbuf * +rte_distributor_get_pkt_v20(struct rte_distributor_v20 *d, + unsigned worker_id, struct rte_mbuf *oldpkt) +{ + struct rte_mbuf *ret; + rte_distributor_request_pkt_v20(d, worker_id, oldpkt); + while ((ret = rte_distributor_poll_pkt_v20(d, worker_id)) == NULL) + rte_pause(); + return ret; +} +VERSION_SYMBOL(rte_distributor_get_pkt, _v20, 2.0); + +int +rte_distributor_return_pkt_v20(struct rte_distributor_v20 *d, + unsigned worker_id, struct rte_mbuf *oldpkt) +{ + union rte_distributor_buffer_v20 *buf = &d->bufs[worker_id]; + uint64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS) + | RTE_DISTRIB_RETURN_BUF; + buf->bufptr64 = req; + return 0; +} 
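+/*
+ * Worked example of the pointer/flag packing used above, assuming a
+ * hypothetical mbuf address of 0x7f123456800: the request word is
+ * (0x7f123456800 << RTE_DISTRIB_FLAG_BITS) | RTE_DISTRIB_GET_BUF
+ * = 0x7f1234568001, and the arithmetic right shift by four bits in
+ * rte_distributor_poll_pkt_v20() recovers the original address.
+ */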
+VERSION_SYMBOL(rte_distributor_return_pkt, _v20, 2.0); + +/**** APIs called on distributor core ***/ + +/* as name suggests, adds a packet to the backlog for a particular worker */ +static int +add_to_backlog(struct rte_distributor_backlog *bl, int64_t item) +{ + if (bl->count == RTE_DISTRIB_BACKLOG_SIZE) + return -1; + + bl->pkts[(bl->start + bl->count++) & (RTE_DISTRIB_BACKLOG_MASK)] + = item; + return 0; +} + +/* takes the next packet for a worker off the backlog */ +static int64_t +backlog_pop(struct rte_distributor_backlog *bl) +{ + bl->count--; + return bl->pkts[bl->start++ & RTE_DISTRIB_BACKLOG_MASK]; +} + +/* stores a packet returned from a worker inside the returns array */ +static inline void +store_return(uintptr_t oldbuf, struct rte_distributor_v20 *d, + unsigned *ret_start, unsigned *ret_count) +{ + /* store returns in a circular buffer - code is branch-free */ + d->returns.mbufs[(*ret_start + *ret_count) & RTE_DISTRIB_RETURNS_MASK] + = (void *)oldbuf; + *ret_start += (*ret_count == RTE_DISTRIB_RETURNS_MASK) & !!(oldbuf); + *ret_count += (*ret_count != RTE_DISTRIB_RETURNS_MASK) & !!(oldbuf); +} + +static inline void +handle_worker_shutdown(struct rte_distributor_v20 *d, unsigned int wkr) +{ + d->in_flight_tags[wkr] = 0; + d->in_flight_bitmask &= ~(1UL << wkr); + d->bufs[wkr].bufptr64 = 0; + if (unlikely(d->backlog[wkr].count != 0)) { + /* On return of a packet, we need to move the + * queued packets for this core elsewhere. + * Easiest solution is to set things up for + * a recursive call. That will cause those + * packets to be queued up for the next free + * core, i.e. it will return as soon as a + * core becomes free to accept the first + * packet, as subsequent ones will be added to + * the backlog for that core. + */ + struct rte_mbuf *pkts[RTE_DISTRIB_BACKLOG_SIZE]; + unsigned i; + struct rte_distributor_backlog *bl = &d->backlog[wkr]; + + for (i = 0; i < bl->count; i++) { + unsigned idx = (bl->start + i) & + RTE_DISTRIB_BACKLOG_MASK; + pkts[i] = (void *)((uintptr_t)(bl->pkts[idx] >> + RTE_DISTRIB_FLAG_BITS)); + } + /* recursive call. + * Note that the tags were set before first level call + * to rte_distributor_process. + */ + rte_distributor_process_v20(d, pkts, i); + bl->count = bl->start = 0; + } +} + +/* this function is called when process() fn is called without any new + * packets. It goes through all the workers and clears any returned packets + * to do a partial flush. 
+ */ +static int +process_returns(struct rte_distributor_v20 *d) +{ + unsigned wkr; + unsigned flushed = 0; + unsigned ret_start = d->returns.start, + ret_count = d->returns.count; + + for (wkr = 0; wkr < d->num_workers; wkr++) { + + const int64_t data = d->bufs[wkr].bufptr64; + uintptr_t oldbuf = 0; + + if (data & RTE_DISTRIB_GET_BUF) { + flushed++; + if (d->backlog[wkr].count) + d->bufs[wkr].bufptr64 = + backlog_pop(&d->backlog[wkr]); + else { + d->bufs[wkr].bufptr64 = RTE_DISTRIB_GET_BUF; + d->in_flight_tags[wkr] = 0; + d->in_flight_bitmask &= ~(1UL << wkr); + } + oldbuf = data >> RTE_DISTRIB_FLAG_BITS; + } else if (data & RTE_DISTRIB_RETURN_BUF) { + handle_worker_shutdown(d, wkr); + oldbuf = data >> RTE_DISTRIB_FLAG_BITS; + } + + store_return(oldbuf, d, &ret_start, &ret_count); + } + + d->returns.start = ret_start; + d->returns.count = ret_count; + + return flushed; +} + +/* process a set of packets to distribute them to workers */ +int +rte_distributor_process_v20(struct rte_distributor_v20 *d, + struct rte_mbuf **mbufs, unsigned num_mbufs) +{ + unsigned next_idx = 0; + unsigned wkr = 0; + struct rte_mbuf *next_mb = NULL; + int64_t next_value = 0; + uint32_t new_tag = 0; + unsigned ret_start = d->returns.start, + ret_count = d->returns.count; + + if (unlikely(num_mbufs == 0)) + return process_returns(d); + + while (next_idx < num_mbufs || next_mb != NULL) { + + int64_t data = d->bufs[wkr].bufptr64; + uintptr_t oldbuf = 0; + + if (!next_mb) { + next_mb = mbufs[next_idx++]; + next_value = (((int64_t)(uintptr_t)next_mb) + << RTE_DISTRIB_FLAG_BITS); + /* + * User is advocated to set tag value for each + * mbuf before calling rte_distributor_process. + * User defined tags are used to identify flows, + * or sessions. + */ + new_tag = next_mb->hash.usr; + + /* + * Note that if RTE_DISTRIB_MAX_WORKERS is larger than 64 + * then the size of match has to be expanded. 
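+ *
+ * For example, with three workers whose in-flight tags are {5, 9, 5} and an
+ * incoming tag of 5, the loop below yields match = 0b101 before it is ANDed
+ * with in_flight_bitmask.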
+ */ + uint64_t match = 0; + unsigned i; + /* + * to scan for a match use "xor" and "not" to get a 0/1 + * value, then use shifting to merge to single "match" + * variable, where a one-bit indicates a match for the + * worker given by the bit-position + */ + for (i = 0; i < d->num_workers; i++) + match |= (!(d->in_flight_tags[i] ^ new_tag) + << i); + + /* Only turned-on bits are considered as match */ + match &= d->in_flight_bitmask; + + if (match) { + next_mb = NULL; + unsigned worker = __builtin_ctzl(match); + if (add_to_backlog(&d->backlog[worker], + next_value) < 0) + next_idx--; + } + } + + if ((data & RTE_DISTRIB_GET_BUF) && + (d->backlog[wkr].count || next_mb)) { + + if (d->backlog[wkr].count) + d->bufs[wkr].bufptr64 = + backlog_pop(&d->backlog[wkr]); + + else { + d->bufs[wkr].bufptr64 = next_value; + d->in_flight_tags[wkr] = new_tag; + d->in_flight_bitmask |= (1UL << wkr); + next_mb = NULL; + } + oldbuf = data >> RTE_DISTRIB_FLAG_BITS; + } else if (data & RTE_DISTRIB_RETURN_BUF) { + handle_worker_shutdown(d, wkr); + oldbuf = data >> RTE_DISTRIB_FLAG_BITS; + } + + /* store returns in a circular buffer */ + store_return(oldbuf, d, &ret_start, &ret_count); + + if (++wkr == d->num_workers) + wkr = 0; + } + /* to finish, check all workers for backlog and schedule work for them + * if they are ready */ + for (wkr = 0; wkr < d->num_workers; wkr++) + if (d->backlog[wkr].count && + (d->bufs[wkr].bufptr64 & RTE_DISTRIB_GET_BUF)) { + + int64_t oldbuf = d->bufs[wkr].bufptr64 >> + RTE_DISTRIB_FLAG_BITS; + store_return(oldbuf, d, &ret_start, &ret_count); + + d->bufs[wkr].bufptr64 = backlog_pop(&d->backlog[wkr]); + } + + d->returns.start = ret_start; + d->returns.count = ret_count; + return num_mbufs; +} +VERSION_SYMBOL(rte_distributor_process, _v20, 2.0); + +/* return to the caller, packets returned from workers */ +int +rte_distributor_returned_pkts_v20(struct rte_distributor_v20 *d, + struct rte_mbuf **mbufs, unsigned max_mbufs) +{ + struct rte_distributor_returned_pkts *returns = &d->returns; + unsigned retval = (max_mbufs < returns->count) ? + max_mbufs : returns->count; + unsigned i; + + for (i = 0; i < retval; i++) { + unsigned idx = (returns->start + i) & RTE_DISTRIB_RETURNS_MASK; + mbufs[i] = returns->mbufs[idx]; + } + returns->start += i; + returns->count -= i; + + return retval; +} +VERSION_SYMBOL(rte_distributor_returned_pkts, _v20, 2.0); + +/* return the number of packets in-flight in a distributor, i.e. packets + * being worked on or queued up in a backlog. + */ +static inline unsigned +total_outstanding(const struct rte_distributor_v20 *d) +{ + unsigned wkr, total_outstanding; + + total_outstanding = __builtin_popcountl(d->in_flight_bitmask); + + for (wkr = 0; wkr < d->num_workers; wkr++) + total_outstanding += d->backlog[wkr].count; + + return total_outstanding; +} + +/* flush the distributor, so that there are no outstanding packets in flight or + * queued up. 
*/ +int +rte_distributor_flush_v20(struct rte_distributor_v20 *d) +{ + const unsigned flushed = total_outstanding(d); + + while (total_outstanding(d) > 0) + rte_distributor_process_v20(d, NULL, 0); + + return flushed; +} +VERSION_SYMBOL(rte_distributor_flush, _v20, 2.0); + +/* clears the internal returns array in the distributor */ +void +rte_distributor_clear_returns_v20(struct rte_distributor_v20 *d) +{ + d->returns.start = d->returns.count = 0; +#ifndef __OPTIMIZE__ + memset(d->returns.mbufs, 0, sizeof(d->returns.mbufs)); +#endif +} +VERSION_SYMBOL(rte_distributor_clear_returns, _v20, 2.0); + +/* creates a distributor instance */ +struct rte_distributor_v20 * +rte_distributor_create_v20(const char *name, + unsigned socket_id, + unsigned num_workers) +{ + struct rte_distributor_v20 *d; + struct rte_distributor_list *distributor_list; + char mz_name[RTE_MEMZONE_NAMESIZE]; + const struct rte_memzone *mz; + + /* compilation-time checks */ + RTE_BUILD_BUG_ON((sizeof(*d) & RTE_CACHE_LINE_MASK) != 0); + RTE_BUILD_BUG_ON((RTE_DISTRIB_MAX_WORKERS & 7) != 0); + RTE_BUILD_BUG_ON(RTE_DISTRIB_MAX_WORKERS > + sizeof(d->in_flight_bitmask) * CHAR_BIT); + + if (name == NULL || num_workers >= RTE_DISTRIB_MAX_WORKERS) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(mz_name, sizeof(mz_name), RTE_DISTRIB_PREFIX"%s", name); + mz = rte_memzone_reserve(mz_name, sizeof(*d), socket_id, NO_FLAGS); + if (mz == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + d = mz->addr; + snprintf(d->name, sizeof(d->name), "%s", name); + d->num_workers = num_workers; + + distributor_list = RTE_TAILQ_CAST(rte_distributor_tailq.head, + rte_distributor_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_INSERT_TAIL(distributor_list, d, next); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return d; +} +VERSION_SYMBOL(rte_distributor_create, _v20, 2.0); diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v20.h b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v20.h new file mode 100644 index 00000000..12865658 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_v20.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_DISTRIB_V20_H_ +#define _RTE_DISTRIB_V20_H_ + +/** + * @file + * RTE distributor + * + * The distributor is a component which is designed to pass packets + * one-at-a-time to workers, with dynamic load balancing. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_DISTRIBUTOR_NAMESIZE 32 /**< Length of name for instance */ + +struct rte_distributor_v20; +struct rte_mbuf; + +/** + * Function to create a new distributor instance + * + * Reserves the memory needed for the distributor operation and + * initializes the distributor to work with the configured number of workers. + * + * @param name + * The name to be given to the distributor instance. + * @param socket_id + * The NUMA node on which the memory is to be allocated + * @param num_workers + * The maximum number of workers that will request packets from this + * distributor + * @return + * The newly created distributor instance + */ +struct rte_distributor_v20 * +rte_distributor_create_v20(const char *name, unsigned int socket_id, + unsigned int num_workers); + +/* *** APIS to be called on the distributor lcore *** */ +/* + * The following APIs are the public APIs which are designed for use on a + * single lcore which acts as the distributor lcore for a given distributor + * instance. 
These functions cannot be called on multiple cores simultaneously + * without using locking to protect access to the internals of the distributor. + * + * NOTE: a given lcore cannot act as both a distributor lcore and a worker lcore + * for the same distributor instance, otherwise deadlock will result. + */ + +/** + * Process a set of packets by distributing them among workers that request + * packets. The distributor will ensure that no two packets that have the + * same flow id, or tag, in the mbuf will be processed at the same time. + * + * The user is advocated to set tag for each mbuf before calling this function. + * If user doesn't set the tag, the tag value can be various values depending on + * driver implementation and configuration. + * + * This is not multi-thread safe and should only be called on a single lcore. + * + * @param d + * The distributor instance to be used + * @param mbufs + * The mbufs to be distributed + * @param num_mbufs + * The number of mbufs in the mbufs array + * @return + * The number of mbufs processed. + */ +int +rte_distributor_process_v20(struct rte_distributor_v20 *d, + struct rte_mbuf **mbufs, unsigned int num_mbufs); + +/** + * Get a set of mbufs that have been returned to the distributor by workers + * + * This should only be called on the same lcore as rte_distributor_process() + * + * @param d + * The distributor instance to be used + * @param mbufs + * The mbufs pointer array to be filled in + * @param max_mbufs + * The size of the mbufs array + * @return + * The number of mbufs returned in the mbufs array. + */ +int +rte_distributor_returned_pkts_v20(struct rte_distributor_v20 *d, + struct rte_mbuf **mbufs, unsigned int max_mbufs); + +/** + * Flush the distributor component, so that there are no in-flight or + * backlogged packets awaiting processing + * + * This should only be called on the same lcore as rte_distributor_process() + * + * @param d + * The distributor instance to be used + * @return + * The number of queued/in-flight packets that were completed by this call. + */ +int +rte_distributor_flush_v20(struct rte_distributor_v20 *d); + +/** + * Clears the array of returned packets used as the source for the + * rte_distributor_returned_pkts() API call. + * + * This should only be called on the same lcore as rte_distributor_process() + * + * @param d + * The distributor instance to be used + */ +void +rte_distributor_clear_returns_v20(struct rte_distributor_v20 *d); + +/* *** APIS to be called on the worker lcores *** */ +/* + * The following APIs are the public APIs which are designed for use on + * multiple lcores which act as workers for a distributor. Each lcore should use + * a unique worker id when requesting packets. + * + * NOTE: a given lcore cannot act as both a distributor lcore and a worker lcore + * for the same distributor instance, otherwise deadlock will result. + */ + +/** + * API called by a worker to get a new packet to process. Any previous packet + * given to the worker is assumed to have completed processing, and may be + * optionally returned to the distributor via the oldpkt parameter. + * + * @param d + * The distributor instance to be used + * @param worker_id + * The worker instance number to use - must be less that num_workers passed + * at distributor creation time. + * @param oldpkt + * The previous packet, if any, being processed by the worker + * + * @return + * A new packet to be processed by the worker thread. 
+ */ +struct rte_mbuf * +rte_distributor_get_pkt_v20(struct rte_distributor_v20 *d, + unsigned int worker_id, struct rte_mbuf *oldpkt); + +/** + * API called by a worker to return a completed packet without requesting a + * new packet, for example, because a worker thread is shutting down + * + * @param d + * The distributor instance to be used + * @param worker_id + * The worker instance number to use - must be less that num_workers passed + * at distributor creation time. + * @param mbuf + * The previous packet being processed by the worker + */ +int +rte_distributor_return_pkt_v20(struct rte_distributor_v20 *d, + unsigned int worker_id, struct rte_mbuf *mbuf); + +/** + * API called by a worker to request a new packet to process. + * Any previous packet given to the worker is assumed to have completed + * processing, and may be optionally returned to the distributor via + * the oldpkt parameter. + * Unlike rte_distributor_get_pkt(), this function does not wait for a new + * packet to be provided by the distributor. + * + * NOTE: after calling this function, rte_distributor_poll_pkt() should + * be used to poll for the packet requested. The rte_distributor_get_pkt() + * API should *not* be used to try and retrieve the new packet. + * + * @param d + * The distributor instance to be used + * @param worker_id + * The worker instance number to use - must be less that num_workers passed + * at distributor creation time. + * @param oldpkt + * The previous packet, if any, being processed by the worker + */ +void +rte_distributor_request_pkt_v20(struct rte_distributor_v20 *d, + unsigned int worker_id, struct rte_mbuf *oldpkt); + +/** + * API called by a worker to check for a new packet that was previously + * requested by a call to rte_distributor_request_pkt(). It does not wait + * for the new packet to be available, but returns NULL if the request has + * not yet been fulfilled by the distributor. + * + * @param d + * The distributor instance to be used + * @param worker_id + * The worker instance number to use - must be less that num_workers passed + * at distributor creation time. + * + * @return + * A new packet to be processed by the worker thread, or NULL if no + * packet is yet available. 
+ */ +struct rte_mbuf * +rte_distributor_poll_pkt_v20(struct rte_distributor_v20 *d, + unsigned int worker_id); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_distributor/rte_distributor_version.map b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_version.map new file mode 100644 index 00000000..3a285b39 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_distributor/rte_distributor_version.map @@ -0,0 +1,29 @@ +DPDK_2.0 { + global: + + rte_distributor_clear_returns; + rte_distributor_create; + rte_distributor_flush; + rte_distributor_get_pkt; + rte_distributor_poll_pkt; + rte_distributor_process; + rte_distributor_request_pkt; + rte_distributor_return_pkt; + rte_distributor_returned_pkts; + + local: *; +}; + +DPDK_17.05 { + global: + + rte_distributor_clear_returns; + rte_distributor_create; + rte_distributor_flush; + rte_distributor_get_pkt; + rte_distributor_poll_pkt; + rte_distributor_process; + rte_distributor_request_pkt; + rte_distributor_return_pkt; + rte_distributor_returned_pkts; +} DPDK_2.0; diff --git a/src/spdk/dpdk/lib/librte_eal/Makefile b/src/spdk/dpdk/lib/librte_eal/Makefile new file mode 100644 index 00000000..ccd45cb8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +DIRS-y += common +DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linuxapp +DEPDIRS-linuxapp := common +DIRS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += bsdapp +DEPDIRS-bsdapp := common + +include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/BSDmakefile.meson b/src/spdk/dpdk/lib/librte_eal/bsdapp/BSDmakefile.meson new file mode 100644 index 00000000..42f5b2b9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/BSDmakefile.meson @@ -0,0 +1,43 @@ +# BSD LICENSE +# +# Copyright(c) 2017 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +# makefile for building kernel modules using meson +# takes parameters from the environment + +# source file is passed via KMOD_SRC as full path, we only use final +# component of it, as VPATH is used to find actual file, so as to +# have the .o files placed in the build, not source directory +VPATH = ${KMOD_SRC:H} +SRCS = ${KMOD_SRC:T} device_if.h bus_if.h pci_if.h +CFLAGS += $(KMOD_CFLAGS) + +.include diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/Makefile b/src/spdk/dpdk/lib/librte_eal/bsdapp/Makefile new file mode 100644 index 00000000..5b06b216 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +DIRS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal + +include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/Makefile b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/Makefile new file mode 100644 index 00000000..d27da3d1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/Makefile @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2015 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) +VPATH += $(RTE_SDK)/lib/librte_eal/common +VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -lexecinfo +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrte_kvargs + +EXPORT_MAP := ../../rte_eal_version.map + +LIBABIVER := 8 + +# specific to bsdapp exec-env +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_alarm.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_dev.c + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_hypervisor.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_class.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_bus.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_thread.c 
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_uuid.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_hypervisor.c +SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c +SRCS-y += rte_cycles.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +CFLAGS_eal.o := -D_GNU_SOURCE +#CFLAGS_eal_thread.o := -D_GNU_SOURCE +CFLAGS_eal_log.o := -D_GNU_SOURCE +CFLAGS_eal_common_log.o := -D_GNU_SOURCE + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +CFLAGS_eal_hpet.o += -Wno-return-type +endif + +INC := # no bsdapp specific headers + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_BSDAPP)-include/exec-env := \ + $(addprefix include/exec-env/,$(INC)) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal.c new file mode 100644 index 00000000..d7ae9d68 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal.c @@ -0,0 +1,941 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memsegs), + .l_len = sizeof(early_mem_config.memsegs), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +static const char *default_runtime_dir = "/var/run"; + +int +eal_create_runtime_dir(void) +{ + const char *directory = default_runtime_dir; + 
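+ /*
+ * The resulting layout is <base>/dpdk/<prefix>, where <base> is /var/run
+ * for root and XDG_RUNTIME_DIR (or /tmp) otherwise, and <prefix> is the
+ * configured hugefile prefix.
+ */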
const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); + const char *fallback = "/tmp"; + char tmp[PATH_MAX]; + int ret; + + if (getuid() != 0) { + /* try XDG path first, fall back to /tmp */ + if (xdg_runtime_dir != NULL) + directory = xdg_runtime_dir; + else + directory = fallback; + } + /* create DPDK subdirectory under runtime dir */ + ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); + if (ret < 0 || ret == sizeof(tmp)) { + RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); + return -1; + } + + /* create prefix-specific subdirectory under DPDK runtime dir */ + ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", + tmp, internal_config.hugefile_prefix); + if (ret < 0 || ret == sizeof(runtime_dir)) { + RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); + return -1; + } + + /* create the path if it doesn't exist. no "mkdir -p" here, so do it + * step by step. + */ + ret = mkdir(tmp, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + tmp, strerror(errno)); + return -1; + } + + ret = mkdir(runtime_dir, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + runtime_dir, strerror(errno)); + return -1; + } + + return 0; +} + +const char * +eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return user provided mbuf pool ops name */ +const char * +rte_eal_mbuf_user_pool_ops(void) +{ + return internal_config.user_mbuf_pool_ops_name; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +enum rte_iova_mode +rte_eal_iova_mode(void) +{ + return rte_eal_get_configuration()->iova_mode; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. memzones, to detect if there are running secondary + * processes. */ +static void +rte_eal_config_create(void) +{ + void *rte_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config)); + if (retval < 0){ + close(mem_cfg_fd); + rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname); + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. 
Is another primary " + "process running?\n", pathname); + } + + rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config), + PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0); + + if (rte_mem_cfg_addr == MAP_FAILED){ + rte_panic("Cannot mmap memory for rte_config\n"); + } + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = rte_mem_cfg_addr; +} + +/* attach to an existing shared memory config */ +static void +rte_eal_config_attach(void) +{ + void *rte_mem_cfg_addr; + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config), + PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0); + close(mem_cfg_fd); + if (rte_mem_cfg_addr == MAP_FAILED) + rte_panic("Cannot mmap memory for rte_config\n"); + + rte_config.mem_config = rte_mem_cfg_addr; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if there no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static void +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + rte_eal_config_create(); + break; + case RTE_PROC_SECONDARY: + rte_eal_config_attach(); + rte_eal_mcfg_wait_complete(rte_config.mem_config); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + rte_panic("Invalid process type\n"); + } +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. */ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? 
(size_t)(size) : SIZE_MAX; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + const int old_optreset = optreset; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + optreset = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? + eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optreset = old_optreset; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + const int old_optreset = optreset; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + optreset = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* getopt is not happy, stop right now */ + if (opt == '?') { + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case OPT_MBUF_POOL_OPS_NAME_NUM: + internal_config.user_mbuf_pool_ops_name = + strdup(optarg); + break; + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on FreeBSD\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on FreeBSD\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on FreeBSD\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optreset = old_optreset; + optarg = old_optarg; + + return ret; +} + +static int +check_socket(const struct rte_memseg_list *msl, void *arg) +{ + int *socket_id = arg; + + if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0) + return 1; + + return 0; +} + +static void +eal_check_mem_on_local_socket(void) +{ + int socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + if (rte_memseg_list_walk(check_socket, &socket_id) == 0) + RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); +} + + +static int +sync_func(__attribute__((unused)) void *arg) +{ + return 0; +} + 
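A minimal calling sketch for the FreeBSD EAL init path added in this file, assuming the standard DPDK application pattern (usage hook registration, rte_eal_init(), then a remote launch on the slave lcores mirroring the sync_func() pattern used below). The names app_usage and app_main_loop are hypothetical; this block is illustrative only and is not part of the upstream file.

#include <stdio.h>

#include <rte_eal.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_debug.h>

/* hypothetical application usage callback, printed after the EAL usage text */
static void
app_usage(const char *prgname)
{
	printf("%s [EAL options] -- [application options]\n", prgname);
}

/* hypothetical per-lcore worker; same signature as sync_func() above */
static int
app_main_loop(__attribute__((unused)) void *arg)
{
	printf("worker running on lcore %u\n", rte_lcore_id());
	return 0;
}

int
main(int argc, char **argv)
{
	/* let eal_usage() call back into the application on -h/--help */
	rte_set_application_usage_hook(app_usage);

	if (rte_eal_init(argc, argv) < 0)
		rte_panic("Cannot init EAL\n");

	/* launch the worker on every slave lcore and wait for completion */
	rte_eal_mp_remote_launch(app_main_loop, NULL, SKIP_MASTER);
	rte_eal_mp_wait_lcore();

	return 0;
}

On FreeBSD this init path also expects the contigmem driver to be loaded, since eal_hugepage_info_init() below sources its "hugepage" buffers from /dev/contigmem rather than from hugetlbfs.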
+inline static void +rte_eal_mcfg_complete(void) +{ + /* ALL shared mem_config related INIT DONE */ + if (rte_config.process_type == RTE_PROC_PRIMARY) + rte_config.mem_config->magic = RTE_MAGIC; +} + +/* return non-zero if hugepages are enabled. */ +int rte_eal_has_hugepages(void) +{ + return !internal_config.no_hugetlbfs; +} + +/* Abstraction for port I/0 privilege */ +int +rte_eal_iopl_init(void) +{ + static int fd; + + fd = open("/dev/io", O_RDWR); + if (fd < 0) + return -1; + /* keep fd open for iopl */ + return 0; +} + +static void rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + +/* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* checks if the machine is adequate */ + if (!rte_cpu_is_supported()) { + rte_eal_init_alert("unsupported cpu type."); + rte_errno = ENOTSUP; + return -1; + } + + if (!rte_atomic32_test_and_set(&run_once)) { + rte_eal_init_alert("already called initialization."); + rte_errno = EALREADY; + return -1; + } + + thread_id = pthread_self(); + + eal_reset_internal_config(&internal_config); + + /* set log level as early as possible */ + eal_log_level_parse(argc, argv); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) { + rte_eal_init_alert("Invalid 'command line' arguments."); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + /* FreeBSD always uses legacy memory model */ + internal_config.legacy_mem = true; + + if (eal_plugins_init() < 0) { + rte_eal_init_alert("Cannot init plugins\n"); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_option_device_parse()) { + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + rte_config_init(); + + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + return -1; + } + + /* Put mp channel init before bus scan so that we can init the vdev + * bus through mp channel in the secondary process before the bus scan. + */ + if (rte_mp_channel_init() < 0) { + rte_eal_init_alert("failed to init mp channel\n"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_errno = EFAULT; + return -1; + } + } + + if (rte_bus_scan()) { + rte_eal_init_alert("Cannot scan the buses for devices\n"); + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + /* autodetect the iova mapping mode (default is iova_pa) */ + rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class(); + + if (internal_config.no_hugetlbfs == 0) { + /* rte_config isn't initialized yet */ + ret = internal_config.process_type == RTE_PROC_PRIMARY ? 
+ eal_hugepage_info_init() : + eal_hugepage_info_read(); + if (ret < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + return -1; + } + } + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + else + internal_config.memory = eal_get_hugepage_mem_size(); + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + rte_srand(rte_rdtsc()); + + /* in secondary processes, memory init may allocate additional fbarrays + * not present in primary processes, so to avoid any potential issues, + * initialize memzones first. + */ + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone\n"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_memory_init() < 0) { + rte_eal_init_alert("Cannot init memory\n"); + rte_errno = ENOMEM; + return -1; + } + + if (rte_eal_malloc_heap_init() < 0) { + rte_eal_init_alert("Cannot init malloc heap\n"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_tailqs_init() < 0) { + rte_eal_init_alert("Cannot init tail queues for objects\n"); + rte_errno = EFAULT; + return -1; + } + + if (rte_eal_alarm_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + /* rte_eal_alarm_init sets rte_errno on failure. */ + return -1; + } + + if (rte_eal_timer_init() < 0) { + rte_eal_init_alert("Cannot init HPET or TSC timers\n"); + rte_errno = ENOTSUP; + return -1; + } + + eal_check_mem_on_local_socket(); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n", + rte_config.master_lcore, thread_id, cpuset, + ret == 0 ? "" : "..."); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, sizeof(thread_name), + "lcore-slave-%d", i); + rte_thread_setname(lcore_config[i].thread_id, thread_name); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* initialize services so vdevs register service during bus_probe. */ + ret = rte_service_init(); + if (ret) { + rte_eal_init_alert("rte_service_init() failed\n"); + rte_errno = ENOEXEC; + return -1; + } + + /* Probe all the buses and devices/drivers on them */ + if (rte_bus_probe()) { + rte_eal_init_alert("Cannot probe devices\n"); + rte_errno = ENOTSUP; + return -1; + } + + /* initialize default service/lcore mappings and start running. 
Ignore + * -ENOTSUP, as it indicates no service coremask passed to EAL. + */ + ret = rte_service_start_with_defaults(); + if (ret < 0 && ret != -ENOTSUP) { + rte_errno = ENOEXEC; + return -1; + } + + rte_eal_mcfg_complete(); + + return fctret; +} + +int __rte_experimental +rte_eal_cleanup(void) +{ + rte_service_finalize(); + return 0; +} + +/* get core role */ +enum rte_lcore_role_t +rte_eal_lcore_role(unsigned lcore_id) +{ + return rte_config.lcore_role[lcore_id]; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_pci(void) +{ + return !internal_config.no_pci; +} + +int rte_eal_create_uio_dev(void) +{ + return internal_config.create_uio_dev; +} + +enum rte_intr_mode +rte_eal_vfio_intr_mode(void) +{ + return RTE_INTR_MODE_NONE; +} + +int rte_vfio_setup_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *vfio_dev_fd, + __rte_unused struct vfio_device_info *device_info) +{ + return -1; +} + +int rte_vfio_release_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int fd) +{ + return -1; +} + +int rte_vfio_enable(__rte_unused const char *modname) +{ + return -1; +} + +int rte_vfio_is_enabled(__rte_unused const char *modname) +{ + return 0; +} + +int rte_vfio_noiommu_is_enabled(void) +{ + return 0; +} + +int rte_vfio_clear_group(__rte_unused int vfio_group_fd) +{ + return 0; +} + +int +rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_get_group_num(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *iommu_group_num) +{ + return -1; +} + +int +rte_vfio_get_container_fd(void) +{ + return -1; +} + +int +rte_vfio_get_group_fd(__rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_create(void) +{ + return -1; +} + +int +rte_vfio_container_destroy(__rte_unused int container_fd) +{ + return -1; +} + +int +rte_vfio_container_group_bind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_group_unbind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_dma_map(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_container_dma_unmap(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_alarm.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_alarm.c new file mode 100644 index 00000000..51ea4b8c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_alarm.c @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_alarm_private.h" + +#define NS_PER_US 1000 + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) 
next; + struct rte_intr_handle handle; + struct timespec time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static void eal_alarm_callback(void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + + /* on FreeBSD, timers don't use fd's, and their identifiers are stored + * in separate namespace from fd's, so using any value is OK. however, + * EAL interrupts handler expects fd's to be unique, so use an actual fd + * to guarantee unique timer identifier. + */ + intr_handle.fd = open("/dev/zero", O_RDONLY); + + return 0; +} + +static inline int +timespec_cmp(const struct timespec *now, const struct timespec *at) +{ + if (now->tv_sec < at->tv_sec) + return -1; + if (now->tv_sec > at->tv_sec) + return 1; + if (now->tv_nsec < at->tv_nsec) + return -1; + if (now->tv_nsec > at->tv_nsec) + return 1; + return 0; +} + +static inline uint64_t +diff_ns(struct timespec *now, struct timespec *at) +{ + uint64_t now_ns, at_ns; + + if (timespec_cmp(now, at) >= 0) + return 0; + + now_ns = now->tv_sec * NS_PER_S + now->tv_nsec; + at_ns = at->tv_sec * NS_PER_S + at->tv_nsec; + + return at_ns - now_ns; +} + +int +eal_alarm_get_timeout_ns(uint64_t *val) +{ + struct alarm_entry *ap; + struct timespec now; + + if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) + return -1; + + if (LIST_EMPTY(&alarm_list)) + return -1; + + ap = LIST_FIRST(&alarm_list); + + *val = diff_ns(&now, &ap->time); + + return 0; +} + +static int +unregister_current_callback(void) +{ + struct alarm_entry *ap; + int ret = 0; + + if (!LIST_EMPTY(&alarm_list)) { + ap = LIST_FIRST(&alarm_list); + + do { + ret = rte_intr_callback_unregister(&intr_handle, + eal_alarm_callback, &ap->time); + } while (ret == -EAGAIN); + } + + return ret; +} + +static int +register_first_callback(void) +{ + struct alarm_entry *ap; + int ret = 0; + + if (!LIST_EMPTY(&alarm_list)) { + ap = LIST_FIRST(&alarm_list); + + /* register a new callback */ + ret = rte_intr_callback_register(&intr_handle, + eal_alarm_callback, &ap->time); + } + return ret; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + ap = LIST_FIRST(&alarm_list); + + if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) + return; + + while (ap != NULL && timespec_cmp(&now, &ap->time) >= 0) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + free(ap); + + ap = LIST_FIRST(&alarm_list); + } + + /* timer has been deleted from the kqueue, so recreate it if needed */ + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); +} + + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *new_alarm; + struct timespec now; + uint64_t ns; + int ret = 0; + + /* check parameters, also ensure us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = calloc(1, sizeof(*new_alarm)); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + ns = us * 
NS_PER_US; + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_nsec = (now.tv_nsec + ns) % NS_PER_S; + new_alarm->time.tv_sec = now.tv_sec + ((now.tv_nsec + ns) / NS_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (timespec_cmp(&new_alarm->time, &ap->time) < 0) { + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + /* re-register first callback just in case */ + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); + + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while (1) { + ap = LIST_FIRST(&alarm_list); + if (ap == NULL) + break; + if (cb_fn != ap->cb_fn) + break; + if (cb_arg != ap->cb_arg && cb_arg != (void *) -1) + break; + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + } else { + /* If calling from other context, mark that + * alarm is executing so loop can spin till it + * finish. Otherwise we are trying to cancel + * ourselves - mark it by EINPROGRESS. + */ + if (pthread_equal(ap->executing_id, + pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || + cb_arg == ap->cb_arg)) { + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, + pthread_self()) == 0) { + executing++; + } else { + err = EINPROGRESS; + } + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + rte_spinlock_lock(&alarm_list_lk); + + /* unregister if no alarms left, otherwise re-register first */ + if (LIST_EMPTY(&alarm_list)) + unregister_current_callback(); + else + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); + + return count; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_alarm_private.h b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_alarm_private.h new file mode 100644 index 00000000..65c71151 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_alarm_private.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef EAL_ALARM_PRIVATE_H +#define EAL_ALARM_PRIVATE_H + +#include + +/* + * FreeBSD needs a back-channel communication mechanism between interrupt and + * alarm thread, because on FreeBSD, timer period is set up inside the interrupt + * API and not inside alarm API like on Linux. 
+ */ + +int +eal_alarm_get_timeout_ns(uint64_t *val); + +#endif // EAL_ALARM_PRIVATE_H diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_cpuflags.c new file mode 100644 index 00000000..69b161ea --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_cpuflags.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include +#include + +unsigned long +rte_cpu_getauxval(unsigned long type __rte_unused) +{ + /* not implemented */ + return 0; +} + +int +rte_cpu_strcmp_auxval(unsigned long type __rte_unused, + const char *str __rte_unused) +{ + /* not implemented */ + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_debug.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_debug.c new file mode 100644 index 00000000..5d92500b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_debug.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifdef RTE_BACKTRACE +#include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ +#ifdef RTE_BACKTRACE + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +#endif /* RTE_BACKTRACE */ +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) 
+{ + va_list ap; + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + if (rte_eal_cleanup() != 0) + RTE_LOG(CRIT, EAL, + "EAL could not release all resources\n"); + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_dev.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_dev.c new file mode 100644 index 00000000..1c6c51bd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_dev.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include + +int __rte_experimental +rte_dev_event_monitor_start(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int __rte_experimental +rte_dev_event_monitor_stop(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c new file mode 100644 index 00000000..1e8f5df2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include + +#include +#include +#include "eal_hugepages.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" + +#define CONTIGMEM_DEV "/dev/contigmem" + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +map_shared_memory(const char *filename, const size_t mem_size, int flags) +{ + void *retval; + int fd = open(filename, flags, 0666); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +static void * +open_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR); +} + +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); +} + +/* + * No hugepage support on freebsd, but we dummy it, using contigmem driver + */ +int +eal_hugepage_info_init(void) +{ + size_t sysctl_size; + int num_buffers, fd, error; + int64_t buffer_size; + /* re-use the linux "internal config" structure for our memory data */ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + unsigned int i; + + internal_config.num_hugepage_sizes = 1; + + sysctl_size = sizeof(num_buffers); + error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers, + &sysctl_size, NULL, 0); + + if (error != 0) { + RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n"); + return -1; + } + + sysctl_size = sizeof(buffer_size); + error = sysctlbyname("hw.contigmem.buffer_size", &buffer_size, + &sysctl_size, NULL, 0); + + if (error != 0) { + RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n"); + return -1; + } + + fd = open(CONTIGMEM_DEV, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "could not open "CONTIGMEM_DEV"\n"); + return -1; + } + + if (buffer_size >= 1<<30) + RTE_LOG(INFO, 
EAL, "Contigmem driver has %d buffers, each of size %dGB\n", + num_buffers, (int)(buffer_size>>30)); + else if (buffer_size >= 1<<20) + RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dMB\n", + num_buffers, (int)(buffer_size>>20)); + else + RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n", + num_buffers, (int)(buffer_size>>10)); + + strlcpy(hpi->hugedir, CONTIGMEM_DEV, sizeof(hpi->hugedir)); + hpi->hugepage_sz = buffer_size; + hpi->num_pages[0] = num_buffers; + hpi->lock_descriptor = fd; + + /* for no shared files mode, do not create shared memory config */ + if (internal_config.no_shconf) + return 0; + + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL ) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + return -1; + } + + memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); + + /* we've copied file descriptors along with everything else, but they + * will be invalid in secondary process, so overwrite them + */ + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + struct hugepage_info *tmp = &tmp_hpi[i]; + tmp->lock_descriptor = -1; + } + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + + return 0; +} + +/* copy stuff from shared info into internal config */ +int +eal_hugepage_info_read(void) +{ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + + internal_config.num_hugepage_sizes = 1; + + tmp_hpi = open_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); + return -1; + } + + memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_interrupts.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_interrupts.c new file mode 100644 index 00000000..2feee2d5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_interrupts.c @@ -0,0 +1,561 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_alarm_private.h" + +#define MAX_INTR_EVENTS 16 + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +static volatile int kq = -1; + +static int +intr_source_to_kevent(const struct 
rte_intr_handle *ih, struct kevent *ke) +{ + /* alarm callbacks are special case */ + if (ih->type == RTE_INTR_HANDLE_ALARM) { + uint64_t timeout_ns; + + /* get soonest alarm timeout */ + if (eal_alarm_get_timeout_ns(&timeout_ns) < 0) + return -1; + + ke->filter = EVFILT_TIMER; + /* timers are one shot */ + ke->flags |= EV_ONESHOT; + ke->fflags = NOTE_NSECONDS; + ke->data = timeout_ns; + } else { + ke->filter = EVFILT_READ; + } + ke->ident = ih->fd; + + return 0; +} + +int +rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + struct rte_intr_callback *callback = NULL; + struct rte_intr_source *src = NULL; + int ret, add_event; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active: %d\n", kq); + return -ENODEV; + } + + /* allocate a new interrupt callback entity */ + callback = calloc(1, sizeof(*callback)); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + return -ENOMEM; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + + rte_spinlock_lock(&intr_lock); + + /* check if there is at least one callback registered for the fd */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) { + /* we had no interrupts for this */ + if (TAILQ_EMPTY(&src->callbacks)) + add_event = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + ret = 0; + break; + } + } + + /* no existing callbacks for this - add new source */ + if (src == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + ret = -ENOMEM; + goto fail; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + add_event = 1; + ret = 0; + } + } + + /* add events to the queue. timer events are special as we need to + * re-set the timer. + */ + if (add_event || src->intr_handle.type == RTE_INTR_HANDLE_ALARM) { + struct kevent ke; + + memset(&ke, 0, sizeof(ke)); + ke.flags = EV_ADD; /* mark for addition to the queue */ + + if (intr_source_to_kevent(intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert interrupt handle to kevent\n"); + ret = -ENODEV; + goto fail; + } + + /** + * add the intr file descriptor into wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + /* currently, nic_uio does not support interrupts, so + * this error will always be triggered and output to the + * user. so, don't output it unless debug log level set. 
+ */ + if (errno == ENODEV) + RTE_LOG(DEBUG, EAL, "Interrupt handle %d not supported\n", + src->intr_handle.fd); + else + RTE_LOG(ERR, EAL, "Error adding fd %d " + "kevent, %s\n", + src->intr_handle.fd, + strerror(errno)); + ret = -errno; + goto fail; + } + } + rte_spinlock_unlock(&intr_lock); + + return ret; +fail: + /* clean up */ + if (src != NULL) { + TAILQ_REMOVE(&(src->callbacks), callback, next); + if (TAILQ_EMPTY(&(src->callbacks))) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } + free(callback); + rte_spinlock_unlock(&intr_lock); + return ret; +} + +int +rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active\n"); + return -ENODEV; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + struct kevent ke; + + ret = 0; + + /* remove it from the kqueue */ + memset(&ke, 0, sizeof(ke)); + ke.flags = EV_DELETE; /* mark for deletion from the queue */ + + if (intr_source_to_kevent(intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert to kevent\n"); + ret = -ENODEV; + goto out; + } + + /** + * remove intr file descriptor from wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + RTE_LOG(ERR, EAL, "Error removing fd %d kevent, %s\n", + src->intr_handle.fd, strerror(errno)); + /* removing non-existent even is an expected condition + * in some circumstances (e.g. oneshot events). + */ + } + + /*walk through the callbacks and remove all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. 
*/ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } +out: + rte_spinlock_unlock(&intr_lock); + + return ret; +} + +int +rte_intr_enable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type) { + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_disable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type) { + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +static void +eal_intr_process_interrupts(struct kevent *events, int nfds) +{ + struct rte_intr_callback active_cb; + union rte_intr_read_buffer buf; + struct rte_intr_callback *cb; + struct rte_intr_source *src; + bool call = false; + int n, bytes_read; + + for (n = 0; n < nfds; n++) { + int event_fd = events[n].ident; + + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == event_fd) + break; + if (src == NULL) { + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. */ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_ALARM: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(event_fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + event_fd, + strerror(errno)); + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", event_fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + + /* we done with that interrupt source, release it. 
*/ + src->active = 0; + rte_spinlock_unlock(&intr_lock); + } +} + +static void * +eal_intr_thread_main(void *arg __rte_unused) +{ + struct kevent events[MAX_INTR_EVENTS]; + int nfds; + + /* host thread, never break out */ + for (;;) { + /* do not change anything, just wait */ + nfds = kevent(kq, NULL, 0, events, MAX_INTR_EVENTS, NULL); + + /* kevent fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "kevent returns with fail\n"); + break; + } + /* kevent timeout, will never happen here */ + else if (nfds == 0) + continue; + + /* kevent has at least one fd ready to read */ + eal_intr_process_interrupts(events, nfds); + } + close(kq); + kq = -1; + return NULL; +} + +int +rte_eal_intr_init(void) +{ + int ret = 0; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + kq = kqueue(); + if (kq < 0) { + RTE_LOG(ERR, EAL, "Cannot create kqueue instance\n"); + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = -ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } + + return ret; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, + int epfd, int op, unsigned int vec, void *data) +{ + RTE_SET_USED(intr_handle); + RTE_SET_USED(epfd); + RTE_SET_USED(op); + RTE_SET_USED(vec); + RTE_SET_USED(data); + + return -ENOTSUP; +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + RTE_SET_USED(intr_handle); + RTE_SET_USED(nb_efd); + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 0; +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 1; +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 0; +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + RTE_SET_USED(epfd); + RTE_SET_USED(events); + RTE_SET_USED(maxevents); + RTE_SET_USED(timeout); + + return -ENOTSUP; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, struct rte_epoll_event *event) +{ + RTE_SET_USED(epfd); + RTE_SET_USED(op); + RTE_SET_USED(fd); + RTE_SET_USED(event); + + return -ENOTSUP; +} + +int +rte_intr_tls_epfd(void) +{ + return -ENOTSUP; +} + +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_lcore.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_lcore.c new file mode 100644 index 00000000..d9ef4bc9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_lcore.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +/* No topology information available on FreeBSD including NUMA info */ +unsigned +eal_cpu_core_id(__rte_unused unsigned lcore_id) +{ + return 0; +} + +static int +eal_get_ncpus(void) +{ + static int ncpu = -1; + int mib[2] = {CTL_HW, HW_NCPU}; + size_t len = sizeof(ncpu); + + if (ncpu < 0) { + sysctl(mib, 2, &ncpu, &len, NULL, 0); + RTE_LOG(INFO, EAL, "Sysctl reports %d cpus\n", ncpu); + } + return ncpu; +} + +unsigned 
+eal_cpu_socket_id(__rte_unused unsigned cpu_id) +{ + return 0; +} + +/* Check if a cpu is present by the presence of the + * cpu information for it. + */ +int +eal_cpu_detected(unsigned lcore_id) +{ + const unsigned ncpus = eal_get_ncpus(); + return lcore_id < ncpus; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_memalloc.c new file mode 100644 index 00000000..f7f07abd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_memalloc.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include + +#include +#include + +#include "eal_memalloc.h" + +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms __rte_unused, + int __rte_unused n_segs, size_t __rte_unused page_sz, + int __rte_unused socket, bool __rte_unused exact) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +struct rte_memseg * +eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return NULL; +} + +int +eal_memalloc_free_seg(struct rte_memseg *ms __rte_unused) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused, + int n_segs __rte_unused) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_sync_with_primary(void) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_init(void) +{ + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_memory.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_memory.c new file mode 100644 index 00000000..16d2bc7c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_memory.c @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" + +#define EAL_PAGE_SIZE (sysconf(_SC_PAGESIZE)) + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + /* XXX not implemented. This function is only used by + * rte_mempool_virt2iova() when hugepages are disabled. 
*/ + (void)virtaddr; + return RTE_BAD_IOVA; +} +rte_iova_t +rte_mem_virt2iova(const void *virtaddr) +{ + return rte_mem_virt2phy(virtaddr); +} + +int +rte_eal_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + uint64_t total_mem = 0; + void *addr; + unsigned int i, j, seg_idx = 0; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* for debug purposes, hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + struct rte_memseg *ms; + uint64_t page_sz; + int n_segs, cur_seg; + + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + page_sz = RTE_PGSIZE_4K; + n_segs = internal_config.memory / page_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + addr = mmap(NULL, internal_config.memory, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } + msl->base_va = addr; + msl->page_sz = page_sz; + msl->socket_id = 0; + + /* populate memsegs. each memseg is 1 page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->len = page_sz; + ms->socket_id = 0; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, page_sz); + } + return 0; + } + + /* map all hugepages and sort them */ + for (i = 0; i < internal_config.num_hugepage_sizes; i ++){ + struct hugepage_info *hpi; + rte_iova_t prev_end = 0; + int prev_ms_idx = -1; + uint64_t page_sz, mem_needed; + unsigned int n_pages, max_pages; + + hpi = &internal_config.hugepage_info[i]; + page_sz = hpi->hugepage_sz; + max_pages = hpi->num_pages[0]; + mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem, + page_sz); + + n_pages = RTE_MIN(mem_needed / page_sz, max_pages); + + for (j = 0; j < n_pages; j++) { + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + struct rte_memseg *seg; + int msl_idx, ms_idx; + rte_iova_t physaddr; + int error; + size_t sysctl_size = sizeof(physaddr); + char physaddr_str[64]; + bool is_adjacent; + + /* first, check if this segment is IOVA-adjacent to + * the previous one. + */ + snprintf(physaddr_str, sizeof(physaddr_str), + "hw.contigmem.physaddr.%d", j); + error = sysctlbyname(physaddr_str, &physaddr, + &sysctl_size, NULL, 0); + if (error < 0) { + RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u " + "from %s\n", j, hpi->hugedir); + return -1; + } + + is_adjacent = prev_end != 0 && physaddr == prev_end; + prev_end = physaddr + hpi->hugepage_sz; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; + msl_idx++) { + bool empty, need_hole; + msl = &mcfg->memsegs[msl_idx]; + arr = &msl->memseg_arr; + + if (msl->page_sz != page_sz) + continue; + + empty = arr->count == 0; + + /* we need a hole if this isn't an empty memseg + * list, and if previous segment was not + * adjacent to current one. + */ + need_hole = !empty && !is_adjacent; + + /* we need 1, plus hole if not adjacent */ + ms_idx = rte_fbarray_find_next_n_free(arr, + 0, 1 + (need_hole ? 1 : 0)); + + /* memseg list is full? 
*/ + if (ms_idx < 0) + continue; + + if (need_hole && prev_ms_idx == ms_idx - 1) + ms_idx++; + prev_ms_idx = ms_idx; + + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), + RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); + return -1; + } + arr = &msl->memseg_arr; + seg = rte_fbarray_get(arr, ms_idx); + + addr = RTE_PTR_ADD(msl->base_va, + (size_t)msl->page_sz * ms_idx); + + /* address is already mapped in memseg list, so using + * MAP_FIXED here is safe. + */ + addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_FIXED, + hpi->lock_descriptor, + j * EAL_PAGE_SIZE); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", + j, hpi->hugedir); + return -1; + } + + seg->addr = addr; + seg->iova = physaddr; + seg->hugepage_sz = page_sz; + seg->len = page_sz; + seg->nchannel = mcfg->nchannel; + seg->nrank = mcfg->nrank; + seg->socket_id = 0; + + rte_fbarray_set_used(arr, ms_idx); + + RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%" + PRIx64", len %zu\n", + seg_idx++, addr, physaddr, page_sz); + + total_mem += seg->len; + } + if (total_mem >= internal_config.memory) + break; + } + if (total_mem < internal_config.memory) { + RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, " + "requested: %" PRIu64 "M " + "available: %" PRIu64 "M\n", + internal_config.memory >> 20, total_mem >> 20); + return -1; + } + return 0; +} + +struct attach_walk_args { + int fd_hugepage; + int seg_idx; +}; +static int +attach_segment(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct attach_walk_args *wa = arg; + void *addr; + + addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, wa->fd_hugepage, + wa->seg_idx * EAL_PAGE_SIZE); + if (addr == MAP_FAILED || addr != ms->addr) + return -1; + wa->seg_idx++; + + return 0; +} + +int +rte_eal_hugepage_attach(void) +{ + const struct hugepage_info *hpi; + int fd_hugepage = -1; + unsigned int i; + + hpi = &internal_config.hugepage_info[0]; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + const struct hugepage_info *cur_hpi = &hpi[i]; + struct attach_walk_args wa; + + memset(&wa, 0, sizeof(wa)); + + /* Obtain a file descriptor for contiguous memory */ + fd_hugepage = open(cur_hpi->hugedir, O_RDWR); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + cur_hpi->hugedir); + goto error; + } + wa.fd_hugepage = fd_hugepage; + wa.seg_idx = 0; + + /* Map the contiguous memory into each memory segment */ + if (rte_memseg_walk(attach_segment, &wa) < 0) { + RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", + wa.seg_idx, cur_hpi->hugedir); + goto error; + } + + close(fd_hugepage); + fd_hugepage = -1; + } + + /* hugepage_info is no longer required */ + return 0; + +error: + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} + +int +rte_eal_using_phys_addrs(void) +{ + return 0; +} + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +#define 
MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + + return 0; +} + + +static int +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int hpi_idx, msl_idx = 0; + struct rte_memseg_list *msl; + uint64_t max_mem, total_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* FreeBSD has an issue where core dump will dump the entire memory + * contents, including anonymous zero-page memory. Therefore, while we + * will be limiting total amount of memory to RTE_MAX_MEM_MB, we will + * also be further limiting total memory amount to whatever memory is + * available to us through contigmem driver (plus spacing blocks). + * + * so, at each stage, we will be checking how much memory we are + * preallocating, and adjust all the values accordingly. + */ + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + total_mem = 0; + + /* create memseg lists */ + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t max_type_mem, total_type_mem = 0; + uint64_t avail_mem; + int type_msl_idx, max_segs, avail_segs, total_segs = 0; + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* no NUMA support on FreeBSD */ + + /* check if we've already exceeded total memory amount */ + if (total_mem >= max_mem) + break; + + /* first, calculate theoretical limits according to config */ + max_type_mem = RTE_MIN(max_mem - total_mem, + (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + + /* now, limit all of that to whatever will actually be + * available to us, because without dynamic allocation support, + * all of that extra memory will be sitting there being useless + * and slowing down core dumps in case of a crash. + * + * we need (N*2)-1 segments because we cannot guarantee that + * each segment will be IOVA-contiguous with the previous one, + * so we will allocate more and put spaces inbetween segments + * that are non-contiguous. 
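To make the worst case described in that comment concrete, here is a tiny illustrative sketch (not taken from the patched file; the helper name is invented): with N contigmem buffers of which no two are IOVA-adjacent, each segment needs its own slot plus a one-slot hole before the next one, i.e. N + (N - 1) = 2N - 1 fbarray entries.

/* Worst-case fbarray slots for n buffers when no two buffers are
 * IOVA-contiguous: one slot per segment plus one hole slot between
 * each pair of consecutive segments.
 * e.g. 4 buffers -> seg,hole,seg,hole,seg,hole,seg -> 7 slots. */
static inline unsigned int
worst_case_memseg_slots(unsigned int n_buffers)
{
	return n_buffers == 0 ? 0 : n_buffers * 2 - 1;
}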
+ */ + avail_segs = (hpi->num_pages[0] * 2) - 1; + avail_mem = avail_segs * hugepage_sz; + + max_type_mem = RTE_MIN(avail_mem, max_type_mem); + max_segs = RTE_MIN(avail_segs, max_segs); + + type_msl_idx = 0; + while (total_type_mem < max_type_mem && + total_segs < max_segs) { + uint64_t cur_max_mem, cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx++]; + + cur_max_mem = max_type_mem - total_type_mem; + + cur_mem = get_mem_amount(hugepage_sz, + cur_max_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + 0, type_msl_idx)) + return -1; + + total_segs += msl->memseg_arr.len; + total_type_mem = total_segs * hugepage_sz; + type_msl_idx++; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + return -1; + } + } + total_mem += total_type_mem; + } + return 0; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + return rte_eal_process_type() == RTE_PROC_PRIMARY ? + memseg_primary_init() : + memseg_secondary_init(); +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_thread.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_thread.c new file mode 100644 index 00000000..309b5872 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_thread.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. 
+ */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +/* set affinity for current thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__attribute__((noreturn)) void * +eal_thread_loop(__attribute__((unused)) void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%p;cpuset=[%s%s])\n", + lcore_id, thread_id, cpuset, ret == 0 ? 
"" : "..."); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + long lwpid; + thr_self(&lwpid); + return (int)lwpid; +} + +int rte_thread_setname(pthread_t id, const char *name) +{ + /* this BSD function returns no error */ + pthread_set_name_np(id, name); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_timer.c b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_timer.c new file mode 100644 index 00000000..beff755a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/eal_timer.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +#ifdef RTE_LIBEAL_USE_HPET +#warning HPET is not supported in FreeBSD +#endif + +enum timer_source eal_timer_source = EAL_TIMER_TSC; + +uint64_t +get_tsc_freq(void) +{ + size_t sz; + int tmp; + uint64_t tsc_hz; + + sz = sizeof(tmp); + tmp = 0; + + if (sysctlbyname("kern.timecounter.smp_tsc", &tmp, &sz, NULL, 0)) + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + else if (tmp != 1) + RTE_LOG(WARNING, EAL, "TSC is not safe to use in SMP mode\n"); + + tmp = 0; + + if (sysctlbyname("kern.timecounter.invariant_tsc", &tmp, &sz, NULL, 0)) + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + else if (tmp != 1) + RTE_LOG(WARNING, EAL, "TSC is not invariant\n"); + + sz = sizeof(tsc_hz); + if (sysctlbyname("machdep.tsc_freq", &tsc_hz, &sz, NULL, 0)) { + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + return 0; + } + + return tsc_hz; +} + +int +rte_eal_timer_init(void) +{ + set_tsc_freq(); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/meson.build b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/meson.build new file mode 100644 index 00000000..3945b529 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/bsdapp/eal/meson.build @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +env_objs = [] +env_headers = [] +env_sources = files('eal_alarm.c', + 'eal_cpuflags.c', + 'eal_debug.c', + 'eal_hugepage_info.c', + 'eal_interrupts.c', + 'eal_lcore.c', + 'eal_memalloc.c', + 'eal_thread.c', + 'eal_timer.c', + 'eal.c', + 'eal_memory.c', + 'eal_dev.c' +) + +deps += ['kvargs'] diff --git a/src/spdk/dpdk/lib/librte_eal/common/Makefile b/src/spdk/dpdk/lib/librte_eal/common/Makefile new file mode 100644 index 00000000..cca68826 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/Makefile @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include 
$(RTE_SDK)/mk/rte.vars.mk + +INC := rte_branch_prediction.h rte_common.h +INC += rte_debug.h rte_eal.h rte_eal_interrupts.h +INC += rte_errno.h rte_launch.h rte_lcore.h +INC += rte_log.h rte_memory.h rte_memzone.h +INC += rte_per_lcore.h rte_random.h +INC += rte_tailq.h rte_interrupts.h rte_alarm.h +INC += rte_string_fns.h rte_version.h +INC += rte_eal_memconfig.h rte_malloc_heap.h +INC += rte_hexdump.h rte_devargs.h rte_bus.h rte_dev.h rte_class.h +INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h +INC += rte_malloc.h rte_keepalive.h rte_time.h +INC += rte_service.h rte_service_component.h +INC += rte_bitmap.h rte_vfio.h rte_hypervisor.h rte_test.h +INC += rte_reciprocal.h rte_fbarray.h rte_uuid.h + +GENERIC_INC := rte_atomic.h rte_byteorder.h rte_cycles.h rte_prefetch.h +GENERIC_INC += rte_spinlock.h rte_memcpy.h rte_cpuflags.h rte_rwlock.h +GENERIC_INC += rte_vect.h rte_pause.h rte_io.h + +# defined in mk/arch/$(RTE_ARCH)/rte.vars.mk +ARCH_DIR ?= $(RTE_ARCH) +ARCH_INC := $(sort $(notdir $(wildcard $(RTE_SDK)/lib/librte_eal/common/include/arch/$(ARCH_DIR)/*.h))) + +SYMLINK-$(CONFIG_RTE_LIBRTE_EAL)-include := $(addprefix include/,$(INC)) +SYMLINK-$(CONFIG_RTE_LIBRTE_EAL)-include += \ + $(addprefix include/arch/$(ARCH_DIR)/,$(ARCH_INC)) +SYMLINK-$(CONFIG_RTE_LIBRTE_EAL)-include/generic := \ + $(addprefix include/generic/,$(GENERIC_INC)) + +include $(RTE_SDK)/mk/rte.install.mk diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/arm/meson.build b/src/spdk/dpdk/lib/librte_eal/common/arch/arm/meson.build new file mode 100644 index 00000000..c6bd9227 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/arm/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation. + +eal_common_arch_sources = files('rte_cpuflags.c', + 'rte_cycles.c') diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_cpuflags.c new file mode 100644 index 00000000..caf3dc83 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_cpuflags.c @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) Cavium, Inc. 2015. + * Copyright(c) 2015 RehiveTech. All rights reserved. 
+ */ + +#include "rte_cpuflags.h" + +#include +#include +#include +#include +#include + +#ifndef AT_HWCAP +#define AT_HWCAP 16 +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#ifndef AT_PLATFORM +#define AT_PLATFORM 15 +#endif + +enum cpu_register_t { + REG_NONE = 0, + REG_HWCAP, + REG_HWCAP2, + REG_PLATFORM, + REG_MAX +}; + +typedef uint32_t hwcap_registers_t[REG_MAX]; + +/** + * Struct to hold a processor feature entry + */ +struct feature_entry { + uint32_t reg; + uint32_t bit; +#define CPU_FLAG_NAME_MAX_LEN 64 + char name[CPU_FLAG_NAME_MAX_LEN]; +}; + +#define FEAT_DEF(name, reg, bit) \ + [RTE_CPUFLAG_##name] = {reg, bit, #name}, + +#ifdef RTE_ARCH_ARMv7 +#define PLATFORM_STR "v7l" +typedef Elf32_auxv_t _Elfx_auxv_t; + +const struct feature_entry rte_cpu_feature_table[] = { + FEAT_DEF(SWP, REG_HWCAP, 0) + FEAT_DEF(HALF, REG_HWCAP, 1) + FEAT_DEF(THUMB, REG_HWCAP, 2) + FEAT_DEF(A26BIT, REG_HWCAP, 3) + FEAT_DEF(FAST_MULT, REG_HWCAP, 4) + FEAT_DEF(FPA, REG_HWCAP, 5) + FEAT_DEF(VFP, REG_HWCAP, 6) + FEAT_DEF(EDSP, REG_HWCAP, 7) + FEAT_DEF(JAVA, REG_HWCAP, 8) + FEAT_DEF(IWMMXT, REG_HWCAP, 9) + FEAT_DEF(CRUNCH, REG_HWCAP, 10) + FEAT_DEF(THUMBEE, REG_HWCAP, 11) + FEAT_DEF(NEON, REG_HWCAP, 12) + FEAT_DEF(VFPv3, REG_HWCAP, 13) + FEAT_DEF(VFPv3D16, REG_HWCAP, 14) + FEAT_DEF(TLS, REG_HWCAP, 15) + FEAT_DEF(VFPv4, REG_HWCAP, 16) + FEAT_DEF(IDIVA, REG_HWCAP, 17) + FEAT_DEF(IDIVT, REG_HWCAP, 18) + FEAT_DEF(VFPD32, REG_HWCAP, 19) + FEAT_DEF(LPAE, REG_HWCAP, 20) + FEAT_DEF(EVTSTRM, REG_HWCAP, 21) + FEAT_DEF(AES, REG_HWCAP2, 0) + FEAT_DEF(PMULL, REG_HWCAP2, 1) + FEAT_DEF(SHA1, REG_HWCAP2, 2) + FEAT_DEF(SHA2, REG_HWCAP2, 3) + FEAT_DEF(CRC32, REG_HWCAP2, 4) + FEAT_DEF(V7L, REG_PLATFORM, 0) +}; + +#elif defined RTE_ARCH_ARM64 +#define PLATFORM_STR "aarch64" +typedef Elf64_auxv_t _Elfx_auxv_t; + +const struct feature_entry rte_cpu_feature_table[] = { + FEAT_DEF(FP, REG_HWCAP, 0) + FEAT_DEF(NEON, REG_HWCAP, 1) + FEAT_DEF(EVTSTRM, REG_HWCAP, 2) + FEAT_DEF(AES, REG_HWCAP, 3) + FEAT_DEF(PMULL, REG_HWCAP, 4) + FEAT_DEF(SHA1, REG_HWCAP, 5) + FEAT_DEF(SHA2, REG_HWCAP, 6) + FEAT_DEF(CRC32, REG_HWCAP, 7) + FEAT_DEF(ATOMICS, REG_HWCAP, 8) + FEAT_DEF(AARCH64, REG_PLATFORM, 1) +}; +#endif /* RTE_ARCH */ + +/* + * Read AUXV software register and get cpu features for ARM + */ +static void +rte_cpu_get_features(hwcap_registers_t out) +{ + out[REG_HWCAP] = rte_cpu_getauxval(AT_HWCAP); + out[REG_HWCAP2] = rte_cpu_getauxval(AT_HWCAP2); + if (!rte_cpu_strcmp_auxval(AT_PLATFORM, PLATFORM_STR)) + out[REG_PLATFORM] = 0x0001; +} + +/* + * Checks if a particular flag is available on current machine. 
+ */ +int +rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature) +{ + const struct feature_entry *feat; + hwcap_registers_t regs = {0}; + + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return -ENOENT; + + feat = &rte_cpu_feature_table[feature]; + if (feat->reg == REG_NONE) + return -EFAULT; + + rte_cpu_get_features(regs); + return (regs[feat->reg] >> feat->bit) & 1; +} + +const char * +rte_cpu_get_flag_name(enum rte_cpu_flag_t feature) +{ + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return NULL; + return rte_cpu_feature_table[feature].name; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_cycles.c b/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_cycles.c new file mode 100644 index 00000000..3500d523 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_cycles.c @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#include "eal_private.h" + +uint64_t +get_tsc_freq_arch(void) +{ +#if defined RTE_ARCH_ARM64 && !defined RTE_ARM_EAL_RDTSC_USE_PMU + uint64_t freq; + asm volatile("mrs %0, cntfrq_el0" : "=r" (freq)); + return freq; +#else + return 0; +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_hypervisor.c new file mode 100644 index 00000000..08a1c97d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/arm/rte_hypervisor.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +enum rte_hypervisor +rte_hypervisor_get(void) +{ + return RTE_HYPERVISOR_UNKNOWN; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c new file mode 100644 index 00000000..e7a82452 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c @@ -0,0 +1,137 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2014. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "rte_cpuflags.h" + +#include +#include +#include +#include + +/* Symbolic values for the entries in the auxiliary table */ +#define AT_HWCAP 16 +#define AT_HWCAP2 26 + +/* software based registers */ +enum cpu_register_t { + REG_NONE = 0, + REG_HWCAP, + REG_HWCAP2, + REG_MAX +}; + +typedef uint32_t hwcap_registers_t[REG_MAX]; + +struct feature_entry { + uint32_t reg; + uint32_t bit; +#define CPU_FLAG_NAME_MAX_LEN 64 + char name[CPU_FLAG_NAME_MAX_LEN]; +}; + +#define FEAT_DEF(name, reg, bit) \ + [RTE_CPUFLAG_##name] = {reg, bit, #name}, + +const struct feature_entry rte_cpu_feature_table[] = { + FEAT_DEF(PPC_LE, REG_HWCAP, 0) + FEAT_DEF(TRUE_LE, REG_HWCAP, 1) + FEAT_DEF(PSERIES_PERFMON_COMPAT, REG_HWCAP, 6) + FEAT_DEF(VSX, REG_HWCAP, 7) + FEAT_DEF(ARCH_2_06, REG_HWCAP, 8) + FEAT_DEF(POWER6_EXT, REG_HWCAP, 9) + FEAT_DEF(DFP, REG_HWCAP, 10) + FEAT_DEF(PA6T, REG_HWCAP, 11) + FEAT_DEF(ARCH_2_05, REG_HWCAP, 12) + FEAT_DEF(ICACHE_SNOOP, REG_HWCAP, 13) + FEAT_DEF(SMT, REG_HWCAP, 14) + FEAT_DEF(BOOKE, REG_HWCAP, 15) + FEAT_DEF(CELLBE, REG_HWCAP, 16) + FEAT_DEF(POWER5_PLUS, REG_HWCAP, 17) + FEAT_DEF(POWER5, REG_HWCAP, 18) + FEAT_DEF(POWER4, REG_HWCAP, 19) + FEAT_DEF(NOTB, REG_HWCAP, 20) + FEAT_DEF(EFP_DOUBLE, REG_HWCAP, 21) + FEAT_DEF(EFP_SINGLE, REG_HWCAP, 22) + FEAT_DEF(SPE, REG_HWCAP, 23) + FEAT_DEF(UNIFIED_CACHE, REG_HWCAP, 24) + FEAT_DEF(4xxMAC, REG_HWCAP, 25) + FEAT_DEF(MMU, REG_HWCAP, 26) + FEAT_DEF(FPU, REG_HWCAP, 27) + FEAT_DEF(ALTIVEC, REG_HWCAP, 28) + FEAT_DEF(PPC601, REG_HWCAP, 29) + FEAT_DEF(PPC64, REG_HWCAP, 30) + FEAT_DEF(PPC32, REG_HWCAP, 31) + FEAT_DEF(TAR, REG_HWCAP2, 26) + FEAT_DEF(LSEL, REG_HWCAP2, 27) + FEAT_DEF(EBB, REG_HWCAP2, 28) + FEAT_DEF(DSCR, REG_HWCAP2, 29) + FEAT_DEF(HTM, REG_HWCAP2, 30) + FEAT_DEF(ARCH_2_07, REG_HWCAP2, 31) +}; + +/* + * Read AUXV software register and get cpu features for Power + */ +static void +rte_cpu_get_features(hwcap_registers_t out) +{ + out[REG_HWCAP] = rte_cpu_getauxval(AT_HWCAP); + out[REG_HWCAP2] = rte_cpu_getauxval(AT_HWCAP2); +} + +/* + * Checks if a particular flag is available on current machine. 
+ */ +int +rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature) +{ + const struct feature_entry *feat; + hwcap_registers_t regs = {0}; + + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return -ENOENT; + + feat = &rte_cpu_feature_table[feature]; + if (feat->reg == REG_NONE) + return -EFAULT; + + rte_cpu_get_features(regs); + return (regs[feat->reg] >> feat->bit) & 1; +} + +const char * +rte_cpu_get_flag_name(enum rte_cpu_flag_t feature) +{ + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return NULL; + return rte_cpu_feature_table[feature].name; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_cycles.c b/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_cycles.c new file mode 100644 index 00000000..851fd025 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_cycles.c @@ -0,0 +1,7 @@ +#include "eal_private.h" + +uint64_t +get_tsc_freq_arch(void) +{ + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c new file mode 100644 index 00000000..08a1c97d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +enum rte_hypervisor +rte_hypervisor_get(void) +{ + return RTE_HYPERVISOR_UNKNOWN; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/x86/meson.build b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/meson.build new file mode 100644 index 00000000..4e0f7790 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +eal_common_arch_sources = files('rte_spinlock.c', 'rte_cpuflags.c', + 'rte_cycles.c') diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cpuflags.c new file mode 100644 index 00000000..053612d6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cpuflags.c @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#include "rte_cpuflags.h" + +#include +#include +#include + +#include "rte_cpuid.h" + +/** + * Struct to hold a processor feature entry + */ +struct feature_entry { + uint32_t leaf; /**< cpuid leaf */ + uint32_t subleaf; /**< cpuid subleaf */ + uint32_t reg; /**< cpuid register */ + uint32_t bit; /**< cpuid register bit */ +#define CPU_FLAG_NAME_MAX_LEN 64 + char name[CPU_FLAG_NAME_MAX_LEN]; /**< String for printing */ +}; + +#define FEAT_DEF(name, leaf, subleaf, reg, bit) \ + [RTE_CPUFLAG_##name] = {leaf, subleaf, reg, bit, #name }, + +const struct feature_entry rte_cpu_feature_table[] = { + FEAT_DEF(SSE3, 0x00000001, 0, RTE_REG_ECX, 0) + FEAT_DEF(PCLMULQDQ, 0x00000001, 0, RTE_REG_ECX, 1) + FEAT_DEF(DTES64, 0x00000001, 0, RTE_REG_ECX, 2) + FEAT_DEF(MONITOR, 0x00000001, 0, RTE_REG_ECX, 3) + FEAT_DEF(DS_CPL, 0x00000001, 0, RTE_REG_ECX, 4) + FEAT_DEF(VMX, 0x00000001, 0, RTE_REG_ECX, 5) + FEAT_DEF(SMX, 0x00000001, 0, RTE_REG_ECX, 6) + FEAT_DEF(EIST, 0x00000001, 0, RTE_REG_ECX, 7) + FEAT_DEF(TM2, 0x00000001, 0, RTE_REG_ECX, 8) + FEAT_DEF(SSSE3, 0x00000001, 0, RTE_REG_ECX, 9) + FEAT_DEF(CNXT_ID, 0x00000001, 0, RTE_REG_ECX, 10) + FEAT_DEF(FMA, 0x00000001, 0, RTE_REG_ECX, 12) + FEAT_DEF(CMPXCHG16B, 0x00000001, 0, RTE_REG_ECX, 13) + FEAT_DEF(XTPR, 0x00000001, 0, RTE_REG_ECX, 14) + FEAT_DEF(PDCM, 0x00000001, 0, RTE_REG_ECX, 
15) + FEAT_DEF(PCID, 0x00000001, 0, RTE_REG_ECX, 17) + FEAT_DEF(DCA, 0x00000001, 0, RTE_REG_ECX, 18) + FEAT_DEF(SSE4_1, 0x00000001, 0, RTE_REG_ECX, 19) + FEAT_DEF(SSE4_2, 0x00000001, 0, RTE_REG_ECX, 20) + FEAT_DEF(X2APIC, 0x00000001, 0, RTE_REG_ECX, 21) + FEAT_DEF(MOVBE, 0x00000001, 0, RTE_REG_ECX, 22) + FEAT_DEF(POPCNT, 0x00000001, 0, RTE_REG_ECX, 23) + FEAT_DEF(TSC_DEADLINE, 0x00000001, 0, RTE_REG_ECX, 24) + FEAT_DEF(AES, 0x00000001, 0, RTE_REG_ECX, 25) + FEAT_DEF(XSAVE, 0x00000001, 0, RTE_REG_ECX, 26) + FEAT_DEF(OSXSAVE, 0x00000001, 0, RTE_REG_ECX, 27) + FEAT_DEF(AVX, 0x00000001, 0, RTE_REG_ECX, 28) + FEAT_DEF(F16C, 0x00000001, 0, RTE_REG_ECX, 29) + FEAT_DEF(RDRAND, 0x00000001, 0, RTE_REG_ECX, 30) + FEAT_DEF(HYPERVISOR, 0x00000001, 0, RTE_REG_ECX, 31) + + FEAT_DEF(FPU, 0x00000001, 0, RTE_REG_EDX, 0) + FEAT_DEF(VME, 0x00000001, 0, RTE_REG_EDX, 1) + FEAT_DEF(DE, 0x00000001, 0, RTE_REG_EDX, 2) + FEAT_DEF(PSE, 0x00000001, 0, RTE_REG_EDX, 3) + FEAT_DEF(TSC, 0x00000001, 0, RTE_REG_EDX, 4) + FEAT_DEF(MSR, 0x00000001, 0, RTE_REG_EDX, 5) + FEAT_DEF(PAE, 0x00000001, 0, RTE_REG_EDX, 6) + FEAT_DEF(MCE, 0x00000001, 0, RTE_REG_EDX, 7) + FEAT_DEF(CX8, 0x00000001, 0, RTE_REG_EDX, 8) + FEAT_DEF(APIC, 0x00000001, 0, RTE_REG_EDX, 9) + FEAT_DEF(SEP, 0x00000001, 0, RTE_REG_EDX, 11) + FEAT_DEF(MTRR, 0x00000001, 0, RTE_REG_EDX, 12) + FEAT_DEF(PGE, 0x00000001, 0, RTE_REG_EDX, 13) + FEAT_DEF(MCA, 0x00000001, 0, RTE_REG_EDX, 14) + FEAT_DEF(CMOV, 0x00000001, 0, RTE_REG_EDX, 15) + FEAT_DEF(PAT, 0x00000001, 0, RTE_REG_EDX, 16) + FEAT_DEF(PSE36, 0x00000001, 0, RTE_REG_EDX, 17) + FEAT_DEF(PSN, 0x00000001, 0, RTE_REG_EDX, 18) + FEAT_DEF(CLFSH, 0x00000001, 0, RTE_REG_EDX, 19) + FEAT_DEF(DS, 0x00000001, 0, RTE_REG_EDX, 21) + FEAT_DEF(ACPI, 0x00000001, 0, RTE_REG_EDX, 22) + FEAT_DEF(MMX, 0x00000001, 0, RTE_REG_EDX, 23) + FEAT_DEF(FXSR, 0x00000001, 0, RTE_REG_EDX, 24) + FEAT_DEF(SSE, 0x00000001, 0, RTE_REG_EDX, 25) + FEAT_DEF(SSE2, 0x00000001, 0, RTE_REG_EDX, 26) + FEAT_DEF(SS, 0x00000001, 0, RTE_REG_EDX, 27) + FEAT_DEF(HTT, 0x00000001, 0, RTE_REG_EDX, 28) + FEAT_DEF(TM, 0x00000001, 0, RTE_REG_EDX, 29) + FEAT_DEF(PBE, 0x00000001, 0, RTE_REG_EDX, 31) + + FEAT_DEF(DIGTEMP, 0x00000006, 0, RTE_REG_EAX, 0) + FEAT_DEF(TRBOBST, 0x00000006, 0, RTE_REG_EAX, 1) + FEAT_DEF(ARAT, 0x00000006, 0, RTE_REG_EAX, 2) + FEAT_DEF(PLN, 0x00000006, 0, RTE_REG_EAX, 4) + FEAT_DEF(ECMD, 0x00000006, 0, RTE_REG_EAX, 5) + FEAT_DEF(PTM, 0x00000006, 0, RTE_REG_EAX, 6) + + FEAT_DEF(MPERF_APERF_MSR, 0x00000006, 0, RTE_REG_ECX, 0) + FEAT_DEF(ACNT2, 0x00000006, 0, RTE_REG_ECX, 1) + FEAT_DEF(ENERGY_EFF, 0x00000006, 0, RTE_REG_ECX, 3) + + FEAT_DEF(FSGSBASE, 0x00000007, 0, RTE_REG_EBX, 0) + FEAT_DEF(BMI1, 0x00000007, 0, RTE_REG_EBX, 2) + FEAT_DEF(HLE, 0x00000007, 0, RTE_REG_EBX, 4) + FEAT_DEF(AVX2, 0x00000007, 0, RTE_REG_EBX, 5) + FEAT_DEF(SMEP, 0x00000007, 0, RTE_REG_EBX, 6) + FEAT_DEF(BMI2, 0x00000007, 0, RTE_REG_EBX, 7) + FEAT_DEF(ERMS, 0x00000007, 0, RTE_REG_EBX, 8) + FEAT_DEF(INVPCID, 0x00000007, 0, RTE_REG_EBX, 10) + FEAT_DEF(RTM, 0x00000007, 0, RTE_REG_EBX, 11) + FEAT_DEF(AVX512F, 0x00000007, 0, RTE_REG_EBX, 16) + + FEAT_DEF(LAHF_SAHF, 0x80000001, 0, RTE_REG_ECX, 0) + FEAT_DEF(LZCNT, 0x80000001, 0, RTE_REG_ECX, 4) + + FEAT_DEF(SYSCALL, 0x80000001, 0, RTE_REG_EDX, 11) + FEAT_DEF(XD, 0x80000001, 0, RTE_REG_EDX, 20) + FEAT_DEF(1GB_PG, 0x80000001, 0, RTE_REG_EDX, 26) + FEAT_DEF(RDTSCP, 0x80000001, 0, RTE_REG_EDX, 27) + FEAT_DEF(EM64T, 0x80000001, 0, RTE_REG_EDX, 29) + + FEAT_DEF(INVTSC, 0x80000007, 0, RTE_REG_EDX, 8) +}; + +int 
+rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature) +{ + const struct feature_entry *feat; + cpuid_registers_t regs; + unsigned int maxleaf; + + if (feature >= RTE_CPUFLAG_NUMFLAGS) + /* Flag does not match anything in the feature tables */ + return -ENOENT; + + feat = &rte_cpu_feature_table[feature]; + + if (!feat->leaf) + /* This entry in the table wasn't filled out! */ + return -EFAULT; + + maxleaf = __get_cpuid_max(feat->leaf & 0x80000000, NULL); + + if (maxleaf < feat->leaf) + return 0; + + __cpuid_count(feat->leaf, feat->subleaf, + regs[RTE_REG_EAX], regs[RTE_REG_EBX], + regs[RTE_REG_ECX], regs[RTE_REG_EDX]); + + /* check if the feature is enabled */ + return (regs[feat->reg] >> feat->bit) & 1; +} + +const char * +rte_cpu_get_flag_name(enum rte_cpu_flag_t feature) +{ + if (feature >= RTE_CPUFLAG_NUMFLAGS) + return NULL; + return rte_cpu_feature_table[feature].name; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cpuid.h b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cpuid.h new file mode 100644 index 00000000..b773ad93 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cpuid.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef RTE_CPUID_H +#define RTE_CPUID_H + +#include + +enum cpu_register_t { + RTE_REG_EAX = 0, + RTE_REG_EBX, + RTE_REG_ECX, + RTE_REG_EDX, +}; + +typedef uint32_t cpuid_registers_t[4]; + +#endif /* RTE_CPUID_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cycles.c b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cycles.c new file mode 100644 index 00000000..23c67d24 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_cycles.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include + +#include + +#include "eal_private.h" + +static unsigned int +rte_cpu_get_model(uint32_t fam_mod_step) +{ + uint32_t family, model, ext_model; + + family = (fam_mod_step >> 8) & 0xf; + model = (fam_mod_step >> 4) & 0xf; + + if (family == 6 || family == 15) { + ext_model = (fam_mod_step >> 16) & 0xf; + model += (ext_model << 4); + } + + return model; +} + +static int32_t +rdmsr(int msr, uint64_t *val) +{ +#ifdef RTE_EXEC_ENV_LINUXAPP + int fd; + int ret; + + fd = open("/dev/cpu/0/msr", O_RDONLY); + if (fd < 0) + return fd; + + ret = pread(fd, val, sizeof(uint64_t), msr); + + close(fd); + + return ret; +#else + RTE_SET_USED(msr); + RTE_SET_USED(val); + + return -1; +#endif +} + +static uint32_t +check_model_wsm_nhm(uint8_t model) +{ + switch (model) { + /* Westmere */ + case 0x25: + case 0x2C: + case 0x2F: + /* Nehalem */ + case 0x1E: + case 0x1F: + case 0x1A: + case 0x2E: + return 1; + } + + return 0; +} + +static uint32_t +check_model_gdm_dnv(uint8_t model) +{ + switch (model) { + /* Goldmont */ + case 0x5C: + /* Denverton */ + case 0x5F: + return 1; + } + + return 0; +} + +uint64_t +get_tsc_freq_arch(void) +{ + uint64_t tsc_hz = 0; + uint32_t a, b, c, d, maxleaf; + uint8_t mult, model; + int32_t ret; + + /* + * Time Stamp Counter and Nominal Core Crystal Clock + * Information Leaf + */ + maxleaf = __get_cpuid_max(0, NULL); + + if (maxleaf >= 0x15) { + __cpuid(0x15, a, b, c, d); + + /* EBX : TSC/Crystal ratio, ECX : Crystal Hz */ + if (b && c) + return c * (b / a); + } + + __cpuid(0x1, a, b, c, d); + model = rte_cpu_get_model(a); + + if (check_model_wsm_nhm(model)) + mult = 133; + else if ((c & bit_AVX) || check_model_gdm_dnv(model)) + mult = 100; + else + 
return 0; + + ret = rdmsr(0xCE, &tsc_hz); + if (ret < 0) + return 0; + + return ((tsc_hz >> 8) & 0xff) * mult * 1E6; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_hypervisor.c new file mode 100644 index 00000000..c38cfc09 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_hypervisor.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +#include +#include + +#include "rte_cpuflags.h" +#include "rte_cpuid.h" + +/* See http://lwn.net/Articles/301888/ */ +#define HYPERVISOR_INFO_LEAF 0x40000000 + +enum rte_hypervisor +rte_hypervisor_get(void) +{ + cpuid_registers_t regs; + int reg; + char name[13]; + + if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_HYPERVISOR)) + return RTE_HYPERVISOR_NONE; + + __cpuid(HYPERVISOR_INFO_LEAF, + regs[RTE_REG_EAX], regs[RTE_REG_EBX], + regs[RTE_REG_ECX], regs[RTE_REG_EDX]); + for (reg = 1; reg < 4; reg++) + memcpy(name + (reg - 1) * 4, ®s[reg], 4); + name[12] = '\0'; + + if (strcmp("KVMKVMKVM", name) == 0) + return RTE_HYPERVISOR_KVM; + if (strcmp("Microsoft Hv", name) == 0) + return RTE_HYPERVISOR_HYPERV; + if (strcmp("VMwareVMware", name) == 0) + return RTE_HYPERVISOR_VMWARE; + return RTE_HYPERVISOR_UNKNOWN; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_memcpy.c b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_memcpy.c new file mode 100644 index 00000000..648c8f68 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_memcpy.c @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include +#include +#include + +void *(*rte_memcpy_ptr)(void *dst, const void *src, size_t n) = NULL; + +RTE_INIT(rte_memcpy_init) +{ +#ifdef CC_SUPPORT_AVX512F + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) { + rte_memcpy_ptr = rte_memcpy_avx512f; + RTE_LOG(DEBUG, EAL, "AVX512 memcpy is using!\n"); + return; + } +#endif +#ifdef CC_SUPPORT_AVX2 + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) { + rte_memcpy_ptr = rte_memcpy_avx2; + RTE_LOG(DEBUG, EAL, "AVX2 memcpy is using!\n"); + return; + } +#endif + rte_memcpy_ptr = rte_memcpy_sse; + RTE_LOG(DEBUG, EAL, "Default SSE/AVX memcpy is using!\n"); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_spinlock.c b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_spinlock.c new file mode 100644 index 00000000..34890ea8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/arch/x86/rte_spinlock.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include + +#include "rte_cpuflags.h" + +uint8_t rte_rtm_supported; /* cache the flag to avoid the overhead + of the rte_cpu_get_flag_enabled function */ + +RTE_INIT(rte_rtm_init) +{ + rte_rtm_supported = rte_cpu_get_flag_enabled(RTE_CPUFLAG_RTM); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c new file mode 100644 index 00000000..0943851c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c @@ -0,0 +1,244 @@ +/*- + * BSD LICENSE + * + * Copyright 2016 NXP. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of NXP nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include + +#include "eal_private.h" + +struct rte_bus_list rte_bus_list = + TAILQ_HEAD_INITIALIZER(rte_bus_list); + +void +rte_bus_register(struct rte_bus *bus) +{ + RTE_VERIFY(bus); + RTE_VERIFY(bus->name && strlen(bus->name)); + /* A bus should mandatorily have the scan implemented */ + RTE_VERIFY(bus->scan); + RTE_VERIFY(bus->probe); + RTE_VERIFY(bus->find_device); + /* Buses supporting driver plug also require unplug. */ + RTE_VERIFY(!bus->plug || bus->unplug); + + TAILQ_INSERT_TAIL(&rte_bus_list, bus, next); + RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name); +} + +void +rte_bus_unregister(struct rte_bus *bus) +{ + TAILQ_REMOVE(&rte_bus_list, bus, next); + RTE_LOG(DEBUG, EAL, "Unregistered [%s] bus.\n", bus->name); +} + +/* Scan all the buses for registered devices */ +int +rte_bus_scan(void) +{ + int ret; + struct rte_bus *bus = NULL; + + TAILQ_FOREACH(bus, &rte_bus_list, next) { + ret = bus->scan(); + if (ret) + RTE_LOG(ERR, EAL, "Scan for (%s) bus failed.\n", + bus->name); + } + + return 0; +} + +/* Probe all devices of all buses */ +int +rte_bus_probe(void) +{ + int ret; + struct rte_bus *bus, *vbus = NULL; + + TAILQ_FOREACH(bus, &rte_bus_list, next) { + if (!strcmp(bus->name, "vdev")) { + vbus = bus; + continue; + } + + ret = bus->probe(); + if (ret) + RTE_LOG(ERR, EAL, "Bus (%s) probe failed.\n", + bus->name); + } + + if (vbus) { + ret = vbus->probe(); + if (ret) + RTE_LOG(ERR, EAL, "Bus (%s) probe failed.\n", + vbus->name); + } + + return 0; +} + +/* Dump information of a single bus */ +static int +bus_dump_one(FILE *f, struct rte_bus *bus) +{ + int ret; + + /* For now, dump only the bus name */ + ret = fprintf(f, " %s\n", bus->name); + + /* Error in case of inability in writing to stream */ + if (ret < 0) + return ret; + + return 0; +} + +void +rte_bus_dump(FILE *f) +{ + int ret; + struct rte_bus *bus; + + TAILQ_FOREACH(bus, &rte_bus_list, next) { + ret = bus_dump_one(f, bus); + if (ret) { + RTE_LOG(ERR, EAL, "Unable to write to stream (%d)\n", + ret); + break; + } + } +} + +struct rte_bus * +rte_bus_find(const struct rte_bus *start, rte_bus_cmp_t cmp, + const void *data) +{ + struct rte_bus *bus; + + if (start != NULL) + bus = TAILQ_NEXT(start, next); + else + bus = TAILQ_FIRST(&rte_bus_list); + while (bus != NULL) { + if 
(cmp(bus, data) == 0) + break; + bus = TAILQ_NEXT(bus, next); + } + return bus; +} + +static int +cmp_rte_device(const struct rte_device *dev1, const void *_dev2) +{ + const struct rte_device *dev2 = _dev2; + + return dev1 != dev2; +} + +static int +bus_find_device(const struct rte_bus *bus, const void *_dev) +{ + struct rte_device *dev; + + dev = bus->find_device(NULL, cmp_rte_device, _dev); + return dev == NULL; +} + +struct rte_bus * +rte_bus_find_by_device(const struct rte_device *dev) +{ + return rte_bus_find(NULL, bus_find_device, (const void *)dev); +} + +static int +cmp_bus_name(const struct rte_bus *bus, const void *_name) +{ + const char *name = _name; + + return strcmp(bus->name, name); +} + +struct rte_bus * +rte_bus_find_by_name(const char *busname) +{ + return rte_bus_find(NULL, cmp_bus_name, (const void *)busname); +} + +static int +bus_can_parse(const struct rte_bus *bus, const void *_name) +{ + const char *name = _name; + + return !(bus->parse && bus->parse(name, NULL) == 0); +} + +struct rte_bus * +rte_bus_find_by_device_name(const char *str) +{ + char name[RTE_DEV_NAME_MAX_LEN]; + char *c; + + strlcpy(name, str, sizeof(name)); + c = strchr(name, ','); + if (c != NULL) + c[0] = '\0'; + return rte_bus_find(NULL, bus_can_parse, name); +} + + +/* + * Get iommu class of devices on the bus. + */ +enum rte_iova_mode +rte_bus_get_iommu_class(void) +{ + int mode = RTE_IOVA_DC; + struct rte_bus *bus; + + TAILQ_FOREACH(bus, &rte_bus_list, next) { + + if (bus->get_iommu_class) + mode |= bus->get_iommu_class(); + } + + if (mode != RTE_IOVA_VA) { + /* Use default IOVA mode */ + mode = RTE_IOVA_PA; + } + return mode; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c new file mode 100644 index 00000000..404a9065 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Gaëtan Rivet + */ + +#include +#include +#include + +#include +#include + +struct rte_class_list rte_class_list = + TAILQ_HEAD_INITIALIZER(rte_class_list); + +__rte_experimental void +rte_class_register(struct rte_class *class) +{ + RTE_VERIFY(class); + RTE_VERIFY(class->name && strlen(class->name)); + + TAILQ_INSERT_TAIL(&rte_class_list, class, next); + RTE_LOG(DEBUG, EAL, "Registered [%s] device class.\n", class->name); +} + +__rte_experimental void +rte_class_unregister(struct rte_class *class) +{ + TAILQ_REMOVE(&rte_class_list, class, next); + RTE_LOG(DEBUG, EAL, "Unregistered [%s] device class.\n", class->name); +} + +__rte_experimental +struct rte_class * +rte_class_find(const struct rte_class *start, rte_class_cmp_t cmp, + const void *data) +{ + struct rte_class *cls; + + if (start != NULL) + cls = TAILQ_NEXT(start, next); + else + cls = TAILQ_FIRST(&rte_class_list); + while (cls != NULL) { + if (cmp(cls, data) == 0) + break; + cls = TAILQ_NEXT(cls, next); + } + return cls; +} + +static int +cmp_class_name(const struct rte_class *class, const void *_name) +{ + const char *name = _name; + + return strcmp(class->name, name); +} + +__rte_experimental +struct rte_class * +rte_class_find_by_name(const char *name) +{ + return rte_class_find(NULL, cmp_class_name, (const void *)name); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c new file mode 100644 index 00000000..3a055f7c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c @@ -0,0 +1,50 
@@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include + +#include +#include + +/** + * Checks if the machine is adequate for running the binary. If it is not, the + * program exits with status 1. + */ +void +rte_cpu_check_supported(void) +{ + if (!rte_cpu_is_supported()) + exit(1); +} + +int +rte_cpu_is_supported(void) +{ + /* This is generated at compile-time by the build system */ + static const enum rte_cpu_flag_t compile_time_flags[] = { + RTE_COMPILE_TIME_CPUFLAGS + }; + unsigned count = RTE_DIM(compile_time_flags), i; + int ret; + + for (i = 0; i < count; i++) { + ret = rte_cpu_get_flag_enabled(compile_time_flags[i]); + + if (ret < 0) { + fprintf(stderr, + "ERROR: CPU feature flag lookup failed with error %d\n", + ret); + return 0; + } + if (!ret) { + fprintf(stderr, + "ERROR: This system does not support \"%s\".\n" + "Please check that RTE_MACHINE is set correctly.\n", + rte_cpu_get_flag_name(compile_time_flags[i])); + return 0; + } + } + + return 1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c new file mode 100644 index 00000000..678dbcac --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c @@ -0,0 +1,566 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +/** + * The device event callback description. + * + * It contains callback address to be registered by user application, + * the pointer to the parameters for callback, and the device name. + */ +struct dev_event_callback { + TAILQ_ENTRY(dev_event_callback) next; /**< Callbacks list */ + rte_dev_event_cb_fn cb_fn; /**< Callback address */ + void *cb_arg; /**< Callback parameter */ + char *dev_name; /**< Callback device name, NULL is for all device */ + uint32_t active; /**< Callback is executing */ +}; + +/** @internal Structure to keep track of registered callbacks */ +TAILQ_HEAD(dev_event_cb_list, dev_event_callback); + +/* The device event callback list for all registered callbacks. 
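A minimal registration sketch from an application's point of view (illustrative; the callback and wrapper names are invented, and this API is experimental in this DPDK version, so callers build with the experimental-API flag enabled):

#include <stdio.h>
#include <rte_dev.h>

/* Invoked by the EAL device-event monitor for every add/remove event. */
static void
on_dev_event(const char *device_name, enum rte_dev_event_type event,
	     void *cb_arg)
{
	(void)cb_arg;
	printf("device %s %s\n", device_name,
		event == RTE_DEV_EVENT_ADD ? "added" : "removed");
}

static int
watch_all_devices(void)
{
	/* A NULL device name registers the callback for all devices. */
	return rte_dev_event_callback_register(NULL, on_dev_event, NULL);
}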
*/ +static struct dev_event_cb_list dev_event_cbs; + +/* spinlock for device callbacks */ +static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER; + +struct dev_next_ctx { + struct rte_dev_iterator *it; + const char *bus_str; + const char *cls_str; +}; + +#define CTX(it, bus_str, cls_str) \ + (&(const struct dev_next_ctx){ \ + .it = it, \ + .bus_str = bus_str, \ + .cls_str = cls_str, \ + }) + +#define ITCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->it) + +#define BUSCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->bus_str) + +#define CLSCTX(ptr) \ + (((struct dev_next_ctx *)(intptr_t)ptr)->cls_str) + +static int cmp_dev_name(const struct rte_device *dev, const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +int rte_eal_dev_attach(const char *name, const char *devargs) +{ + struct rte_bus *bus; + + if (name == NULL || devargs == NULL) { + RTE_LOG(ERR, EAL, "Invalid device or arguments provided\n"); + return -EINVAL; + } + + bus = rte_bus_find_by_device_name(name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Unable to find a bus for the device '%s'\n", + name); + return -EINVAL; + } + if (strcmp(bus->name, "pci") == 0 || strcmp(bus->name, "vdev") == 0) + return rte_eal_hotplug_add(bus->name, name, devargs); + + RTE_LOG(ERR, EAL, + "Device attach is only supported for PCI and vdev devices.\n"); + + return -ENOTSUP; +} + +int rte_eal_dev_detach(struct rte_device *dev) +{ + struct rte_bus *bus; + int ret; + + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Invalid device provided.\n"); + return -EINVAL; + } + + bus = rte_bus_find_by_device(dev); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n", + dev->name); + return -EINVAL; + } + + if (bus->unplug == NULL) { + RTE_LOG(ERR, EAL, "Bus function not supported\n"); + return -ENOTSUP; + } + + ret = bus->unplug(dev); + if (ret) + RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", + dev->name); + return ret; +} + +int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, + const char *devargs) +{ + struct rte_bus *bus; + struct rte_device *dev; + struct rte_devargs *da; + int ret; + + bus = rte_bus_find_by_name(busname); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); + return -ENOENT; + } + + if (bus->plug == NULL) { + RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", + bus->name); + return -ENOTSUP; + } + + da = calloc(1, sizeof(*da)); + if (da == NULL) + return -ENOMEM; + + ret = rte_devargs_parsef(da, "%s:%s,%s", + busname, devname, devargs); + if (ret) + goto err_devarg; + + ret = rte_devargs_insert(da); + if (ret) + goto err_devarg; + + ret = bus->scan(); + if (ret) + goto err_devarg; + + dev = bus->find_device(NULL, cmp_dev_name, devname); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find device (%s)\n", + devname); + ret = -ENODEV; + goto err_devarg; + } + + if (dev->driver != NULL) { + RTE_LOG(ERR, EAL, "Device is already plugged\n"); + return -EEXIST; + } + + ret = bus->plug(dev); + if (ret) { + RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n", + dev->name); + goto err_devarg; + } + return 0; + +err_devarg: + if (rte_devargs_remove(busname, devname)) { + free(da->args); + free(da); + } + return ret; +} + +int __rte_experimental +rte_eal_hotplug_remove(const char *busname, const char *devname) +{ + struct rte_bus *bus; + struct rte_device *dev; + int ret; + + bus = rte_bus_find_by_name(busname); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); + 
return -ENOENT; + } + + if (bus->unplug == NULL) { + RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", + bus->name); + return -ENOTSUP; + } + + dev = bus->find_device(NULL, cmp_dev_name, devname); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", devname); + return -EINVAL; + } + + if (dev->driver == NULL) { + RTE_LOG(ERR, EAL, "Device is already unplugged\n"); + return -ENOENT; + } + + ret = bus->unplug(dev); + if (ret) + RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", + dev->name); + rte_devargs_remove(busname, devname); + return ret; +} + +int __rte_experimental +rte_dev_event_callback_register(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg) +{ + struct dev_event_callback *event_cb; + int ret; + + if (!cb_fn) + return -EINVAL; + + rte_spinlock_lock(&dev_event_lock); + + if (TAILQ_EMPTY(&dev_event_cbs)) + TAILQ_INIT(&dev_event_cbs); + + TAILQ_FOREACH(event_cb, &dev_event_cbs, next) { + if (event_cb->cb_fn == cb_fn && event_cb->cb_arg == cb_arg) { + if (device_name == NULL && event_cb->dev_name == NULL) + break; + if (device_name == NULL || event_cb->dev_name == NULL) + continue; + if (!strcmp(event_cb->dev_name, device_name)) + break; + } + } + + /* create a new callback. */ + if (event_cb == NULL) { + event_cb = malloc(sizeof(struct dev_event_callback)); + if (event_cb != NULL) { + event_cb->cb_fn = cb_fn; + event_cb->cb_arg = cb_arg; + event_cb->active = 0; + if (!device_name) { + event_cb->dev_name = NULL; + } else { + event_cb->dev_name = strdup(device_name); + if (event_cb->dev_name == NULL) { + ret = -ENOMEM; + goto error; + } + } + TAILQ_INSERT_TAIL(&dev_event_cbs, event_cb, next); + } else { + RTE_LOG(ERR, EAL, + "Failed to allocate memory for device " + "event callback."); + ret = -ENOMEM; + goto error; + } + } else { + RTE_LOG(ERR, EAL, + "The callback is already exist, no need " + "to register again.\n"); + ret = -EEXIST; + } + + rte_spinlock_unlock(&dev_event_lock); + return 0; +error: + free(event_cb); + rte_spinlock_unlock(&dev_event_lock); + return ret; +} + +int __rte_experimental +rte_dev_event_callback_unregister(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg) +{ + int ret = 0; + struct dev_event_callback *event_cb, *next; + + if (!cb_fn) + return -EINVAL; + + rte_spinlock_lock(&dev_event_lock); + /*walk through the callbacks and remove all that match. */ + for (event_cb = TAILQ_FIRST(&dev_event_cbs); event_cb != NULL; + event_cb = next) { + + next = TAILQ_NEXT(event_cb, next); + + if (device_name != NULL && event_cb->dev_name != NULL) { + if (!strcmp(event_cb->dev_name, device_name)) { + if (event_cb->cb_fn != cb_fn || + (cb_arg != (void *)-1 && + event_cb->cb_arg != cb_arg)) + continue; + } + } else if (device_name != NULL) { + continue; + } + + /* + * if this callback is not executing right now, + * then remove it. 
+ */ + if (event_cb->active == 0) { + TAILQ_REMOVE(&dev_event_cbs, event_cb, next); + free(event_cb); + ret++; + } else { + continue; + } + } + rte_spinlock_unlock(&dev_event_lock); + return ret; +} + +void +dev_callback_process(char *device_name, enum rte_dev_event_type event) +{ + struct dev_event_callback *cb_lst; + + if (device_name == NULL) + return; + + rte_spinlock_lock(&dev_event_lock); + + TAILQ_FOREACH(cb_lst, &dev_event_cbs, next) { + if (cb_lst->dev_name) { + if (strcmp(cb_lst->dev_name, device_name)) + continue; + } + cb_lst->active = 1; + rte_spinlock_unlock(&dev_event_lock); + cb_lst->cb_fn(device_name, event, + cb_lst->cb_arg); + rte_spinlock_lock(&dev_event_lock); + cb_lst->active = 0; + } + rte_spinlock_unlock(&dev_event_lock); +} + +__rte_experimental +int +rte_dev_iterator_init(struct rte_dev_iterator *it, + const char *dev_str) +{ + struct rte_devargs devargs; + struct rte_class *cls = NULL; + struct rte_bus *bus = NULL; + + /* Having both bus_str and cls_str NULL is illegal, + * marking this iterator as invalid unless + * everything goes well. + */ + it->bus_str = NULL; + it->cls_str = NULL; + + devargs.data = dev_str; + if (rte_devargs_layers_parse(&devargs, dev_str)) + goto get_out; + + bus = devargs.bus; + cls = devargs.cls; + /* The string should have at least + * one layer specified. + */ + if (bus == NULL && cls == NULL) { + RTE_LOG(ERR, EAL, + "Either bus or class must be specified.\n"); + rte_errno = EINVAL; + goto get_out; + } + if (bus != NULL && bus->dev_iterate == NULL) { + RTE_LOG(ERR, EAL, "Bus %s not supported\n", bus->name); + rte_errno = ENOTSUP; + goto get_out; + } + if (cls != NULL && cls->dev_iterate == NULL) { + RTE_LOG(ERR, EAL, "Class %s not supported\n", cls->name); + rte_errno = ENOTSUP; + goto get_out; + } + it->bus_str = devargs.bus_str; + it->cls_str = devargs.cls_str; + it->dev_str = dev_str; + it->bus = bus; + it->cls = cls; + it->device = NULL; + it->class_device = NULL; +get_out: + return -rte_errno; +} + +static char * +dev_str_sane_copy(const char *str) +{ + size_t end; + char *copy; + + end = strcspn(str, ",/"); + if (str[end] == ',') { + copy = strdup(&str[end + 1]); + } else { + /* '/' or '\0' */ + copy = strdup(""); + } + if (copy == NULL) { + rte_errno = ENOMEM; + } else { + char *slash; + + slash = strchr(copy, '/'); + if (slash != NULL) + slash[0] = '\0'; + } + return copy; +} + +static int +class_next_dev_cmp(const struct rte_class *cls, + const void *ctx) +{ + struct rte_dev_iterator *it; + const char *cls_str = NULL; + void *dev; + + if (cls->dev_iterate == NULL) + return 1; + it = ITCTX(ctx); + cls_str = CLSCTX(ctx); + dev = it->class_device; + /* it->cls_str != NULL means a class + * was specified in the devstr. + */ + if (it->cls_str != NULL && cls != it->cls) + return 1; + /* If an error occurred previously, + * no need to test further. + */ + if (rte_errno != 0) + return -1; + dev = cls->dev_iterate(dev, cls_str, it); + it->class_device = dev; + return dev == NULL; +} + +static int +bus_next_dev_cmp(const struct rte_bus *bus, + const void *ctx) +{ + struct rte_device *dev = NULL; + struct rte_class *cls = NULL; + struct rte_dev_iterator *it; + const char *bus_str = NULL; + + if (bus->dev_iterate == NULL) + return 1; + it = ITCTX(ctx); + bus_str = BUSCTX(ctx); + dev = it->device; + /* it->bus_str != NULL means a bus + * was specified in the devstr. + */ + if (it->bus_str != NULL && bus != it->bus) + return 1; + /* If an error occurred previously, + * no need to test further. 
+ */ + if (rte_errno != 0) + return -1; + if (it->cls_str == NULL) { + dev = bus->dev_iterate(dev, bus_str, it); + goto end; + } + /* cls_str != NULL */ + if (dev == NULL) { +next_dev_on_bus: + dev = bus->dev_iterate(dev, bus_str, it); + it->device = dev; + } + if (dev == NULL) + return 1; + if (it->cls != NULL) + cls = TAILQ_PREV(it->cls, rte_class_list, next); + cls = rte_class_find(cls, class_next_dev_cmp, ctx); + if (cls != NULL) { + it->cls = cls; + goto end; + } + goto next_dev_on_bus; +end: + it->device = dev; + return dev == NULL; +} +__rte_experimental +struct rte_device * +rte_dev_iterator_next(struct rte_dev_iterator *it) +{ + struct rte_bus *bus = NULL; + int old_errno = rte_errno; + char *bus_str = NULL; + char *cls_str = NULL; + + rte_errno = 0; + if (it->bus_str == NULL && it->cls_str == NULL) { + /* Invalid iterator. */ + rte_errno = EINVAL; + return NULL; + } + if (it->bus != NULL) + bus = TAILQ_PREV(it->bus, rte_bus_list, next); + if (it->bus_str != NULL) { + bus_str = dev_str_sane_copy(it->bus_str); + if (bus_str == NULL) + goto out; + } + if (it->cls_str != NULL) { + cls_str = dev_str_sane_copy(it->cls_str); + if (cls_str == NULL) + goto out; + } + while ((bus = rte_bus_find(bus, bus_next_dev_cmp, + CTX(it, bus_str, cls_str)))) { + if (it->device != NULL) { + it->bus = bus; + goto out; + } + if (it->bus_str != NULL || + rte_errno != 0) + break; + } + if (rte_errno == 0) + rte_errno = old_errno; +out: + free(bus_str); + free(cls_str); + return it->device; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c new file mode 100644 index 00000000..c9e13e20 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2014 6WIND S.A. + */ + +/* This file manages the list of devices and their arguments, as given + * by the user at startup + * + * Code here should not call rte_log since the EAL environment + * may not be initialized. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "eal_private.h" + +/** user device double-linked queue type definition */ +TAILQ_HEAD(rte_devargs_list, rte_devargs); + +/** Global list of user devices */ +struct rte_devargs_list devargs_list = + TAILQ_HEAD_INITIALIZER(devargs_list); + +int +rte_eal_parse_devargs_str(const char *devargs_str, + char **drvname, char **drvargs) +{ + char *sep; + + if ((devargs_str) == NULL || (drvname) == NULL || (drvargs == NULL)) + return -1; + + *drvname = strdup(devargs_str); + if (*drvname == NULL) + return -1; + + /* set the first ',' to '\0' to split name and arguments */ + sep = strchr(*drvname, ','); + if (sep != NULL) { + sep[0] = '\0'; + *drvargs = strdup(sep + 1); + } else { + *drvargs = strdup(""); + } + + if (*drvargs == NULL) { + free(*drvname); + *drvname = NULL; + return -1; + } + return 0; +} + +static size_t +devargs_layer_count(const char *s) +{ + size_t i = s ? 
1 : 0; + + while (s != NULL && s[0] != '\0') { + i += s[0] == '/'; + s++; + } + return i; +} + +int +rte_devargs_layers_parse(struct rte_devargs *devargs, + const char *devstr) +{ + struct { + const char *key; + const char *str; + struct rte_kvargs *kvlist; + } layers[] = { + { "bus=", NULL, NULL, }, + { "class=", NULL, NULL, }, + { "driver=", NULL, NULL, }, + }; + struct rte_kvargs_pair *kv = NULL; + struct rte_class *cls = NULL; + struct rte_bus *bus = NULL; + const char *s = devstr; + size_t nblayer; + size_t i = 0; + int ret = 0; + + /* Split each sub-lists. */ + nblayer = devargs_layer_count(devstr); + if (nblayer > RTE_DIM(layers)) { + RTE_LOG(ERR, EAL, "Invalid format: too many layers (%zu)\n", + nblayer); + ret = -E2BIG; + goto get_out; + } + + /* If the devargs points the devstr + * as source data, then it should not allocate + * anything and keep referring only to it. + */ + if (devargs->data != devstr) { + devargs->data = strdup(devstr); + if (devargs->data == NULL) { + RTE_LOG(ERR, EAL, "OOM\n"); + ret = -ENOMEM; + goto get_out; + } + s = devargs->data; + } + + while (s != NULL) { + if (i >= RTE_DIM(layers)) { + RTE_LOG(ERR, EAL, "Unrecognized layer %s\n", s); + ret = -EINVAL; + goto get_out; + } + /* + * The last layer is free-form. + * The "driver" key is not required (but accepted). + */ + if (strncmp(layers[i].key, s, strlen(layers[i].key)) && + i != RTE_DIM(layers) - 1) + goto next_layer; + layers[i].str = s; + layers[i].kvlist = rte_kvargs_parse_delim(s, NULL, "/"); + if (layers[i].kvlist == NULL) { + RTE_LOG(ERR, EAL, "Could not parse %s\n", s); + ret = -EINVAL; + goto get_out; + } + s = strchr(s, '/'); + if (s != NULL) + s++; +next_layer: + i++; + } + + /* Parse each sub-list. */ + for (i = 0; i < RTE_DIM(layers); i++) { + if (layers[i].kvlist == NULL) + continue; + kv = &layers[i].kvlist->pairs[0]; + if (strcmp(kv->key, "bus") == 0) { + bus = rte_bus_find_by_name(kv->value); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Could not find bus \"%s\"\n", + kv->value); + ret = -EFAULT; + goto get_out; + } + } else if (strcmp(kv->key, "class") == 0) { + cls = rte_class_find_by_name(kv->value); + if (cls == NULL) { + RTE_LOG(ERR, EAL, "Could not find class \"%s\"\n", + kv->value); + ret = -EFAULT; + goto get_out; + } + } else if (strcmp(kv->key, "driver") == 0) { + /* Ignore */ + continue; + } + } + + /* Fill devargs fields. */ + devargs->bus_str = layers[0].str; + devargs->cls_str = layers[1].str; + devargs->drv_str = layers[2].str; + devargs->bus = bus; + devargs->cls = cls; + + /* If we own the data, clean up a bit + * the several layers string, to ease + * their parsing afterward. 
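+ * For illustration (hypothetical input): an owned copy of
+ * "bus=vdev/class=eth" becomes "bus=vdev\0class=eth", with bus_str and
+ * cls_str left pointing at the start of their respective segments.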
+ */ + if (devargs->data != devstr) { + char *s = (void *)(intptr_t)(devargs->data); + + while ((s = strchr(s, '/'))) { + *s = '\0'; + s++; + } + } + +get_out: + for (i = 0; i < RTE_DIM(layers); i++) { + if (layers[i].kvlist) + rte_kvargs_free(layers[i].kvlist); + } + if (ret != 0) + rte_errno = -ret; + return ret; +} + +static int +bus_name_cmp(const struct rte_bus *bus, const void *name) +{ + return strncmp(bus->name, name, strlen(bus->name)); +} + +__rte_experimental +int +rte_devargs_parse(struct rte_devargs *da, const char *dev) +{ + struct rte_bus *bus = NULL; + const char *devname; + const size_t maxlen = sizeof(da->name); + size_t i; + + if (da == NULL) + return -EINVAL; + + /* Retrieve eventual bus info */ + do { + devname = dev; + bus = rte_bus_find(bus, bus_name_cmp, dev); + if (bus == NULL) + break; + devname = dev + strlen(bus->name) + 1; + if (rte_bus_find_by_device_name(devname) == bus) + break; + } while (1); + /* Store device name */ + i = 0; + while (devname[i] != '\0' && devname[i] != ',') { + da->name[i] = devname[i]; + i++; + if (i == maxlen) { + RTE_LOG(WARNING, EAL, "Parsing \"%s\": device name should be shorter than %zu\n", + dev, maxlen); + da->name[i - 1] = '\0'; + return -EINVAL; + } + } + da->name[i] = '\0'; + if (bus == NULL) { + bus = rte_bus_find_by_device_name(da->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "failed to parse device \"%s\"\n", + da->name); + return -EFAULT; + } + } + da->bus = bus; + /* Parse eventual device arguments */ + if (devname[i] == ',') + da->args = strdup(&devname[i + 1]); + else + da->args = strdup(""); + if (da->args == NULL) { + RTE_LOG(ERR, EAL, "not enough memory to parse arguments\n"); + return -ENOMEM; + } + return 0; +} + +__rte_experimental +int +rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) 
+{ + va_list ap; + size_t len; + char *dev; + int rc; + + if (da == NULL) + return -EINVAL; + + va_start(ap, format); + len = vsnprintf(NULL, 0, format, ap); + va_end(ap); + + dev = calloc(1, len + 1); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "not enough memory to parse device\n"); + return -ENOMEM; + } + + va_start(ap, format); + vsnprintf(dev, len + 1, format, ap); + va_end(ap); + + rc = rte_devargs_parse(da, dev); + free(dev); + return rc; +} + +int __rte_experimental +rte_devargs_insert(struct rte_devargs *da) +{ + int ret; + + ret = rte_devargs_remove(da->bus->name, da->name); + if (ret < 0) + return ret; + TAILQ_INSERT_TAIL(&devargs_list, da, next); + return 0; +} + +/* store a whitelist parameter for later parsing */ +__rte_experimental +int +rte_devargs_add(enum rte_devtype devtype, const char *devargs_str) +{ + struct rte_devargs *devargs = NULL; + struct rte_bus *bus = NULL; + const char *dev = devargs_str; + + /* use calloc instead of rte_zmalloc as it's called early at init */ + devargs = calloc(1, sizeof(*devargs)); + if (devargs == NULL) + goto fail; + + if (rte_devargs_parse(devargs, dev)) + goto fail; + devargs->type = devtype; + bus = devargs->bus; + if (devargs->type == RTE_DEVTYPE_BLACKLISTED_PCI) + devargs->policy = RTE_DEV_BLACKLISTED; + if (bus->conf.scan_mode == RTE_BUS_SCAN_UNDEFINED) { + if (devargs->policy == RTE_DEV_WHITELISTED) + bus->conf.scan_mode = RTE_BUS_SCAN_WHITELIST; + else if (devargs->policy == RTE_DEV_BLACKLISTED) + bus->conf.scan_mode = RTE_BUS_SCAN_BLACKLIST; + } + TAILQ_INSERT_TAIL(&devargs_list, devargs, next); + return 0; + +fail: + if (devargs) { + free(devargs->args); + free(devargs); + } + + return -1; +} + +int __rte_experimental +rte_devargs_remove(const char *busname, const char *devname) +{ + struct rte_devargs *d; + void *tmp; + + TAILQ_FOREACH_SAFE(d, &devargs_list, next, tmp) { + if (strcmp(d->bus->name, busname) == 0 && + strcmp(d->name, devname) == 0) { + TAILQ_REMOVE(&devargs_list, d, next); + free(d->args); + free(d); + return 0; + } + } + return 1; +} + +/* count the number of devices of a specified type */ +__rte_experimental +unsigned int +rte_devargs_type_count(enum rte_devtype devtype) +{ + struct rte_devargs *devargs; + unsigned int count = 0; + + TAILQ_FOREACH(devargs, &devargs_list, next) { + if (devargs->type != devtype) + continue; + count++; + } + return count; +} + +/* dump the user devices on the console */ +__rte_experimental +void +rte_devargs_dump(FILE *f) +{ + struct rte_devargs *devargs; + + fprintf(f, "User device list:\n"); + TAILQ_FOREACH(devargs, &devargs_list, next) { + fprintf(f, " [%s]: %s %s\n", + (devargs->bus ? devargs->bus->name : "??"), + devargs->name, devargs->args); + } +} + +/* bus-aware rte_devargs iterator. 
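+ * Illustrative usage (not part of this patch; "pci" is an example bus name):
+ *
+ *   struct rte_devargs *da = NULL;
+ *
+ *   while ((da = rte_devargs_next("pci", da)) != NULL)
+ *           printf("%s: %s\n", da->name, da->args);
+ *
+ * Passing NULL as the bus name walks the whole devargs_list.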
*/ +__rte_experimental +struct rte_devargs * +rte_devargs_next(const char *busname, const struct rte_devargs *start) +{ + struct rte_devargs *da; + + if (start != NULL) + da = TAILQ_NEXT(start, next); + else + da = TAILQ_FIRST(&devargs_list); + while (da != NULL) { + if (busname == NULL || + (strcmp(busname, da->bus->name) == 0)) + return da; + da = TAILQ_NEXT(da, next); + } + return NULL; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c new file mode 100644 index 00000000..56b492f5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +RTE_DEFINE_PER_LCORE(int, _rte_errno); + +const char * +rte_strerror(int errnum) +{ + /* BSD puts a colon in the "unknown error" messages, Linux doesn't */ +#ifdef RTE_EXEC_ENV_BSDAPP + static const char *sep = ":"; +#else + static const char *sep = ""; +#endif +#define RETVAL_SZ 256 + static RTE_DEFINE_PER_LCORE(char[RETVAL_SZ], retval); + char *ret = RTE_PER_LCORE(retval); + + /* since some implementations of strerror_r throw an error + * themselves if errnum is too big, we handle that case here */ + if (errnum >= RTE_MAX_ERRNO) + snprintf(ret, RETVAL_SZ, "Unknown error%s %d", sep, errnum); + else + switch (errnum){ + case E_RTE_SECONDARY: + return "Invalid call in secondary process"; + case E_RTE_NO_CONFIG: + return "Missing rte_config structure"; + default: + if (strerror_r(errnum, ret, RETVAL_SZ) != 0) + snprintf(ret, RETVAL_SZ, "Unknown error%s %d", + sep, errnum); + } + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c new file mode 100644 index 00000000..ba6c4ae3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c @@ -0,0 +1,1243 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_private.h" + +#include "rte_fbarray.h" + +#define MASK_SHIFT 6ULL +#define MASK_ALIGN (1ULL << MASK_SHIFT) +#define MASK_LEN_TO_IDX(x) ((x) >> MASK_SHIFT) +#define MASK_LEN_TO_MOD(x) ((x) - RTE_ALIGN_FLOOR(x, MASK_ALIGN)) +#define MASK_GET_IDX(idx, mod) ((idx << MASK_SHIFT) + mod) + +/* + * This is a mask that is always stored at the end of array, to provide fast + * way of finding free/used spots without looping through each element. + */ + +struct used_mask { + unsigned int n_masks; + uint64_t data[]; +}; + +static size_t +calc_mask_size(unsigned int len) +{ + /* mask must be multiple of MASK_ALIGN, even though length of array + * itself may not be aligned on that boundary. 
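+ * Worked example (illustrative): with MASK_ALIGN == 64, a 100-element array
+ * is rounded up to 128 mask bits, i.e. MASK_LEN_TO_IDX(128) == 2 uint64_t
+ * words following the used_mask header.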
+ */ + len = RTE_ALIGN_CEIL(len, MASK_ALIGN); + return sizeof(struct used_mask) + + sizeof(uint64_t) * MASK_LEN_TO_IDX(len); +} + +static size_t +calc_data_size(size_t page_sz, unsigned int elt_sz, unsigned int len) +{ + size_t data_sz = elt_sz * len; + size_t msk_sz = calc_mask_size(len); + return RTE_ALIGN_CEIL(data_sz + msk_sz, page_sz); +} + +static struct used_mask * +get_used_mask(void *data, unsigned int elt_sz, unsigned int len) +{ + return (struct used_mask *) RTE_PTR_ADD(data, elt_sz * len); +} + +static int +resize_and_map(int fd, void *addr, size_t len) +{ + char path[PATH_MAX]; + void *map_addr; + + if (ftruncate(fd, len)) { + RTE_LOG(ERR, EAL, "Cannot truncate %s\n", path); + /* pass errno up the chain */ + rte_errno = errno; + return -1; + } + + map_addr = mmap(addr, len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (map_addr != addr) { + RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno)); + /* pass errno up the chain */ + rte_errno = errno; + return -1; + } + return 0; +} + +static int +find_next_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int msk_idx, lookahead_idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk, ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + ignore_msk = ~((1ULL << first_mod) - 1); + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. + */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-1ULL << last_mod); + + for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) { + uint64_t cur_msk, lookahead_msk; + unsigned int run_start, clz, left; + bool found = false; + /* + * The process of getting n consecutive bits for arbitrary n is + * a bit involved, but here it is in a nutshell: + * + * 1. let n be the number of consecutive bits we're looking for + * 2. check if n can fit in one mask, and if so, do n-1 + * rshift-ands to see if there is an appropriate run inside + * our current mask + * 2a. if we found a run, bail out early + * 2b. if we didn't find a run, proceed + * 3. invert the mask and count leading zeroes (that is, count + * how many consecutive set bits we had starting from the + * end of current mask) as k + * 3a. if k is 0, continue to next mask + * 3b. if k is not 0, we have a potential run + * 4. to satisfy our requirements, next mask must have n-k + * consecutive set bits right at the start, so we will do + * (n-k-1) rshift-ands and check if first bit is set. + * + * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until + * we either run out of masks, lose the run, or find what we + * were looking for. 
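+ * Worked example for step 2 (illustrative): for n == 3 the rshift-and loop
+ * effectively computes
+ *
+ *   tmp = msk & (msk >> 1) & (msk >> 2);
+ *
+ * so bit p of tmp is set iff bits p, p+1 and p+2 are all set in msk, and
+ * __builtin_ctzll(tmp) yields the start of the first run of three.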
+ */ + cur_msk = msk->data[msk_idx]; + left = n; + + /* if we're looking for free spaces, invert the mask */ + if (!used) + cur_msk = ~cur_msk; + + /* combine current ignore mask with last index ignore mask */ + if (msk_idx == last) + ignore_msk |= last_msk; + + /* if we have an ignore mask, ignore once */ + if (ignore_msk) { + cur_msk &= ignore_msk; + ignore_msk = 0; + } + + /* if n can fit in within a single mask, do a search */ + if (n <= MASK_ALIGN) { + uint64_t tmp_msk = cur_msk; + unsigned int s_idx; + for (s_idx = 0; s_idx < n - 1; s_idx++) + tmp_msk &= tmp_msk >> 1ULL; + /* we found what we were looking for */ + if (tmp_msk != 0) { + run_start = __builtin_ctzll(tmp_msk); + return MASK_GET_IDX(msk_idx, run_start); + } + } + + /* + * we didn't find our run within the mask, or n > MASK_ALIGN, + * so we're going for plan B. + */ + + /* count leading zeroes on inverted mask */ + if (~cur_msk == 0) + clz = sizeof(cur_msk) * 8; + else + clz = __builtin_clzll(~cur_msk); + + /* if there aren't any runs at the end either, just continue */ + if (clz == 0) + continue; + + /* we have a partial run at the end, so try looking ahead */ + run_start = MASK_ALIGN - clz; + left -= clz; + + for (lookahead_idx = msk_idx + 1; lookahead_idx < msk->n_masks; + lookahead_idx++) { + unsigned int s_idx, need; + lookahead_msk = msk->data[lookahead_idx]; + + /* if we're looking for free space, invert the mask */ + if (!used) + lookahead_msk = ~lookahead_msk; + + /* figure out how many consecutive bits we need here */ + need = RTE_MIN(left, MASK_ALIGN); + + for (s_idx = 0; s_idx < need - 1; s_idx++) + lookahead_msk &= lookahead_msk >> 1ULL; + + /* if first bit is not set, we've lost the run */ + if ((lookahead_msk & 1) == 0) { + /* + * we've scanned this far, so we know there are + * no runs in the space we've lookahead-scanned + * as well, so skip that on next iteration. + */ + ignore_msk = ~((1ULL << need) - 1); + msk_idx = lookahead_idx; + break; + } + + left -= need; + + /* check if we've found what we were looking for */ + if (left == 0) { + found = true; + break; + } + } + + /* we didn't find anything, so continue */ + if (!found) + continue; + + return MASK_GET_IDX(msk_idx, run_start); + } + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_next(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk, ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + ignore_msk = ~((1ULL << first_mod) - 1ULL); + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. 
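+ * For example (illustrative): with arr->len == 100 the last mask word only
+ * covers elements 64..99, so last_mod == 36 and last_msk keeps just the low
+ * 36 bits, hiding bit positions that lie past the end of the array.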
+ */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-(1ULL) << last_mod); + + for (idx = first; idx < msk->n_masks; idx++) { + uint64_t cur = msk->data[idx]; + int found; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + if (idx == last) + cur &= last_msk; + + /* ignore everything before start on first iteration */ + if (idx == first) + cur &= ignore_msk; + + /* check if we have any entries */ + if (cur == 0) + continue; + + /* + * find first set bit - that will correspond to whatever it is + * that we're looking for. + */ + found = __builtin_ctzll(cur); + return MASK_GET_IDX(idx, found); + } + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_contig(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk; + unsigned int need_len, result = 0; + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. + */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-(1ULL) << last_mod); + + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + for (idx = first; idx < msk->n_masks; idx++, result += need_len) { + uint64_t cur = msk->data[idx]; + unsigned int run_len; + + need_len = MASK_ALIGN; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* if this is last mask, ignore everything after last bit */ + if (idx == last) + cur &= last_msk; + + /* ignore everything before start on first iteration */ + if (idx == first) { + cur >>= first_mod; + /* at the start, we don't need the full mask len */ + need_len -= first_mod; + } + + /* we will be looking for zeroes, so invert the mask */ + cur = ~cur; + + /* if mask is zero, we have a complete run */ + if (cur == 0) + continue; + + /* + * see if current run ends before mask end. + */ + run_len = __builtin_ctzll(cur); + + /* add however many zeroes we've had in the last run and quit */ + if (run_len < need_len) { + result += run_len; + break; + } + } + return result; +} + +static int +find_prev_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int msk_idx, lookbehind_idx, first, first_mod; + uint64_t ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + /* we're going backwards, so mask must start from the top */ + ignore_msk = first_mod == MASK_ALIGN - 1 ? + -1ULL : /* prevent overflow */ + ~(-1ULL << (first_mod + 1)); + + /* go backwards, include zero */ + msk_idx = first; + do { + uint64_t cur_msk, lookbehind_msk; + unsigned int run_start, run_end, ctz, left; + bool found = false; + /* + * The process of getting n consecutive bits from the top for + * arbitrary n is a bit involved, but here it is in a nutshell: + * + * 1. let n be the number of consecutive bits we're looking for + * 2. check if n can fit in one mask, and if so, do n-1 + * lshift-ands to see if there is an appropriate run inside + * our current mask + * 2a. 
if we found a run, bail out early + * 2b. if we didn't find a run, proceed + * 3. invert the mask and count trailing zeroes (that is, count + * how many consecutive set bits we had starting from the + * start of current mask) as k + * 3a. if k is 0, continue to next mask + * 3b. if k is not 0, we have a potential run + * 4. to satisfy our requirements, next mask must have n-k + * consecutive set bits at the end, so we will do (n-k-1) + * lshift-ands and check if last bit is set. + * + * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until + * we either run out of masks, lose the run, or find what we + * were looking for. + */ + cur_msk = msk->data[msk_idx]; + left = n; + + /* if we're looking for free spaces, invert the mask */ + if (!used) + cur_msk = ~cur_msk; + + /* if we have an ignore mask, ignore once */ + if (ignore_msk) { + cur_msk &= ignore_msk; + ignore_msk = 0; + } + + /* if n can fit in within a single mask, do a search */ + if (n <= MASK_ALIGN) { + uint64_t tmp_msk = cur_msk; + unsigned int s_idx; + for (s_idx = 0; s_idx < n - 1; s_idx++) + tmp_msk &= tmp_msk << 1ULL; + /* we found what we were looking for */ + if (tmp_msk != 0) { + /* clz will give us offset from end of mask, and + * we only get the end of our run, not start, + * so adjust result to point to where start + * would have been. + */ + run_start = MASK_ALIGN - + __builtin_clzll(tmp_msk) - n; + return MASK_GET_IDX(msk_idx, run_start); + } + } + + /* + * we didn't find our run within the mask, or n > MASK_ALIGN, + * so we're going for plan B. + */ + + /* count trailing zeroes on inverted mask */ + if (~cur_msk == 0) + ctz = sizeof(cur_msk) * 8; + else + ctz = __builtin_ctzll(~cur_msk); + + /* if there aren't any runs at the start either, just + * continue + */ + if (ctz == 0) + continue; + + /* we have a partial run at the start, so try looking behind */ + run_end = MASK_GET_IDX(msk_idx, ctz); + left -= ctz; + + /* go backwards, include zero */ + lookbehind_idx = msk_idx - 1; + + /* we can't lookbehind as we've run out of masks, so stop */ + if (msk_idx == 0) + break; + + do { + const uint64_t last_bit = 1ULL << (MASK_ALIGN - 1); + unsigned int s_idx, need; + + lookbehind_msk = msk->data[lookbehind_idx]; + + /* if we're looking for free space, invert the mask */ + if (!used) + lookbehind_msk = ~lookbehind_msk; + + /* figure out how many consecutive bits we need here */ + need = RTE_MIN(left, MASK_ALIGN); + + for (s_idx = 0; s_idx < need - 1; s_idx++) + lookbehind_msk &= lookbehind_msk << 1ULL; + + /* if last bit is not set, we've lost the run */ + if ((lookbehind_msk & last_bit) == 0) { + /* + * we've scanned this far, so we know there are + * no runs in the space we've lookbehind-scanned + * as well, so skip that on next iteration. + */ + ignore_msk = -1ULL << need; + msk_idx = lookbehind_idx; + break; + } + + left -= need; + + /* check if we've found what we were looking for */ + if (left == 0) { + found = true; + break; + } + } while ((lookbehind_idx--) != 0); /* decrement after check to + * include zero + */ + + /* we didn't find anything, so continue */ + if (!found) + continue; + + /* we've found what we were looking for, but we only know where + * the run ended, so calculate start position. + */ + return run_end - n; + } while (msk_idx-- != 0); /* decrement after check to include zero */ + /* we didn't find anything */ + rte_errno = used ? 
ENOENT : ENOSPC; + return -1; +} + +static int +find_prev(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + uint64_t ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing clz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + /* we're going backwards, so mask must start from the top */ + ignore_msk = first_mod == MASK_ALIGN - 1 ? + -1ULL : /* prevent overflow */ + ~(-1ULL << (first_mod + 1)); + + /* go backwards, include zero */ + idx = first; + do { + uint64_t cur = msk->data[idx]; + int found; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* ignore everything before start on first iteration */ + if (idx == first) + cur &= ignore_msk; + + /* check if we have any entries */ + if (cur == 0) + continue; + + /* + * find last set bit - that will correspond to whatever it is + * that we're looking for. we're counting trailing zeroes, thus + * the value we get is counted from end of mask, so calculate + * position from start of mask. + */ + found = MASK_ALIGN - __builtin_clzll(cur) - 1; + + return MASK_GET_IDX(idx, found); + } while (idx-- != 0); /* decrement after check to include zero*/ + + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_rev_contig(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int need_len, result = 0; + + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + + /* go backwards, include zero */ + idx = first; + do { + uint64_t cur = msk->data[idx]; + unsigned int run_len; + + need_len = MASK_ALIGN; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* ignore everything after start on first iteration */ + if (idx == first) { + unsigned int end_len = MASK_ALIGN - first_mod - 1; + cur <<= end_len; + /* at the start, we don't need the full mask len */ + need_len -= end_len; + } + + /* we will be looking for zeroes, so invert the mask */ + cur = ~cur; + + /* if mask is zero, we have a complete run */ + if (cur == 0) + goto endloop; + + /* + * see where run ends, starting from the end. 
+ */ + run_len = __builtin_clzll(cur); + + /* add however many zeroes we've had in the last run and quit */ + if (run_len < need_len) { + result += run_len; + break; + } +endloop: + result += need_len; + } while (idx-- != 0); /* decrement after check to include zero */ + return result; +} + +static int +set_used(struct rte_fbarray *arr, unsigned int idx, bool used) +{ + struct used_mask *msk; + uint64_t msk_bit = 1ULL << MASK_LEN_TO_MOD(idx); + unsigned int msk_idx = MASK_LEN_TO_IDX(idx); + bool already_used; + int ret = -1; + + if (arr == NULL || idx >= arr->len) { + rte_errno = EINVAL; + return -1; + } + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + ret = 0; + + /* prevent array from changing under us */ + rte_rwlock_write_lock(&arr->rwlock); + + already_used = (msk->data[msk_idx] & msk_bit) != 0; + + /* nothing to be done */ + if (used == already_used) + goto out; + + if (used) { + msk->data[msk_idx] |= msk_bit; + arr->count++; + } else { + msk->data[msk_idx] &= ~msk_bit; + arr->count--; + } +out: + rte_rwlock_write_unlock(&arr->rwlock); + + return ret; +} + +static int +fully_validate(const char *name, unsigned int elt_sz, unsigned int len) +{ + if (name == NULL || elt_sz == 0 || len == 0 || len > INT_MAX) { + rte_errno = EINVAL; + return -1; + } + + if (strnlen(name, RTE_FBARRAY_NAME_LEN) == RTE_FBARRAY_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +int __rte_experimental +rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len, + unsigned int elt_sz) +{ + size_t page_sz, mmap_len; + char path[PATH_MAX]; + struct used_mask *msk; + void *data = NULL; + int fd = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + if (fully_validate(name, elt_sz, len)) + return -1; + + page_sz = sysconf(_SC_PAGESIZE); + if (page_sz == (size_t)-1) + goto fail; + + /* calculate our memory limits */ + mmap_len = calc_data_size(page_sz, elt_sz, len); + + data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0); + if (data == NULL) + goto fail; + + if (internal_config.no_shconf) { + /* remap virtual area as writable */ + void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (new_data == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n", + __func__, strerror(errno)); + goto fail; + } + } else { + eal_get_fbarray_path(path, sizeof(path), name); + + /* + * Each fbarray is unique to process namespace, i.e. the + * filename depends on process prefix. Try to take out a lock + * and see if we succeed. If we don't, someone else is using it + * already. + */ + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", + __func__, path, strerror(errno)); + rte_errno = errno; + goto fail; + } else if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", + __func__, path, strerror(errno)); + rte_errno = EBUSY; + goto fail; + } + + /* take out a non-exclusive lock, so that other processes could + * still attach to it, but no other process could reinitialize + * it. 
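+ * For reference, the flock() scheme as implemented in this file:
+ * rte_fbarray_init() probes with LOCK_EX to ensure sole ownership and then
+ * downgrades to LOCK_SH, rte_fbarray_attach() takes LOCK_SH, and
+ * rte_fbarray_destroy() re-checks with LOCK_EX before unlinking the file.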
+ */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } + + if (resize_and_map(fd, data, mmap_len)) + goto fail; + + /* we've mmap'ed the file, we can now close the fd */ + close(fd); + } + + /* initialize the data */ + memset(data, 0, mmap_len); + + /* populate data structure */ + strlcpy(arr->name, name, sizeof(arr->name)); + arr->data = data; + arr->len = len; + arr->elt_sz = elt_sz; + arr->count = 0; + + msk = get_used_mask(data, elt_sz, len); + msk->n_masks = MASK_LEN_TO_IDX(RTE_ALIGN_CEIL(len, MASK_ALIGN)); + + rte_rwlock_init(&arr->rwlock); + + return 0; +fail: + if (data) + munmap(data, mmap_len); + if (fd >= 0) + close(fd); + return -1; +} + +int __rte_experimental +rte_fbarray_attach(struct rte_fbarray *arr) +{ + size_t page_sz, mmap_len; + char path[PATH_MAX]; + void *data = NULL; + int fd = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize attach as two values we need (element + * size and array length) are constant for the duration of life of + * the array, so the parts we care about will not race. + */ + + if (fully_validate(arr->name, arr->elt_sz, arr->len)) + return -1; + + page_sz = sysconf(_SC_PAGESIZE); + if (page_sz == (size_t)-1) + goto fail; + + mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len); + + data = eal_get_virtual_area(arr->data, &mmap_len, page_sz, 0, 0); + if (data == NULL) + goto fail; + + eal_get_fbarray_path(path, sizeof(path), arr->name); + + fd = open(path, O_RDWR); + if (fd < 0) { + rte_errno = errno; + goto fail; + } + + /* lock the file, to let others know we're using it */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } + + if (resize_and_map(fd, data, mmap_len)) + goto fail; + + close(fd); + + /* we're done */ + + return 0; +fail: + if (data) + munmap(data, mmap_len); + if (fd >= 0) + close(fd); + return -1; +} + +int __rte_experimental +rte_fbarray_detach(struct rte_fbarray *arr) +{ + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize detach as two values we need (element + * size and total capacity) are constant for the duration of life of + * the array, so the parts we care about will not race. if the user is + * detaching while doing something else in the same process, we can't + * really do anything about it, things will blow up either way. + */ + + size_t page_sz = sysconf(_SC_PAGESIZE); + + if (page_sz == (size_t)-1) + return -1; + + /* this may already be unmapped (e.g. repeated call from previously + * failed destroy(), but this is on user, we can't (easily) know if this + * is still mapped. 
+ */ + munmap(arr->data, calc_data_size(page_sz, arr->elt_sz, arr->len)); + + return 0; +} + +int __rte_experimental +rte_fbarray_destroy(struct rte_fbarray *arr) +{ + int fd, ret; + char path[PATH_MAX]; + + ret = rte_fbarray_detach(arr); + if (ret) + return ret; + + /* with no shconf, there were never any files to begin with */ + if (internal_config.no_shconf) + return 0; + + /* try deleting the file */ + eal_get_fbarray_path(path, sizeof(path), arr->name); + + fd = open(path, O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open fbarray file: %s\n", + strerror(errno)); + return -1; + } + if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "Cannot destroy fbarray - another process is using it\n"); + rte_errno = EBUSY; + ret = -1; + } else { + ret = 0; + unlink(path); + memset(arr, 0, sizeof(*arr)); + } + close(fd); + + return ret; +} + +void * __rte_experimental +rte_fbarray_get(const struct rte_fbarray *arr, unsigned int idx) +{ + void *ret = NULL; + if (arr == NULL) { + rte_errno = EINVAL; + return NULL; + } + + if (idx >= arr->len) { + rte_errno = EINVAL; + return NULL; + } + + ret = RTE_PTR_ADD(arr->data, idx * arr->elt_sz); + + return ret; +} + +int __rte_experimental +rte_fbarray_set_used(struct rte_fbarray *arr, unsigned int idx) +{ + return set_used(arr, idx, true); +} + +int __rte_experimental +rte_fbarray_set_free(struct rte_fbarray *arr, unsigned int idx) +{ + return set_used(arr, idx, false); +} + +int __rte_experimental +rte_fbarray_is_used(struct rte_fbarray *arr, unsigned int idx) +{ + struct used_mask *msk; + int msk_idx; + uint64_t msk_bit; + int ret = -1; + + if (arr == NULL || idx >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + msk_idx = MASK_LEN_TO_IDX(idx); + msk_bit = 1ULL << MASK_LEN_TO_MOD(idx); + + ret = (msk->data[msk_idx] & msk_bit) != 0; + + rte_rwlock_read_unlock(&arr->rwlock); + + return ret; +} + +static int +fbarray_find(struct rte_fbarray *arr, unsigned int start, bool next, bool used) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (!used) { + if (arr->len == arr->count) { + rte_errno = ENOSPC; + goto out; + } + if (arr->count == 0) { + ret = start; + goto out; + } + } else { + if (arr->count == 0) { + rte_errno = ENOENT; + goto out; + } + if (arr->len == arr->count) { + ret = start; + goto out; + } + } + if (next) + ret = find_next(arr, start, used); + else + ret = find_prev(arr, start, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, true, false); +} + +int __rte_experimental +rte_fbarray_find_next_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, true, true); +} + +int __rte_experimental +rte_fbarray_find_prev_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, false, false); +} + +int __rte_experimental +rte_fbarray_find_prev_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, false, true); +} + +static int +fbarray_find_n(struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool next, bool used) +{ + int ret = -1; + + if 
(arr == NULL || start >= arr->len || n > arr->len || n == 0) { + rte_errno = EINVAL; + return -1; + } + if (next && (arr->len - start) < n) { + rte_errno = used ? ENOENT : ENOSPC; + return -1; + } + if (!next && start < (n - 1)) { + rte_errno = used ? ENOENT : ENOSPC; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (!used) { + if (arr->len == arr->count || arr->len - arr->count < n) { + rte_errno = ENOSPC; + goto out; + } + if (arr->count == 0) { + ret = next ? start : start - n + 1; + goto out; + } + } else { + if (arr->count < n) { + rte_errno = ENOENT; + goto out; + } + if (arr->count == arr->len) { + ret = next ? start : start - n + 1; + goto out; + } + } + + if (next) + ret = find_next_n(arr, start, n, used); + else + ret = find_prev_n(arr, start, n, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_next_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, true, false); +} + +int __rte_experimental +rte_fbarray_find_next_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, true, true); +} + +int __rte_experimental +rte_fbarray_find_prev_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, false, false); +} + +int __rte_experimental +rte_fbarray_find_prev_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, false, true); +} + +static int +fbarray_find_contig(struct rte_fbarray *arr, unsigned int start, bool next, + bool used) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (used) { + if (arr->count == 0) { + ret = 0; + goto out; + } + if (next && arr->count == arr->len) { + ret = arr->len - start; + goto out; + } + if (!next && arr->count == arr->len) { + ret = start + 1; + goto out; + } + } else { + if (arr->len == arr->count) { + ret = 0; + goto out; + } + if (next && arr->count == 0) { + ret = arr->len - start; + goto out; + } + if (!next && arr->count == 0) { + ret = start + 1; + goto out; + } + } + + if (next) + ret = find_contig(arr, start, used); + else + ret = find_rev_contig(arr, start, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_contig_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, true, false); +} + +int __rte_experimental +rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, true, true); +} + +int __rte_experimental +rte_fbarray_find_rev_contig_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, false, false); +} + +int __rte_experimental +rte_fbarray_find_rev_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, false, true); +} + +int __rte_experimental +rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt) +{ + void *end; + int ret = -1; + + /* + * no need to synchronize as it doesn't matter if underlying data + * changes - we're doing pointer arithmetic here. 
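+ * The index is plain pointer arithmetic, (elt - data) / elt_sz; e.g.
+ * (illustrative) with elt_sz == 64, an element 192 bytes past arr->data
+ * maps to index 3.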
+ */ + + if (arr == NULL || elt == NULL) { + rte_errno = EINVAL; + return -1; + } + end = RTE_PTR_ADD(arr->data, arr->elt_sz * arr->len); + if (elt < arr->data || elt >= end) { + rte_errno = EINVAL; + return -1; + } + + ret = RTE_PTR_DIFF(elt, arr->data) / arr->elt_sz; + + return ret; +} + +void __rte_experimental +rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f) +{ + struct used_mask *msk; + unsigned int i; + + if (arr == NULL || f == NULL) { + rte_errno = EINVAL; + return; + } + + if (fully_validate(arr->name, arr->elt_sz, arr->len)) { + fprintf(f, "Invalid file-backed array\n"); + goto out; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + fprintf(f, "File-backed array: %s\n", arr->name); + fprintf(f, "size: %i occupied: %i elt_sz: %i\n", + arr->len, arr->count, arr->elt_sz); + + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + + for (i = 0; i < msk->n_masks; i++) + fprintf(f, "msk idx %i: 0x%016" PRIx64 "\n", i, msk->data[i]); +out: + rte_rwlock_read_unlock(&arr->rwlock); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c new file mode 100644 index 00000000..9ca7c511 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include + +#define LINE_LEN 128 + +/**************************************************************************//** +* +* rte_hexdump - Dump out memory in a special hex dump format. +* +* DESCRIPTION +* Dump out the message buffer in a special hex dump output format with characters +* printed for each line of 16 hex values. +* +* RETURNS: N/A +* +* SEE ALSO: +*/ + +void +rte_hexdump(FILE *f, const char * title, const void * buf, unsigned int len) +{ + unsigned int i, out, ofs; + const unsigned char *data = buf; + char line[LINE_LEN]; /* space needed 8+16*3+3+16 == 75 */ + + fprintf(f, "%s at [%p], len=%u\n", (title)? title : " Dump data", data, len); + ofs = 0; + while (ofs < len) { + /* format the line in the buffer, then use printf to output to screen */ + out = snprintf(line, LINE_LEN, "%08X:", ofs); + for (i = 0; ((ofs + i) < len) && (i < 16); i++) + out += snprintf(line+out, LINE_LEN - out, " %02X", (data[ofs+i] & 0xff)); + for(; i <= 16; i++) + out += snprintf(line+out, LINE_LEN - out, " | "); + for(i = 0; (ofs < len) && (i < 16); i++, ofs++) { + unsigned char c = data[ofs]; + if ( (c < ' ') || (c > '~')) + c = '.'; + out += snprintf(line+out, LINE_LEN - out, "%c", c); + } + fprintf(f, "%s\n", line); + } + fflush(f); +} + +/**************************************************************************//** +* +* rte_memdump - Dump out memory in hex bytes with colons. +* +* DESCRIPTION +* Dump out the message buffer in hex bytes with colons xx:xx:xx:xx:... +* +* RETURNS: N/A +* +* SEE ALSO: +*/ + +void +rte_memdump(FILE *f, const char * title, const void * buf, unsigned int len) +{ + unsigned int i, out; + const unsigned char *data = buf; + char line[LINE_LEN]; + + if ( title ) + fprintf(f, "%s: ", title); + + line[0] = '\0'; + for (i = 0, out = 0; i < len; i++) { + // Make sure we do not overrun the line buffer length. + if ( out >= (LINE_LEN - 4) ) { + fprintf(f, "%s", line); + out = 0; + line[out] = '\0'; + } + out += snprintf(line+out, LINE_LEN - out, "%02x%s", + (data[i] & 0xff), ((i+1) < len)? 
":" : ""); + } + if ( out > 0 ) + fprintf(f, "%s", line); + fprintf(f, "\n"); + + fflush(f); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c new file mode 100644 index 00000000..5388b81a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +const char * +rte_hypervisor_get_name(enum rte_hypervisor id) +{ + switch (id) { + case RTE_HYPERVISOR_NONE: + return "none"; + case RTE_HYPERVISOR_KVM: + return "KVM"; + case RTE_HYPERVISOR_HYPERV: + return "Hyper-V"; + case RTE_HYPERVISOR_VMWARE: + return "VMware"; + default: + return "unknown"; + } +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c new file mode 100644 index 00000000..fe0ba3f0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * Wait until a lcore finished its job. + */ +int +rte_eal_wait_lcore(unsigned slave_id) +{ + if (lcore_config[slave_id].state == WAIT) + return 0; + + while (lcore_config[slave_id].state != WAIT && + lcore_config[slave_id].state != FINISHED) + rte_pause(); + + rte_rmb(); + + /* we are in finished state, go to wait state */ + lcore_config[slave_id].state = WAIT; + return lcore_config[slave_id].ret; +} + +/* + * Check that every SLAVE lcores are in WAIT state, then call + * rte_eal_remote_launch() for all of them. If call_master is true + * (set to CALL_MASTER), also call the function on the master lcore. + */ +int +rte_eal_mp_remote_launch(int (*f)(void *), void *arg, + enum rte_rmt_call_master_t call_master) +{ + int lcore_id; + int master = rte_get_master_lcore(); + + /* check state of lcores */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (lcore_config[lcore_id].state != WAIT) + return -EBUSY; + } + + /* send messages to cores */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rte_eal_remote_launch(f, arg, lcore_id); + } + + if (call_master == CALL_MASTER) { + lcore_config[master].ret = f(arg); + lcore_config[master].state = FINISHED; + } + + return 0; +} + +/* + * Return the state of the lcore identified by slave_id. + */ +enum rte_lcore_state_t +rte_eal_get_lcore_state(unsigned lcore_id) +{ + return lcore_config[lcore_id].state; +} + +/* + * Do a rte_eal_wait_lcore() for every lcore. The return values are + * ignored. 
+ */ +void +rte_eal_mp_wait_lcore(void) +{ + unsigned lcore_id; + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rte_eal_wait_lcore(lcore_id); + } +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c new file mode 100644 index 00000000..3167e9d7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +static int +socket_id_cmp(const void *a, const void *b) +{ + const int *lcore_id_a = a; + const int *lcore_id_b = b; + + if (*lcore_id_a < *lcore_id_b) + return -1; + if (*lcore_id_a > *lcore_id_b) + return 1; + return 0; +} + +/* + * Parse /sys/devices/system/cpu to get the number of physical and logical + * processors on the machine. The function will fill the cpu_info + * structure. + */ +int +rte_eal_cpu_init(void) +{ + /* pointer to global configuration */ + struct rte_config *config = rte_eal_get_configuration(); + unsigned lcore_id; + unsigned count = 0; + unsigned int socket_id, prev_socket_id; + int lcore_to_socket_id[RTE_MAX_LCORE]; + + /* + * Parse the maximum set of logical cores, detect the subset of running + * ones and enable them by default. + */ + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + lcore_config[lcore_id].core_index = count; + + /* init cpuset for per lcore config */ + CPU_ZERO(&lcore_config[lcore_id].cpuset); + + /* find socket first */ + socket_id = eal_cpu_socket_id(lcore_id); + if (socket_id >= RTE_MAX_NUMA_NODES) { +#ifdef RTE_EAL_ALLOW_INV_SOCKET_ID + socket_id = 0; +#else + RTE_LOG(ERR, EAL, "Socket ID (%u) is greater than RTE_MAX_NUMA_NODES (%d)\n", + socket_id, RTE_MAX_NUMA_NODES); + return -1; +#endif + } + lcore_to_socket_id[lcore_id] = socket_id; + + /* in 1:1 mapping, record related cpu detected state */ + lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id); + if (lcore_config[lcore_id].detected == 0) { + config->lcore_role[lcore_id] = ROLE_OFF; + lcore_config[lcore_id].core_index = -1; + continue; + } + + /* By default, lcore 1:1 map to cpu id */ + CPU_SET(lcore_id, &lcore_config[lcore_id].cpuset); + + /* By default, each detected core is enabled */ + config->lcore_role[lcore_id] = ROLE_RTE; + lcore_config[lcore_id].core_role = ROLE_RTE; + lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id); + lcore_config[lcore_id].socket_id = socket_id; + RTE_LOG(DEBUG, EAL, "Detected lcore %u as " + "core %u on socket %u\n", + lcore_id, lcore_config[lcore_id].core_id, + lcore_config[lcore_id].socket_id); + count++; + } + /* Set the count of enabled logical cores of the EAL configuration */ + config->lcore_count = count; + RTE_LOG(DEBUG, EAL, + "Support maximum %u logical core(s) by configuration.\n", + RTE_MAX_LCORE); + RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count); + + /* sort all socket id's in ascending order */ + qsort(lcore_to_socket_id, RTE_DIM(lcore_to_socket_id), + sizeof(lcore_to_socket_id[0]), socket_id_cmp); + + prev_socket_id = -1; + config->numa_node_count = 0; + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + socket_id = lcore_to_socket_id[lcore_id]; + if (socket_id != prev_socket_id) + config->numa_nodes[config->numa_node_count++] = + socket_id; + prev_socket_id = socket_id; + } + RTE_LOG(INFO, EAL, "Detected %u NUMA nodes\n", 
config->numa_node_count); + + return 0; +} + +unsigned int __rte_experimental +rte_socket_count(void) +{ + const struct rte_config *config = rte_eal_get_configuration(); + return config->numa_node_count; +} + +int __rte_experimental +rte_socket_id_by_idx(unsigned int idx) +{ + const struct rte_config *config = rte_eal_get_configuration(); + if (idx >= config->numa_node_count) { + rte_errno = EINVAL; + return -1; + } + return config->numa_nodes[idx]; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c new file mode 100644 index 00000000..c714a4bd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c @@ -0,0 +1,460 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "eal_private.h" + +/* global log structure */ +struct rte_logs rte_logs = { + .type = ~0, + .level = RTE_LOG_DEBUG, + .file = NULL, +}; + +struct rte_eal_opt_loglevel { + /** Next list entry */ + TAILQ_ENTRY(rte_eal_opt_loglevel) next; + /** Compiled regular expression obtained from the option */ + regex_t re_match; + /** Glob match string option */ + char *pattern; + /** Log level value obtained from the option */ + uint32_t level; +}; + +TAILQ_HEAD(rte_eal_opt_loglevel_list, rte_eal_opt_loglevel); + +/** List of valid EAL log level options */ +static struct rte_eal_opt_loglevel_list opt_loglevel_list = + TAILQ_HEAD_INITIALIZER(opt_loglevel_list); + +/* Stream to use for logging if rte_logs.file is NULL */ +static FILE *default_log_stream; + +/** + * This global structure stores some informations about the message + * that is currently being processed by one lcore + */ +struct log_cur_msg { + uint32_t loglevel; /**< log level - see rte_log.h */ + uint32_t logtype; /**< log type - see rte_log.h */ +}; + +struct rte_log_dynamic_type { + const char *name; + uint32_t loglevel; +}; + + /* per core log */ +static RTE_DEFINE_PER_LCORE(struct log_cur_msg, log_cur_msg); + +/* default logs */ + +/* Change the stream that will be used by logging system */ +int +rte_openlog_stream(FILE *f) +{ + rte_logs.file = f; + return 0; +} + +/* Set global log level */ +void +rte_log_set_global_level(uint32_t level) +{ + rte_logs.level = (uint32_t)level; +} + +/* Get global log level */ +uint32_t +rte_log_get_global_level(void) +{ + return rte_logs.level; +} + +int +rte_log_get_level(uint32_t type) +{ + if (type >= rte_logs.dynamic_types_len) + return -1; + + return rte_logs.dynamic_types[type].loglevel; +} + +int +rte_log_set_level(uint32_t type, uint32_t level) +{ + if (type >= rte_logs.dynamic_types_len) + return -1; + if (level > RTE_LOG_DEBUG) + return -1; + + rte_logs.dynamic_types[type].loglevel = level; + + return 0; +} + +/* set log level by regular expression */ +int +rte_log_set_level_regexp(const char *regex, uint32_t level) +{ + regex_t r; + size_t i; + + if (level > RTE_LOG_DEBUG) + return -1; + + if (regcomp(&r, regex, 0) != 0) + return -1; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + if (regexec(&r, rte_logs.dynamic_types[i].name, 0, + NULL, 0) == 0) + rte_logs.dynamic_types[i].loglevel = level; + } + + regfree(&r); + + return 0; +} + +/* + * Save the type string and the loglevel for later dynamic + * logtypes which may register later. 
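+ * For example (illustrative, assuming the usual EAL --log-level syntax): an
+ * option such as "--log-level=pmd.*:8" is recorded here as a pattern/level
+ * pair, which rte_log_register_type_and_pick_level() consults whenever a
+ * new log type is registered later on.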
+ */ +static int rte_log_save_level(int priority, + const char *regex, const char *pattern) +{ + struct rte_eal_opt_loglevel *opt_ll = NULL; + + opt_ll = malloc(sizeof(*opt_ll)); + if (opt_ll == NULL) + goto fail; + + opt_ll->level = priority; + + if (regex) { + opt_ll->pattern = NULL; + if (regcomp(&opt_ll->re_match, regex, 0) != 0) + goto fail; + } else if (pattern) { + opt_ll->pattern = strdup(pattern); + if (opt_ll->pattern == NULL) + goto fail; + } else + goto fail; + + TAILQ_INSERT_HEAD(&opt_loglevel_list, opt_ll, next); + return 0; +fail: + free(opt_ll); + return -1; +} + +int rte_log_save_regexp(const char *regex, int tmp) +{ + return rte_log_save_level(tmp, regex, NULL); +} + +/* set log level based on glob (file match) pattern */ +int +rte_log_set_level_pattern(const char *pattern, uint32_t level) +{ + size_t i; + + if (level > RTE_LOG_DEBUG) + return -1; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + + if (fnmatch(pattern, rte_logs.dynamic_types[i].name, 0) == 0) + rte_logs.dynamic_types[i].loglevel = level; + } + + return 0; +} + +int rte_log_save_pattern(const char *pattern, int priority) +{ + return rte_log_save_level(priority, NULL, pattern); +} + +/* get the current loglevel for the message being processed */ +int rte_log_cur_msg_loglevel(void) +{ + return RTE_PER_LCORE(log_cur_msg).loglevel; +} + +/* get the current logtype for the message being processed */ +int rte_log_cur_msg_logtype(void) +{ + return RTE_PER_LCORE(log_cur_msg).logtype; +} + +static int +rte_log_lookup(const char *name) +{ + size_t i; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + if (strcmp(name, rte_logs.dynamic_types[i].name) == 0) + return i; + } + + return -1; +} + +/* register an extended log type, assuming table is large enough, and id + * is not yet registered. 
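+ * Illustrative use of the public wrapper below (not part of this patch;
+ * "user.myapp" is an arbitrary example name):
+ *
+ *   int my_type = rte_log_register("user.myapp");
+ *
+ *   if (my_type >= 0)
+ *           rte_log(RTE_LOG_NOTICE, my_type, "myapp initialized\n");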
+ */ +static int +__rte_log_register(const char *name, int id) +{ + char *dup_name = strdup(name); + + if (dup_name == NULL) + return -ENOMEM; + + rte_logs.dynamic_types[id].name = dup_name; + rte_logs.dynamic_types[id].loglevel = RTE_LOG_INFO; + + return id; +} + +/* register an extended log type */ +int +rte_log_register(const char *name) +{ + struct rte_log_dynamic_type *new_dynamic_types; + int id, ret; + + id = rte_log_lookup(name); + if (id >= 0) + return id; + + new_dynamic_types = realloc(rte_logs.dynamic_types, + sizeof(struct rte_log_dynamic_type) * + (rte_logs.dynamic_types_len + 1)); + if (new_dynamic_types == NULL) + return -ENOMEM; + rte_logs.dynamic_types = new_dynamic_types; + + ret = __rte_log_register(name, rte_logs.dynamic_types_len); + if (ret < 0) + return ret; + + rte_logs.dynamic_types_len++; + + return ret; +} + +/* Register an extended log type and try to pick its level from EAL options */ +int __rte_experimental +rte_log_register_type_and_pick_level(const char *name, uint32_t level_def) +{ + struct rte_eal_opt_loglevel *opt_ll; + uint32_t level = level_def; + int type; + + type = rte_log_register(name); + if (type < 0) + return type; + + TAILQ_FOREACH(opt_ll, &opt_loglevel_list, next) { + if (opt_ll->level > RTE_LOG_DEBUG) + continue; + + if (opt_ll->pattern) { + if (fnmatch(opt_ll->pattern, name, 0)) + level = opt_ll->level; + } else { + if (regexec(&opt_ll->re_match, name, 0, NULL, 0) == 0) + level = opt_ll->level; + } + } + + rte_logs.dynamic_types[type].loglevel = level; + + return type; +} + +struct logtype { + uint32_t log_id; + const char *logtype; +}; + +static const struct logtype logtype_strings[] = { + {RTE_LOGTYPE_EAL, "lib.eal"}, + {RTE_LOGTYPE_MALLOC, "lib.malloc"}, + {RTE_LOGTYPE_RING, "lib.ring"}, + {RTE_LOGTYPE_MEMPOOL, "lib.mempool"}, + {RTE_LOGTYPE_TIMER, "lib.timer"}, + {RTE_LOGTYPE_PMD, "pmd"}, + {RTE_LOGTYPE_HASH, "lib.hash"}, + {RTE_LOGTYPE_LPM, "lib.lpm"}, + {RTE_LOGTYPE_KNI, "lib.kni"}, + {RTE_LOGTYPE_ACL, "lib.acl"}, + {RTE_LOGTYPE_POWER, "lib.power"}, + {RTE_LOGTYPE_METER, "lib.meter"}, + {RTE_LOGTYPE_SCHED, "lib.sched"}, + {RTE_LOGTYPE_PORT, "lib.port"}, + {RTE_LOGTYPE_TABLE, "lib.table"}, + {RTE_LOGTYPE_PIPELINE, "lib.pipeline"}, + {RTE_LOGTYPE_MBUF, "lib.mbuf"}, + {RTE_LOGTYPE_CRYPTODEV, "lib.cryptodev"}, + {RTE_LOGTYPE_EFD, "lib.efd"}, + {RTE_LOGTYPE_EVENTDEV, "lib.eventdev"}, + {RTE_LOGTYPE_GSO, "lib.gso"}, + {RTE_LOGTYPE_USER1, "user1"}, + {RTE_LOGTYPE_USER2, "user2"}, + {RTE_LOGTYPE_USER3, "user3"}, + {RTE_LOGTYPE_USER4, "user4"}, + {RTE_LOGTYPE_USER5, "user5"}, + {RTE_LOGTYPE_USER6, "user6"}, + {RTE_LOGTYPE_USER7, "user7"}, + {RTE_LOGTYPE_USER8, "user8"} +}; + +/* Logging should be first initializer (before drivers and bus) */ +RTE_INIT_PRIO(rte_log_init, LOG) +{ + uint32_t i; + + rte_log_set_global_level(RTE_LOG_DEBUG); + + rte_logs.dynamic_types = calloc(RTE_LOGTYPE_FIRST_EXT_ID, + sizeof(struct rte_log_dynamic_type)); + if (rte_logs.dynamic_types == NULL) + return; + + /* register legacy log types */ + for (i = 0; i < RTE_DIM(logtype_strings); i++) + __rte_log_register(logtype_strings[i].logtype, + logtype_strings[i].log_id); + + rte_logs.dynamic_types_len = RTE_LOGTYPE_FIRST_EXT_ID; +} + +static const char * +loglevel_to_string(uint32_t level) +{ + switch (level) { + case 0: return "disabled"; + case RTE_LOG_EMERG: return "emerg"; + case RTE_LOG_ALERT: return "alert"; + case RTE_LOG_CRIT: return "critical"; + case RTE_LOG_ERR: return "error"; + case RTE_LOG_WARNING: return "warning"; + case RTE_LOG_NOTICE: return "notice"; + 
case RTE_LOG_INFO: return "info"; + case RTE_LOG_DEBUG: return "debug"; + default: return "unknown"; + } +} + +/* dump global level and registered log types */ +void +rte_log_dump(FILE *f) +{ + size_t i; + + fprintf(f, "global log level is %s\n", + loglevel_to_string(rte_log_get_global_level())); + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + fprintf(f, "id %zu: %s, level is %s\n", + i, rte_logs.dynamic_types[i].name, + loglevel_to_string(rte_logs.dynamic_types[i].loglevel)); + } +} + +/* + * Generates a log message The message will be sent in the stream + * defined by the previous call to rte_openlog_stream(). + */ +int +rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) +{ + int ret; + FILE *f = rte_logs.file; + if (f == NULL) { + f = default_log_stream; + if (f == NULL) { + /* + * Grab the current value of stderr here, rather than + * just initializing default_log_stream to stderr. This + * ensures that we will always use the current value + * of stderr, even if the application closes and + * reopens it. + */ + f = stderr; + } + } + + if (level > rte_logs.level) + return 0; + if (logtype >= rte_logs.dynamic_types_len) + return -1; + if (level > rte_logs.dynamic_types[logtype].loglevel) + return 0; + + /* save loglevel and logtype in a global per-lcore variable */ + RTE_PER_LCORE(log_cur_msg).loglevel = level; + RTE_PER_LCORE(log_cur_msg).logtype = logtype; + + ret = vfprintf(f, format, ap); + fflush(f); + return ret; +} + +/* + * Generates a log message The message will be sent in the stream + * defined by the previous call to rte_openlog_stream(). + * No need to check level here, done by rte_vlog(). + */ +int +rte_log(uint32_t level, uint32_t logtype, const char *format, ...) +{ + va_list ap; + int ret; + + va_start(ap, format); + ret = rte_vlog(level, logtype, format, ap); + va_end(ap); + return ret; +} + +/* + * Called by environment-specific initialization functions. + */ +void +eal_log_set_default(FILE *default_log) +{ + default_log_stream = default_log; + +#if RTE_LOG_DP_LEVEL >= RTE_LOG_DEBUG + RTE_LOG(NOTICE, EAL, + "Debug dataplane logs available - lower performance\n"); +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c new file mode 100644 index 00000000..1d41ea11 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c @@ -0,0 +1,364 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" + +struct mem_event_callback_entry { + TAILQ_ENTRY(mem_event_callback_entry) next; + char name[RTE_MEM_EVENT_CALLBACK_NAME_LEN]; + rte_mem_event_callback_t clb; + void *arg; +}; + +struct mem_alloc_validator_entry { + TAILQ_ENTRY(mem_alloc_validator_entry) next; + char name[RTE_MEM_ALLOC_VALIDATOR_NAME_LEN]; + rte_mem_alloc_validator_t clb; + int socket_id; + size_t limit; +}; + +/** Double linked list of actions. 
*/ +TAILQ_HEAD(mem_event_callback_entry_list, mem_event_callback_entry); +TAILQ_HEAD(mem_alloc_validator_entry_list, mem_alloc_validator_entry); + +static struct mem_event_callback_entry_list mem_event_callback_list = + TAILQ_HEAD_INITIALIZER(mem_event_callback_list); +static rte_rwlock_t mem_event_rwlock = RTE_RWLOCK_INITIALIZER; + +static struct mem_alloc_validator_entry_list mem_alloc_validator_list = + TAILQ_HEAD_INITIALIZER(mem_alloc_validator_list); +static rte_rwlock_t mem_alloc_validator_rwlock = RTE_RWLOCK_INITIALIZER; + +static struct mem_event_callback_entry * +find_mem_event_callback(const char *name, void *arg) +{ + struct mem_event_callback_entry *r; + + TAILQ_FOREACH(r, &mem_event_callback_list, next) { + if (!strcmp(r->name, name) && r->arg == arg) + break; + } + return r; +} + +static struct mem_alloc_validator_entry * +find_mem_alloc_validator(const char *name, int socket_id) +{ + struct mem_alloc_validator_entry *r; + + TAILQ_FOREACH(r, &mem_alloc_validator_list, next) { + if (!strcmp(r->name, name) && r->socket_id == socket_id) + break; + } + return r; +} + +bool +eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start, + size_t len) +{ + void *end, *aligned_start, *aligned_end; + size_t pgsz = (size_t)msl->page_sz; + const struct rte_memseg *ms; + + /* for IOVA_VA, it's always contiguous */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + return true; + + /* for legacy memory, it's always contiguous */ + if (internal_config.legacy_mem) + return true; + + end = RTE_PTR_ADD(start, len); + + /* for nohuge, we check pagemap, otherwise check memseg */ + if (!rte_eal_has_hugepages()) { + rte_iova_t cur, expected; + + aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz); + aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz); + + /* if start and end are on the same page, bail out early */ + if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz) + return true; + + /* skip first iteration */ + cur = rte_mem_virt2iova(aligned_start); + expected = cur + pgsz; + aligned_start = RTE_PTR_ADD(aligned_start, pgsz); + + while (aligned_start < aligned_end) { + cur = rte_mem_virt2iova(aligned_start); + if (cur != expected) + return false; + aligned_start = RTE_PTR_ADD(aligned_start, pgsz); + expected += pgsz; + } + } else { + int start_seg, end_seg, cur_seg; + rte_iova_t cur, expected; + + aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz); + aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz); + + start_seg = RTE_PTR_DIFF(aligned_start, msl->base_va) / + pgsz; + end_seg = RTE_PTR_DIFF(aligned_end, msl->base_va) / + pgsz; + + /* if start and end are on the same page, bail out early */ + if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz) + return true; + + /* skip first iteration */ + ms = rte_fbarray_get(&msl->memseg_arr, start_seg); + cur = ms->iova; + expected = cur + pgsz; + + /* if we can't access IOVA addresses, assume non-contiguous */ + if (cur == RTE_BAD_IOVA) + return false; + + for (cur_seg = start_seg + 1; cur_seg < end_seg; + cur_seg++, expected += pgsz) { + ms = rte_fbarray_get(&msl->memseg_arr, cur_seg); + + if (ms->iova != expected) + return false; + } + } + return true; +} + +int +eal_memalloc_mem_event_callback_register(const char *name, + rte_mem_event_callback_t clb, void *arg) +{ + struct mem_event_callback_entry *entry; + int ret, len; + if (name == NULL || clb == NULL) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) { + 
rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_event_rwlock); + + entry = find_mem_event_callback(name, arg); + if (entry != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + ret = -1; + goto unlock; + } + + /* callback successfully created and is valid, add it to the list */ + entry->clb = clb; + entry->arg = arg; + strlcpy(entry->name, name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + TAILQ_INSERT_TAIL(&mem_event_callback_list, entry, next); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem event callback '%s:%p' registered\n", + name, arg); + +unlock: + rte_rwlock_write_unlock(&mem_event_rwlock); + return ret; +} + +int +eal_memalloc_mem_event_callback_unregister(const char *name, void *arg) +{ + struct mem_event_callback_entry *entry; + int ret, len; + + if (name == NULL) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_event_rwlock); + + entry = find_mem_event_callback(name, arg); + if (entry == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + TAILQ_REMOVE(&mem_event_callback_list, entry, next); + free(entry); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem event callback '%s:%p' unregistered\n", + name, arg); + +unlock: + rte_rwlock_write_unlock(&mem_event_rwlock); + return ret; +} + +void +eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start, + size_t len) +{ + struct mem_event_callback_entry *entry; + + rte_rwlock_read_lock(&mem_event_rwlock); + + TAILQ_FOREACH(entry, &mem_event_callback_list, next) { + RTE_LOG(DEBUG, EAL, "Calling mem event callback '%s:%p'\n", + entry->name, entry->arg); + entry->clb(event, start, len, entry->arg); + } + + rte_rwlock_read_unlock(&mem_event_rwlock); +} + +int +eal_memalloc_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + struct mem_alloc_validator_entry *entry; + int ret, len; + if (name == NULL || clb == NULL || socket_id < 0) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_alloc_validator_rwlock); + + entry = find_mem_alloc_validator(name, socket_id); + if (entry != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + ret = -1; + goto unlock; + } + + /* callback successfully created and is valid, add it to the list */ + entry->clb = clb; + entry->socket_id = socket_id; + entry->limit = limit; + strlcpy(entry->name, name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + TAILQ_INSERT_TAIL(&mem_alloc_validator_list, entry, next); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i with limit %zu registered\n", + name, socket_id, limit); + +unlock: + rte_rwlock_write_unlock(&mem_alloc_validator_rwlock); + return ret; +} + +int +eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + struct mem_alloc_validator_entry *entry; + int ret, len; + + if (name == NULL || socket_id < 0) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + 
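/*
 * Illustrative sketch, not part of the imported sources: a memory event
 * callback of the shape consumed by eal_memalloc_mem_event_notify() above,
 * registered through the public rte_mem_event_callback_register() wrapper
 * (declared in rte_memory.h and defined later in this patch). The
 * hugepage_tracker name is hypothetical, and the RTE_MEM_EVENT_ALLOC/FREE
 * enum values are assumed from rte_memory.h; the API is experimental.
 */
#include <rte_common.h>
#include <rte_log.h>
#include <rte_memory.h>

static void
hugepage_tracker(enum rte_mem_event event, const void *addr, size_t len,
		void *arg __rte_unused)
{
	RTE_LOG(INFO, USER1, "mem event %s: addr %p, len %zu\n",
		event == RTE_MEM_EVENT_ALLOC ? "alloc" : "free", addr, len);
}

static int
hugepage_tracker_attach(void)
{
	/* fails with rte_errno == ENOTSUP when legacy memory mode is used */
	return rte_mem_event_callback_register("app:tracker",
			hugepage_tracker, NULL);
}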
if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_alloc_validator_rwlock); + + entry = find_mem_alloc_validator(name, socket_id); + if (entry == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + TAILQ_REMOVE(&mem_alloc_validator_list, entry, next); + free(entry); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i unregistered\n", + name, socket_id); + +unlock: + rte_rwlock_write_unlock(&mem_alloc_validator_rwlock); + return ret; +} + +int +eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len) +{ + struct mem_alloc_validator_entry *entry; + int ret = 0; + + rte_rwlock_read_lock(&mem_alloc_validator_rwlock); + + TAILQ_FOREACH(entry, &mem_alloc_validator_list, next) { + if (entry->socket_id != socket_id || entry->limit > new_len) + continue; + RTE_LOG(DEBUG, EAL, "Calling mem alloc validator '%s' on socket %i\n", + entry->name, entry->socket_id); + if (entry->clb(socket_id, entry->limit, new_len) < 0) + ret = -1; + } + + rte_rwlock_read_unlock(&mem_alloc_validator_rwlock); + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c new file mode 100644 index 00000000..fbfb1b05 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c @@ -0,0 +1,584 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_memalloc.h" +#include "eal_private.h" +#include "eal_internal_cfg.h" + +/* + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. Else, retry + * with a smaller zone: decrease *size by hugepage_sz until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of hugepage size. + */ + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" + +static void *next_baseaddr; +static uint64_t system_page_sz; + +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags) +{ + bool addr_is_hint, allow_shrink, unmap, no_align; + uint64_t map_sz; + void *mapped_addr, *aligned_addr; + + if (system_page_sz == 0) + system_page_sz = sysconf(_SC_PAGESIZE); + + mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); + + addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0; + allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0; + unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0; + + if (next_baseaddr == NULL && internal_config.base_virtaddr != 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) internal_config.base_virtaddr; + + if (requested_addr == NULL && next_baseaddr != NULL) { + requested_addr = next_baseaddr; + requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); + addr_is_hint = true; + } + + /* we don't need alignment of resulting pointer in the following cases: + * + * 1. page size is equal to system size + * 2. we have a requested address, and it is page-aligned, and we will + * be discarding the address if we get a different one. + * + * for all other cases, alignment is potentially necessary. 
+ */ + no_align = (requested_addr != NULL && + requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) && + !addr_is_hint) || + page_sz == system_page_sz; + + do { + map_sz = no_align ? *size : *size + page_sz; + if (map_sz > SIZE_MAX) { + RTE_LOG(ERR, EAL, "Map size too big\n"); + rte_errno = E2BIG; + return NULL; + } + + mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_READ, + mmap_flags, -1, 0); + if (mapped_addr == MAP_FAILED && allow_shrink) + *size -= page_sz; + } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0); + + /* align resulting address - if map failed, we will ignore the value + * anyway, so no need to add additional checks. + */ + aligned_addr = no_align ? mapped_addr : + RTE_PTR_ALIGN(mapped_addr, page_sz); + + if (*size == 0) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n", + strerror(errno)); + rte_errno = errno; + return NULL; + } else if (mapped_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", + strerror(errno)); + /* pass errno up the call chain */ + rte_errno = errno; + return NULL; + } else if (requested_addr != NULL && !addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n", + requested_addr, aligned_addr); + munmap(mapped_addr, map_sz); + rte_errno = EADDRNOTAVAIL; + return NULL; + } else if (requested_addr != NULL && addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n", + requested_addr, aligned_addr); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n"); + } else if (next_baseaddr != NULL) { + next_baseaddr = RTE_PTR_ADD(aligned_addr, *size); + } + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + aligned_addr, *size); + + if (unmap) { + munmap(mapped_addr, map_sz); + } else if (!no_align) { + void *map_end, *aligned_end; + size_t before_len, after_len; + + /* when we reserve space with alignment, we add alignment to + * mapping size. On 32-bit, if 1GB alignment was requested, this + * would waste 1GB of address space, which is a luxury we cannot + * afford. so, if alignment was performed, check if any unneeded + * address space can be unmapped back. 
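/*
 * Illustrative sketch, not part of the imported sources: the over-map-and-
 * trim technique described in the comment above, reduced to plain POSIX
 * calls. reserve_aligned() is a hypothetical helper, not a DPDK API; it
 * assumes align is a power of two and a multiple of the page size, and that
 * len is page-aligned.
 */
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

static void *
reserve_aligned(size_t len, size_t align)
{
	size_t map_sz = len + align;
	uint8_t *base, *aligned, *map_end, *aligned_end;

	/* reserve more than needed so an aligned start always fits */
	base = mmap(NULL, map_sz, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS,
			-1, 0);
	if (base == MAP_FAILED)
		return NULL;

	aligned = (uint8_t *)(((uintptr_t)base + align - 1) &
			~((uintptr_t)align - 1));
	map_end = base + map_sz;
	aligned_end = aligned + len;

	/* return the head and tail slack to the kernel */
	if (aligned > base)
		munmap(base, aligned - base);
	if (map_end > aligned_end)
		munmap(aligned_end, map_end - aligned_end);

	return aligned;
}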
+ */ + + map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz); + aligned_end = RTE_PTR_ADD(aligned_addr, *size); + + /* unmap space before aligned mmap address */ + before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr); + if (before_len > 0) + munmap(mapped_addr, before_len); + + /* unmap space after aligned end mmap address */ + after_len = RTE_PTR_DIFF(map_end, aligned_end); + if (after_len > 0) + munmap(aligned_end, after_len); + } + + return aligned_addr; +} + +static struct rte_memseg * +virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + const struct rte_fbarray *arr; + void *start, *end; + int ms_idx; + + if (msl == NULL) + return NULL; + + /* a memseg list was specified, check if it's the right one */ + start = msl->base_va; + end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len); + + if (addr < start || addr >= end) + return NULL; + + /* now, calculate index */ + arr = &msl->memseg_arr; + ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz; + return rte_fbarray_get(arr, ms_idx); +} + +static struct rte_memseg_list * +virt2memseg_list(const void *addr) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + int msl_idx; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + void *start, *end; + msl = &mcfg->memsegs[msl_idx]; + + start = msl->base_va; + end = RTE_PTR_ADD(start, + (size_t)msl->page_sz * msl->memseg_arr.len); + if (addr >= start && addr < end) + break; + } + /* if we didn't find our memseg list */ + if (msl_idx == RTE_MAX_MEMSEG_LISTS) + return NULL; + return msl; +} + +__rte_experimental struct rte_memseg_list * +rte_mem_virt2memseg_list(const void *addr) +{ + return virt2memseg_list(addr); +} + +struct virtiova { + rte_iova_t iova; + void *virt; +}; +static int +find_virt(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} +static int +find_virt_legacy(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, size_t len, void *arg) +{ + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} + +__rte_experimental void * +rte_mem_iova2virt(rte_iova_t iova) +{ + struct virtiova vi; + + memset(&vi, 0, sizeof(vi)); + + vi.iova = iova; + /* for legacy mem, we can get away with scanning VA-contiguous segments, + * as we know they are PA-contiguous as well + */ + if (internal_config.legacy_mem) + rte_memseg_contig_walk(find_virt_legacy, &vi); + else + rte_memseg_walk(find_virt, &vi); + + return vi.virt; +} + +__rte_experimental struct rte_memseg * +rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + return virt2memseg(addr, msl != NULL ? 
msl : + rte_mem_virt2memseg_list(addr)); +} + +static int +physmem_size(const struct rte_memseg_list *msl, void *arg) +{ + uint64_t *total_len = arg; + + *total_len += msl->memseg_arr.count * msl->page_sz; + + return 0; +} + +/* get the total size of memory */ +uint64_t +rte_eal_get_physmem_size(void) +{ + uint64_t total_len = 0; + + rte_memseg_list_walk(physmem_size, &total_len); + + return total_len; +} + +static int +dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx, ms_idx; + FILE *f = arg; + + msl_idx = msl - mcfg->memsegs; + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) + return -1; + + fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, " + "virt:%p, socket_id:%"PRId32", " + "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " + "nrank:%"PRIx32"\n", + msl_idx, ms_idx, + ms->iova, + ms->len, + ms->addr, + ms->socket_id, + ms->hugepage_sz, + ms->nchannel, + ms->nrank); + + return 0; +} + +/* + * Defining here because declared in rte_memory.h, but the actual implementation + * is in eal_common_memalloc.c, like all other memalloc internals. + */ +int __rte_experimental +rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb, + void *arg) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_register(name, clb, arg); +} + +int __rte_experimental +rte_mem_event_callback_unregister(const char *name, void *arg) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_unregister(name, arg); +} + +int __rte_experimental +rte_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id, + limit); +} + +int __rte_experimental +rte_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_alloc_validator_unregister(name, socket_id); +} + +/* Dump the physical memory layout on console */ +void +rte_dump_physmem_layout(FILE *f) +{ + rte_memseg_walk(dump_memseg, f); +} + +/* return the number of memory channels */ +unsigned rte_memory_get_nchannel(void) +{ + return rte_eal_get_configuration()->mem_config->nchannel; +} + +/* return the number of memory rank */ +unsigned rte_memory_get_nrank(void) +{ + return rte_eal_get_configuration()->mem_config->nrank; +} + +static int +rte_eal_memdevice_init(void) +{ + struct rte_config *config; + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + return 0; + + config = rte_eal_get_configuration(); + config->mem_config->nchannel = internal_config.force_nchannel; + 
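/*
 * Illustrative sketch, not part of the imported sources: an allocation
 * validator matching the callback shape used by
 * eal_memalloc_mem_alloc_validate() (socket id, registered limit, proposed
 * new total length), installed through the public, experimental
 * rte_mem_alloc_validator_register() defined above. The names and the 1 GB
 * cap are hypothetical.
 */
#include <rte_log.h>
#include <rte_memory.h>

static int
socket0_mem_cap(int socket_id, size_t limit, size_t new_len)
{
	if (new_len <= limit)
		return 0;

	RTE_LOG(WARNING, USER1,
		"refusing to grow socket %d memory to %zu bytes\n",
		socket_id, new_len);
	return -1; /* any negative return makes the allocation fail */
}

static int
socket0_mem_cap_install(void)
{
	/* fails with rte_errno == ENOTSUP when legacy memory mode is used */
	return rte_mem_alloc_validator_register("app:cap",
			socket0_mem_cap, 0, (size_t)1 << 30);
}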
config->mem_config->nrank = internal_config.force_nrank; + + return 0; +} + +/* Lock page in physical memory and prevent from swapping. */ +int +rte_mem_lock_page(const void *virt) +{ + unsigned long virtual = (unsigned long)virt; + int page_size = getpagesize(); + unsigned long aligned = (virtual & ~(page_size - 1)); + return mlock((void *)aligned, page_size); +} + +int __rte_experimental +rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + int n_segs; + size_t len; + + ms = rte_fbarray_get(arr, ms_idx); + + /* find how many more segments there are, starting with + * this one. + */ + n_segs = rte_fbarray_find_contig_used(arr, ms_idx); + len = n_segs * msl->page_sz; + + ret = func(msl, ms, len, arg); + if (ret) + return ret; + ms_idx = rte_fbarray_find_next_used(arr, + ms_idx + n_segs); + } + } + return 0; +} + +int __rte_experimental +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_contig_walk_thread_unsafe(func, arg); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + ms = rte_fbarray_get(arr, ms_idx); + ret = func(msl, ms, arg); + if (ret) + return ret; + ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1); + } + } + return 0; +} + +int __rte_experimental +rte_memseg_walk(rte_memseg_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_walk_thread_unsafe(func, arg); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->base_va == NULL) + continue; + + ret = func(msl, arg); + if (ret) + return ret; + } + return 0; +} + +int __rte_experimental +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_list_walk_thread_unsafe(func, arg); + 
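/*
 * Illustrative sketch, not part of the imported sources: a walker for the
 * (experimental) rte_memseg_walk() API above, summing the length of every
 * allocated segment. Returning 0 from the callback keeps the walk going; a
 * non-zero return stops it and is propagated to the caller.
 */
#include <inttypes.h>
#include <stdio.h>

#include <rte_common.h>
#include <rte_memory.h>

static int
add_seg_len(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	uint64_t *total = arg;

	*total += ms->len;
	return 0;
}

static void
print_allocated_memory(void)
{
	uint64_t total = 0;

	/* takes the hotplug read lock internally, so it must not be called
	 * from inside another walk or a memory event callback */
	rte_memseg_walk(add_seg_len, &total);
	printf("allocated memory: %" PRIu64 " bytes\n", total);
}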
rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +/* init memory subsystem */ +int +rte_eal_memory_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int retval; + RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n"); + + if (!mcfg) + return -1; + + /* lock mem hotplug here, to prevent races while we init */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + if (rte_eal_memseg_init() < 0) + goto fail; + + if (eal_memalloc_init() < 0) + goto fail; + + retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? + rte_eal_hugepage_init() : + rte_eal_hugepage_attach(); + if (retval < 0) + goto fail; + + if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0) + goto fail; + + return 0; +fail: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c new file mode 100644 index 00000000..7300fe05 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c @@ -0,0 +1,408 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "malloc_heap.h" +#include "malloc_elem.h" +#include "eal_private.h" + +static inline const struct rte_memzone * +memzone_lookup_thread_unsafe(const char *name) +{ + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + const struct rte_memzone *mz; + int i = 0; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + /* + * the algorithm is not optimal (linear), but there are few + * zones and this function should be called at init only + */ + i = rte_fbarray_find_next_used(arr, 0); + while (i >= 0) { + mz = rte_fbarray_get(arr, i); + if (mz->addr != NULL && + !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE)) + return mz; + i = rte_fbarray_find_next_used(arr, i + 1); + } + return NULL; +} + +static const struct rte_memzone * +memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, + int socket_id, unsigned int flags, unsigned int align, + unsigned int bound) +{ + struct rte_memzone *mz; + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + void *mz_addr; + size_t requested_len; + int mz_idx; + bool contig; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + /* no more room in config */ + if (arr->count >= arr->len) { + RTE_LOG(ERR, EAL, "%s(): No more room in config\n", __func__); + rte_errno = ENOSPC; + return NULL; + } + + if (strlen(name) > sizeof(mz->name) - 1) { + RTE_LOG(DEBUG, EAL, "%s(): memzone <%s>: name too long\n", + __func__, name); + rte_errno = ENAMETOOLONG; + return NULL; + } + + /* zone already exist */ + if ((memzone_lookup_thread_unsafe(name)) != NULL) { + RTE_LOG(DEBUG, EAL, "%s(): memzone <%s> already exists\n", + __func__, name); + rte_errno = EEXIST; + return NULL; + } + + /* if alignment is not a power of two */ + if (align && !rte_is_power_of_2(align)) { + RTE_LOG(ERR, EAL, "%s(): Invalid alignment: %u\n", __func__, + align); + rte_errno = EINVAL; + return NULL; + } + + /* alignment less than cache size is not allowed */ + if (align < RTE_CACHE_LINE_SIZE) + align = RTE_CACHE_LINE_SIZE; + + /* align length on cache boundary. 
Check for overflow before doing so */ + if (len > SIZE_MAX - RTE_CACHE_LINE_MASK) { + rte_errno = EINVAL; /* requested size too big */ + return NULL; + } + + len = RTE_ALIGN_CEIL(len, RTE_CACHE_LINE_SIZE); + + /* save minimal requested length */ + requested_len = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, len); + + /* check that boundary condition is valid */ + if (bound != 0 && (requested_len > bound || !rte_is_power_of_2(bound))) { + rte_errno = EINVAL; + return NULL; + } + + if ((socket_id != SOCKET_ID_ANY) && + (socket_id >= RTE_MAX_NUMA_NODES || socket_id < 0)) { + rte_errno = EINVAL; + return NULL; + } + + if (!rte_eal_has_hugepages()) + socket_id = SOCKET_ID_ANY; + + contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0; + /* malloc only cares about size flags, remove contig flag from flags */ + flags &= ~RTE_MEMZONE_IOVA_CONTIG; + + if (len == 0 && bound == 0) { + /* no size constraints were placed, so use malloc elem len */ + requested_len = 0; + mz_addr = malloc_heap_alloc_biggest(NULL, socket_id, flags, + align, contig); + } else { + if (len == 0) + requested_len = bound; + /* allocate memory on heap */ + mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, + flags, align, bound, contig); + } + if (mz_addr == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + struct malloc_elem *elem = malloc_elem_from_data(mz_addr); + + /* fill the zone in config */ + mz_idx = rte_fbarray_find_next_free(arr, 0); + + if (mz_idx < 0) { + mz = NULL; + } else { + rte_fbarray_set_used(arr, mz_idx); + mz = rte_fbarray_get(arr, mz_idx); + } + + if (mz == NULL) { + RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone\n", __func__); + malloc_heap_free(elem); + rte_errno = ENOSPC; + return NULL; + } + + snprintf(mz->name, sizeof(mz->name), "%s", name); + mz->iova = rte_malloc_virt2iova(mz_addr); + mz->addr = mz_addr; + mz->len = requested_len == 0 ? + elem->size - elem->pad - MALLOC_ELEM_OVERHEAD : + requested_len; + mz->hugepage_sz = elem->msl->page_sz; + mz->socket_id = elem->msl->socket_id; + mz->flags = 0; + + return mz; +} + +static const struct rte_memzone * +rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id, + unsigned int flags, unsigned int align, unsigned int bound) +{ + struct rte_mem_config *mcfg; + const struct rte_memzone *mz = NULL; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_write_lock(&mcfg->mlock); + + mz = memzone_reserve_aligned_thread_unsafe( + name, len, socket_id, flags, align, bound); + + rte_rwlock_write_unlock(&mcfg->mlock); + + return mz; +} + +/* + * Return a pointer to a correctly filled memzone descriptor (with a + * specified alignment and boundary). If the allocation cannot be done, + * return NULL. + */ +const struct rte_memzone * +rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align, unsigned bound) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, flags, + align, bound); +} + +/* + * Return a pointer to a correctly filled memzone descriptor (with a + * specified alignment). If the allocation cannot be done, return NULL. + */ +const struct rte_memzone * +rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, flags, + align, 0); +} + +/* + * Return a pointer to a correctly filled memzone descriptor. If the + * allocation cannot be done, return NULL. 
+ */ +const struct rte_memzone * +rte_memzone_reserve(const char *name, size_t len, int socket_id, + unsigned flags) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, + flags, RTE_CACHE_LINE_SIZE, 0); +} + +int +rte_memzone_free(const struct rte_memzone *mz) +{ + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + struct rte_memzone *found_mz; + int ret = 0; + void *addr = NULL; + unsigned idx; + + if (mz == NULL) + return -EINVAL; + + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + rte_rwlock_write_lock(&mcfg->mlock); + + idx = rte_fbarray_find_idx(arr, mz); + found_mz = rte_fbarray_get(arr, idx); + + if (found_mz == NULL) { + ret = -EINVAL; + } else if (found_mz->addr == NULL) { + RTE_LOG(ERR, EAL, "Memzone is not allocated\n"); + ret = -EINVAL; + } else { + addr = found_mz->addr; + memset(found_mz, 0, sizeof(*found_mz)); + rte_fbarray_set_free(arr, idx); + } + + rte_rwlock_write_unlock(&mcfg->mlock); + + if (addr != NULL) + rte_free(addr); + + return ret; +} + +/* + * Lookup for the memzone identified by the given name + */ +const struct rte_memzone * +rte_memzone_lookup(const char *name) +{ + struct rte_mem_config *mcfg; + const struct rte_memzone *memzone = NULL; + + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_read_lock(&mcfg->mlock); + + memzone = memzone_lookup_thread_unsafe(name); + + rte_rwlock_read_unlock(&mcfg->mlock); + + return memzone; +} + +static void +dump_memzone(const struct rte_memzone *mz, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl = NULL; + void *cur_addr, *mz_end; + struct rte_memseg *ms; + int mz_idx, ms_idx; + size_t page_sz; + FILE *f = arg; + + mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz); + + fprintf(f, "Zone %u: name:<%s>, len:0x%zx, virt:%p, " + "socket_id:%"PRId32", flags:%"PRIx32"\n", + mz_idx, + mz->name, + mz->len, + mz->addr, + mz->socket_id, + mz->flags); + + /* go through each page occupied by this memzone */ + msl = rte_mem_virt2memseg_list(mz->addr); + if (!msl) { + RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n"); + return; + } + page_sz = (size_t)mz->hugepage_sz; + cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz); + mz_end = RTE_PTR_ADD(cur_addr, mz->len); + + fprintf(f, "physical segments used:\n"); + ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz; + ms = rte_fbarray_get(&msl->memseg_arr, ms_idx); + + do { + fprintf(f, " addr: %p iova: 0x%" PRIx64 " " + "len: 0x%zx " + "pagesz: 0x%zx\n", + cur_addr, ms->iova, ms->len, page_sz); + + /* advance VA to next page */ + cur_addr = RTE_PTR_ADD(cur_addr, page_sz); + + /* memzones occupy contiguous segments */ + ++ms; + } while (cur_addr < mz_end); +} + +/* Dump all reserved memory zones on console */ +void +rte_memzone_dump(FILE *f) +{ + rte_memzone_walk(dump_memzone, f); +} + +/* + * Init the memzone subsystem + */ +int +rte_eal_memzone_init(void) +{ + struct rte_mem_config *mcfg; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_write_lock(&mcfg->mlock); + + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + rte_fbarray_init(&mcfg->memzones, "memzone", + RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) { + RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n"); + return -1; + } else if (rte_eal_process_type() == RTE_PROC_SECONDARY && + rte_fbarray_attach(&mcfg->memzones)) { + RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n"); + rte_rwlock_write_unlock(&mcfg->mlock); + return -1; + } + + 
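/*
 * Illustrative sketch, not part of the imported sources: typical use of the
 * memzone API defined above. A primary process reserves a named zone once;
 * any process, including secondaries, can find it again by name, and
 * rte_memzone_free() releases it when no longer needed. The "app_state"
 * name and size are hypothetical.
 */
#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memzone.h>

static const struct rte_memzone *
app_state_zone(void)
{
	const struct rte_memzone *mz;

	mz = rte_memzone_lookup("app_state");
	if (mz != NULL)
		return mz;

	/* cache-line aligned, any NUMA socket, no size-hint flags */
	mz = rte_memzone_reserve("app_state", 4096, SOCKET_ID_ANY, 0);
	if (mz == NULL)
		RTE_LOG(ERR, USER1, "cannot reserve app_state: %d\n",
			rte_errno);
	return mz;
}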
rte_rwlock_write_unlock(&mcfg->mlock); + + return 0; +} + +/* Walk all reserved memory zones */ +void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *), + void *arg) +{ + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + int i; + + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + rte_rwlock_read_lock(&mcfg->mlock); + i = rte_fbarray_find_next_used(arr, 0); + while (i >= 0) { + struct rte_memzone *mz = rte_fbarray_get(arr, i); + (*func)(mz, arg); + i = rte_fbarray_find_next_used(arr, i + 1); + } + rte_rwlock_read_unlock(&mcfg->mlock); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c new file mode 100644 index 00000000..dd5f9740 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c @@ -0,0 +1,1450 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_internal_cfg.h" +#include "eal_options.h" +#include "eal_filesystem.h" +#include "eal_private.h" + +#define BITS_PER_HEX 4 +#define LCORE_OPT_LST 1 +#define LCORE_OPT_MSK 2 +#define LCORE_OPT_MAP 3 + +const char +eal_short_options[] = + "b:" /* pci-blacklist */ + "c:" /* coremask */ + "s:" /* service coremask */ + "d:" /* driver */ + "h" /* help */ + "l:" /* corelist */ + "S:" /* service corelist */ + "m:" /* memory size */ + "n:" /* memory channels */ + "r:" /* memory ranks */ + "v" /* version */ + "w:" /* pci-whitelist */ + ; + +const struct option +eal_long_options[] = { + {OPT_BASE_VIRTADDR, 1, NULL, OPT_BASE_VIRTADDR_NUM }, + {OPT_CREATE_UIO_DEV, 0, NULL, OPT_CREATE_UIO_DEV_NUM }, + {OPT_FILE_PREFIX, 1, NULL, OPT_FILE_PREFIX_NUM }, + {OPT_HELP, 0, NULL, OPT_HELP_NUM }, + {OPT_HUGE_DIR, 1, NULL, OPT_HUGE_DIR_NUM }, + {OPT_HUGE_UNLINK, 0, NULL, OPT_HUGE_UNLINK_NUM }, + {OPT_LCORES, 1, NULL, OPT_LCORES_NUM }, + {OPT_LOG_LEVEL, 1, NULL, OPT_LOG_LEVEL_NUM }, + {OPT_MASTER_LCORE, 1, NULL, OPT_MASTER_LCORE_NUM }, + {OPT_MBUF_POOL_OPS_NAME, 1, NULL, OPT_MBUF_POOL_OPS_NAME_NUM}, + {OPT_NO_HPET, 0, NULL, OPT_NO_HPET_NUM }, + {OPT_NO_HUGE, 0, NULL, OPT_NO_HUGE_NUM }, + {OPT_NO_PCI, 0, NULL, OPT_NO_PCI_NUM }, + {OPT_NO_SHCONF, 0, NULL, OPT_NO_SHCONF_NUM }, + {OPT_IN_MEMORY, 0, NULL, OPT_IN_MEMORY_NUM }, + {OPT_PCI_BLACKLIST, 1, NULL, OPT_PCI_BLACKLIST_NUM }, + {OPT_PCI_WHITELIST, 1, NULL, OPT_PCI_WHITELIST_NUM }, + {OPT_PROC_TYPE, 1, NULL, OPT_PROC_TYPE_NUM }, + {OPT_SOCKET_MEM, 1, NULL, OPT_SOCKET_MEM_NUM }, + {OPT_SOCKET_LIMIT, 1, NULL, OPT_SOCKET_LIMIT_NUM }, + {OPT_SYSLOG, 1, NULL, OPT_SYSLOG_NUM }, + {OPT_VDEV, 1, NULL, OPT_VDEV_NUM }, + {OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM }, + {OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM }, + {OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM }, + {OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM}, + {0, 0, NULL, 0 } +}; + +TAILQ_HEAD(shared_driver_list, shared_driver); + +/* Definition for shared object drivers. 
*/ +struct shared_driver { + TAILQ_ENTRY(shared_driver) next; + + char name[PATH_MAX]; + void* lib_handle; +}; + +/* List of external loadable drivers */ +static struct shared_driver_list solib_list = +TAILQ_HEAD_INITIALIZER(solib_list); + +/* Default path of external loadable drivers */ +static const char *default_solib_dir = RTE_EAL_PMD_PATH; + +/* + * Stringified version of solib path used by dpdk-pmdinfo.py + * Note: PLEASE DO NOT ALTER THIS without making a corresponding + * change to usertools/dpdk-pmdinfo.py + */ +static const char dpdk_solib_path[] __attribute__((used)) = +"DPDK_PLUGIN_PATH=" RTE_EAL_PMD_PATH; + +TAILQ_HEAD(device_option_list, device_option); + +struct device_option { + TAILQ_ENTRY(device_option) next; + + enum rte_devtype type; + char arg[]; +}; + +static struct device_option_list devopt_list = +TAILQ_HEAD_INITIALIZER(devopt_list); + +static int master_lcore_parsed; +static int mem_parsed; +static int core_parsed; + +static int +eal_option_device_add(enum rte_devtype type, const char *optarg) +{ + struct device_option *devopt; + size_t optlen; + int ret; + + optlen = strlen(optarg) + 1; + devopt = calloc(1, sizeof(*devopt) + optlen); + if (devopt == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate device option\n"); + return -ENOMEM; + } + + devopt->type = type; + ret = snprintf(devopt->arg, optlen, "%s", optarg); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Unable to copy device option\n"); + free(devopt); + return -EINVAL; + } + TAILQ_INSERT_TAIL(&devopt_list, devopt, next); + return 0; +} + +int +eal_option_device_parse(void) +{ + struct device_option *devopt; + void *tmp; + int ret = 0; + + TAILQ_FOREACH_SAFE(devopt, &devopt_list, next, tmp) { + if (ret == 0) { + ret = rte_devargs_add(devopt->type, devopt->arg); + if (ret) + RTE_LOG(ERR, EAL, "Unable to parse device '%s'\n", + devopt->arg); + } + TAILQ_REMOVE(&devopt_list, devopt, next); + free(devopt); + } + return ret; +} + +void +eal_reset_internal_config(struct internal_config *internal_cfg) +{ + int i; + + internal_cfg->memory = 0; + internal_cfg->force_nrank = 0; + internal_cfg->force_nchannel = 0; + internal_cfg->hugefile_prefix = HUGEFILE_PREFIX_DEFAULT; + internal_cfg->hugepage_dir = NULL; + internal_cfg->force_sockets = 0; + /* zero out the NUMA config */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->socket_mem[i] = 0; + internal_cfg->force_socket_limits = 0; + /* zero out the NUMA limits config */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->socket_limit[i] = 0; + /* zero out hugedir descriptors */ + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) { + memset(&internal_cfg->hugepage_info[i], 0, + sizeof(internal_cfg->hugepage_info[0])); + internal_cfg->hugepage_info[i].lock_descriptor = -1; + } + internal_cfg->base_virtaddr = 0; + + internal_cfg->syslog_facility = LOG_DAEMON; + + /* if set to NONE, interrupt mode is determined automatically */ + internal_cfg->vfio_intr_mode = RTE_INTR_MODE_NONE; + +#ifdef RTE_LIBEAL_USE_HPET + internal_cfg->no_hpet = 0; +#else + internal_cfg->no_hpet = 1; +#endif + internal_cfg->vmware_tsc_map = 0; + internal_cfg->create_uio_dev = 0; + internal_cfg->user_mbuf_pool_ops_name = NULL; + internal_cfg->init_complete = 0; +} + +static int +eal_plugin_add(const char *path) +{ + struct shared_driver *solib; + + solib = malloc(sizeof(*solib)); + if (solib == NULL) { + RTE_LOG(ERR, EAL, "malloc(solib) failed\n"); + return -1; + } + memset(solib, 0, sizeof(*solib)); + strncpy(solib->name, path, PATH_MAX-1); + solib->name[PATH_MAX-1] = 0; + 
TAILQ_INSERT_TAIL(&solib_list, solib, next); + + return 0; +} + +static int +eal_plugindir_init(const char *path) +{ + DIR *d = NULL; + struct dirent *dent = NULL; + char sopath[PATH_MAX]; + + if (path == NULL || *path == '\0') + return 0; + + d = opendir(path); + if (d == NULL) { + RTE_LOG(ERR, EAL, "failed to open directory %s: %s\n", + path, strerror(errno)); + return -1; + } + + while ((dent = readdir(d)) != NULL) { + struct stat sb; + + snprintf(sopath, PATH_MAX-1, "%s/%s", path, dent->d_name); + sopath[PATH_MAX-1] = 0; + + if (!(stat(sopath, &sb) == 0 && S_ISREG(sb.st_mode))) + continue; + + if (eal_plugin_add(sopath) == -1) + break; + } + + closedir(d); + /* XXX this ignores failures from readdir() itself */ + return (dent == NULL) ? 0 : -1; +} + +int +eal_plugins_init(void) +{ + struct shared_driver *solib = NULL; + struct stat sb; + + if (*default_solib_dir != '\0' && stat(default_solib_dir, &sb) == 0 && + S_ISDIR(sb.st_mode)) + eal_plugin_add(default_solib_dir); + + TAILQ_FOREACH(solib, &solib_list, next) { + + if (stat(solib->name, &sb) == 0 && S_ISDIR(sb.st_mode)) { + if (eal_plugindir_init(solib->name) == -1) { + RTE_LOG(ERR, EAL, + "Cannot init plugin directory %s\n", + solib->name); + return -1; + } + } else { + RTE_LOG(DEBUG, EAL, "open shared lib %s\n", + solib->name); + solib->lib_handle = dlopen(solib->name, RTLD_NOW); + if (solib->lib_handle == NULL) { + RTE_LOG(ERR, EAL, "%s\n", dlerror()); + return -1; + } + } + + } + return 0; +} + +/* + * Parse the coremask given as argument (hexadecimal string) and fill + * the global configuration (core role and core count) with the parsed + * value. + */ +static int xdigit2val(unsigned char c) +{ + int val; + + if (isdigit(c)) + val = c - '0'; + else if (isupper(c)) + val = c - 'A' + 10; + else + val = c - 'a' + 10; + return val; +} + +static int +eal_parse_service_coremask(const char *coremask) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + int i, j, idx = 0; + unsigned int count = 0; + char c; + int val; + uint32_t taken_lcore_count = 0; + + if (coremask == NULL) + return -1; + /* Remove all blank characters ahead and after . + * Remove 0x/0X if exists. + */ + while (isblank(*coremask)) + coremask++; + if (coremask[0] == '0' && ((coremask[1] == 'x') + || (coremask[1] == 'X'))) + coremask += 2; + i = strlen(coremask); + while ((i > 0) && isblank(coremask[i - 1])) + i--; + + if (i == 0) + return -1; + + for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE; i--) { + c = coremask[i]; + if (isxdigit(c) == 0) { + /* invalid characters */ + return -1; + } + val = xdigit2val(c); + for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE; + j++, idx++) { + if ((1 << j) & val) { + /* handle master lcore already parsed */ + uint32_t lcore = idx; + if (master_lcore_parsed && + cfg->master_lcore == lcore) { + RTE_LOG(ERR, EAL, + "lcore %u is master lcore, cannot use as service core\n", + idx); + return -1; + } + + if (!lcore_config[idx].detected) { + RTE_LOG(ERR, EAL, + "lcore %u unavailable\n", idx); + return -1; + } + + if (cfg->lcore_role[idx] == ROLE_RTE) + taken_lcore_count++; + + lcore_config[idx].core_role = ROLE_SERVICE; + count++; + } + } + } + + for (; i >= 0; i--) + if (coremask[i] != '0') + return -1; + + for (; idx < RTE_MAX_LCORE; idx++) + lcore_config[idx].core_index = -1; + + if (count == 0) + return -1; + + if (core_parsed && taken_lcore_count != count) { + RTE_LOG(WARNING, EAL, + "Not all service cores are in the coremask. 
" + "Please ensure -c or -l includes service cores\n"); + } + + cfg->service_lcore_count = count; + return 0; +} + +static int +eal_service_cores_parsed(void) +{ + int idx; + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (lcore_config[idx].core_role == ROLE_SERVICE) + return 1; + } + return 0; +} + +static int +eal_parse_coremask(const char *coremask) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + int i, j, idx = 0; + unsigned count = 0; + char c; + int val; + + if (eal_service_cores_parsed()) + RTE_LOG(WARNING, EAL, + "Service cores parsed before dataplane cores. " + "Please ensure -c is before -s or -S\n"); + + if (coremask == NULL) + return -1; + /* Remove all blank characters ahead and after . + * Remove 0x/0X if exists. + */ + while (isblank(*coremask)) + coremask++; + if (coremask[0] == '0' && ((coremask[1] == 'x') + || (coremask[1] == 'X'))) + coremask += 2; + i = strlen(coremask); + while ((i > 0) && isblank(coremask[i - 1])) + i--; + if (i == 0) + return -1; + + for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE; i--) { + c = coremask[i]; + if (isxdigit(c) == 0) { + /* invalid characters */ + return -1; + } + val = xdigit2val(c); + for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE; j++, idx++) + { + if ((1 << j) & val) { + if (!lcore_config[idx].detected) { + RTE_LOG(ERR, EAL, "lcore %u " + "unavailable\n", idx); + return -1; + } + + cfg->lcore_role[idx] = ROLE_RTE; + lcore_config[idx].core_index = count; + count++; + } else { + cfg->lcore_role[idx] = ROLE_OFF; + lcore_config[idx].core_index = -1; + } + } + } + for (; i >= 0; i--) + if (coremask[i] != '0') + return -1; + for (; idx < RTE_MAX_LCORE; idx++) { + cfg->lcore_role[idx] = ROLE_OFF; + lcore_config[idx].core_index = -1; + } + if (count == 0) + return -1; + /* Update the count of enabled logical cores of the EAL configuration */ + cfg->lcore_count = count; + return 0; +} + +static int +eal_parse_service_corelist(const char *corelist) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + int i, idx = 0; + unsigned count = 0; + char *end = NULL; + int min, max; + uint32_t taken_lcore_count = 0; + + if (corelist == NULL) + return -1; + + /* Remove all blank characters ahead and after */ + while (isblank(*corelist)) + corelist++; + i = strlen(corelist); + while ((i > 0) && isblank(corelist[i - 1])) + i--; + + /* Get list of cores */ + min = RTE_MAX_LCORE; + do { + while (isblank(*corelist)) + corelist++; + if (*corelist == '\0') + return -1; + errno = 0; + idx = strtoul(corelist, &end, 10); + if (errno || end == NULL) + return -1; + while (isblank(*end)) + end++; + if (*end == '-') { + min = idx; + } else if ((*end == ',') || (*end == '\0')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = min; idx <= max; idx++) { + if (cfg->lcore_role[idx] != ROLE_SERVICE) { + /* handle master lcore already parsed */ + uint32_t lcore = idx; + if (cfg->master_lcore == lcore && + master_lcore_parsed) { + RTE_LOG(ERR, EAL, + "Error: lcore %u is master lcore, cannot use as service core\n", + idx); + return -1; + } + if (cfg->lcore_role[idx] == ROLE_RTE) + taken_lcore_count++; + + lcore_config[idx].core_role = + ROLE_SERVICE; + count++; + } + } + min = RTE_MAX_LCORE; + } else + return -1; + corelist = end + 1; + } while (*end != '\0'); + + if (count == 0) + return -1; + + if (core_parsed && taken_lcore_count != count) { + RTE_LOG(WARNING, EAL, + "Not all service cores were in the coremask. 
" + "Please ensure -c or -l includes service cores\n"); + } + + return 0; +} + +static int +eal_parse_corelist(const char *corelist) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + int i, idx = 0; + unsigned count = 0; + char *end = NULL; + int min, max; + + if (eal_service_cores_parsed()) + RTE_LOG(WARNING, EAL, + "Service cores parsed before dataplane cores. " + "Please ensure -l is before -s or -S\n"); + + if (corelist == NULL) + return -1; + + /* Remove all blank characters ahead and after */ + while (isblank(*corelist)) + corelist++; + i = strlen(corelist); + while ((i > 0) && isblank(corelist[i - 1])) + i--; + + /* Reset config */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + cfg->lcore_role[idx] = ROLE_OFF; + lcore_config[idx].core_index = -1; + } + + /* Get list of cores */ + min = RTE_MAX_LCORE; + do { + while (isblank(*corelist)) + corelist++; + if (*corelist == '\0') + return -1; + errno = 0; + idx = strtoul(corelist, &end, 10); + if (errno || end == NULL) + return -1; + while (isblank(*end)) + end++; + if (*end == '-') { + min = idx; + } else if ((*end == ',') || (*end == '\0')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = min; idx <= max; idx++) { + if (cfg->lcore_role[idx] != ROLE_RTE) { + cfg->lcore_role[idx] = ROLE_RTE; + lcore_config[idx].core_index = count; + count++; + } + } + min = RTE_MAX_LCORE; + } else + return -1; + corelist = end + 1; + } while (*end != '\0'); + + if (count == 0) + return -1; + + /* Update the count of enabled logical cores of the EAL configuration */ + cfg->lcore_count = count; + + return 0; +} + +/* Changes the lcore id of the master thread */ +static int +eal_parse_master_lcore(const char *arg) +{ + char *parsing_end; + struct rte_config *cfg = rte_eal_get_configuration(); + + errno = 0; + cfg->master_lcore = (uint32_t) strtol(arg, &parsing_end, 0); + if (errno || parsing_end[0] != 0) + return -1; + if (cfg->master_lcore >= RTE_MAX_LCORE) + return -1; + master_lcore_parsed = 1; + + /* ensure master core is not used as service core */ + if (lcore_config[cfg->master_lcore].core_role == ROLE_SERVICE) { + RTE_LOG(ERR, EAL, + "Error: Master lcore is used as a service core\n"); + return -1; + } + + return 0; +} + +/* + * Parse elem, the elem could be single number/range or '(' ')' group + * 1) A single number elem, it's just a simple digit. e.g. 9 + * 2) A single range elem, two digits with a '-' between. e.g. 2-6 + * 3) A group elem, combines multiple 1) or 2) with '( )'. e.g (0,2-4,6) + * Within group elem, '-' used for a range separator; + * ',' used for a single number. 
+ */ +static int +eal_parse_set(const char *input, uint16_t set[], unsigned num) +{ + unsigned idx; + const char *str = input; + char *end = NULL; + unsigned min, max; + + memset(set, 0, num * sizeof(uint16_t)); + + while (isblank(*str)) + str++; + + /* only digit or left bracket is qualify for start point */ + if ((!isdigit(*str) && *str != '(') || *str == '\0') + return -1; + + /* process single number or single range of number */ + if (*str != '(') { + errno = 0; + idx = strtoul(str, &end, 10); + if (errno || end == NULL || idx >= num) + return -1; + else { + while (isblank(*end)) + end++; + + min = idx; + max = idx; + if (*end == '-') { + /* process single - */ + end++; + while (isblank(*end)) + end++; + if (!isdigit(*end)) + return -1; + + errno = 0; + idx = strtoul(end, &end, 10); + if (errno || end == NULL || idx >= num) + return -1; + max = idx; + while (isblank(*end)) + end++; + if (*end != ',' && *end != '\0') + return -1; + } + + if (*end != ',' && *end != '\0' && + *end != '@') + return -1; + + for (idx = RTE_MIN(min, max); + idx <= RTE_MAX(min, max); idx++) + set[idx] = 1; + + return end - input; + } + } + + /* process set within bracket */ + str++; + while (isblank(*str)) + str++; + if (*str == '\0') + return -1; + + min = RTE_MAX_LCORE; + do { + + /* go ahead to the first digit */ + while (isblank(*str)) + str++; + if (!isdigit(*str)) + return -1; + + /* get the digit value */ + errno = 0; + idx = strtoul(str, &end, 10); + if (errno || end == NULL || idx >= num) + return -1; + + /* go ahead to separator '-',',' and ')' */ + while (isblank(*end)) + end++; + if (*end == '-') { + if (min == RTE_MAX_LCORE) + min = idx; + else /* avoid continuous '-' */ + return -1; + } else if ((*end == ',') || (*end == ')')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = RTE_MIN(min, max); + idx <= RTE_MAX(min, max); idx++) + set[idx] = 1; + + min = RTE_MAX_LCORE; + } else + return -1; + + str = end + 1; + } while (*end != '\0' && *end != ')'); + + /* + * to avoid failure that tail blank makes end character check fail + * in eal_parse_lcores( ) + */ + while (isblank(*str)) + str++; + + return str - input; +} + +/* convert from set array to cpuset bitmap */ +static int +convert_to_cpuset(rte_cpuset_t *cpusetp, + uint16_t *set, unsigned num) +{ + unsigned idx; + + CPU_ZERO(cpusetp); + + for (idx = 0; idx < num; idx++) { + if (!set[idx]) + continue; + + if (!lcore_config[idx].detected) { + RTE_LOG(ERR, EAL, "core %u " + "unavailable\n", idx); + return -1; + } + + CPU_SET(idx, cpusetp); + } + + return 0; +} + +/* + * The format pattern: --lcores='[<,lcores[@cpus]>...]' + * lcores, cpus could be a single digit/range or a group. + * '(' and ')' are necessary if it's a group. + * If not supply '@cpus', the value of cpus uses the same as lcores. + * e.g. 
'1,2@(5-7),(3-5)@(0,2),(0,6),7-8' means start 9 EAL thread as below + * lcore 0 runs on cpuset 0x41 (cpu 0,6) + * lcore 1 runs on cpuset 0x2 (cpu 1) + * lcore 2 runs on cpuset 0xe0 (cpu 5,6,7) + * lcore 3,4,5 runs on cpuset 0x5 (cpu 0,2) + * lcore 6 runs on cpuset 0x41 (cpu 0,6) + * lcore 7 runs on cpuset 0x80 (cpu 7) + * lcore 8 runs on cpuset 0x100 (cpu 8) + */ +static int +eal_parse_lcores(const char *lcores) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + static uint16_t set[RTE_MAX_LCORE]; + unsigned idx = 0; + unsigned count = 0; + const char *lcore_start = NULL; + const char *end = NULL; + int offset; + rte_cpuset_t cpuset; + int lflags; + int ret = -1; + + if (lcores == NULL) + return -1; + + /* Remove all blank characters ahead and after */ + while (isblank(*lcores)) + lcores++; + + CPU_ZERO(&cpuset); + + /* Reset lcore config */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + cfg->lcore_role[idx] = ROLE_OFF; + lcore_config[idx].core_index = -1; + CPU_ZERO(&lcore_config[idx].cpuset); + } + + /* Get list of cores */ + do { + while (isblank(*lcores)) + lcores++; + if (*lcores == '\0') + goto err; + + lflags = 0; + + /* record lcore_set start point */ + lcore_start = lcores; + + /* go across a complete bracket */ + if (*lcore_start == '(') { + lcores += strcspn(lcores, ")"); + if (*lcores++ == '\0') + goto err; + } + + /* scan the separator '@', ','(next) or '\0'(finish) */ + lcores += strcspn(lcores, "@,"); + + if (*lcores == '@') { + /* explicit assign cpu_set */ + offset = eal_parse_set(lcores + 1, set, RTE_DIM(set)); + if (offset < 0) + goto err; + + /* prepare cpu_set and update the end cursor */ + if (0 > convert_to_cpuset(&cpuset, + set, RTE_DIM(set))) + goto err; + end = lcores + 1 + offset; + } else { /* ',' or '\0' */ + /* haven't given cpu_set, current loop done */ + end = lcores; + + /* go back to check - */ + offset = strcspn(lcore_start, "(-"); + if (offset < (end - lcore_start) && + *(lcore_start + offset) != '(') + lflags = 1; + } + + if (*end != ',' && *end != '\0') + goto err; + + /* parse lcore_set from start point */ + if (0 > eal_parse_set(lcore_start, set, RTE_DIM(set))) + goto err; + + /* without '@', by default using lcore_set as cpu_set */ + if (*lcores != '@' && + 0 > convert_to_cpuset(&cpuset, set, RTE_DIM(set))) + goto err; + + /* start to update lcore_set */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (!set[idx]) + continue; + + if (cfg->lcore_role[idx] != ROLE_RTE) { + lcore_config[idx].core_index = count; + cfg->lcore_role[idx] = ROLE_RTE; + count++; + } + + if (lflags) { + CPU_ZERO(&cpuset); + CPU_SET(idx, &cpuset); + } + rte_memcpy(&lcore_config[idx].cpuset, &cpuset, + sizeof(rte_cpuset_t)); + } + + lcores = end + 1; + } while (*end != '\0'); + + if (count == 0) + goto err; + + cfg->lcore_count = count; + ret = 0; + +err: + + return ret; +} + +static int +eal_parse_syslog(const char *facility, struct internal_config *conf) +{ + int i; + static const struct { + const char *name; + int value; + } map[] = { + { "auth", LOG_AUTH }, + { "cron", LOG_CRON }, + { "daemon", LOG_DAEMON }, + { "ftp", LOG_FTP }, + { "kern", LOG_KERN }, + { "lpr", LOG_LPR }, + { "mail", LOG_MAIL }, + { "news", LOG_NEWS }, + { "syslog", LOG_SYSLOG }, + { "user", LOG_USER }, + { "uucp", LOG_UUCP }, + { "local0", LOG_LOCAL0 }, + { "local1", LOG_LOCAL1 }, + { "local2", LOG_LOCAL2 }, + { "local3", LOG_LOCAL3 }, + { "local4", LOG_LOCAL4 }, + { "local5", LOG_LOCAL5 }, + { "local6", LOG_LOCAL6 }, + { "local7", LOG_LOCAL7 }, + { NULL, 0 } + }; + + for (i = 0; 
map[i].name; i++) { + if (!strcmp(facility, map[i].name)) { + conf->syslog_facility = map[i].value; + return 0; + } + } + return -1; +} + +static int +eal_parse_log_priority(const char *level) +{ + static const char * const levels[] = { + [RTE_LOG_EMERG] = "emergency", + [RTE_LOG_ALERT] = "alert", + [RTE_LOG_CRIT] = "critical", + [RTE_LOG_ERR] = "error", + [RTE_LOG_WARNING] = "warning", + [RTE_LOG_NOTICE] = "notice", + [RTE_LOG_INFO] = "info", + [RTE_LOG_DEBUG] = "debug", + }; + size_t len = strlen(level); + unsigned long tmp; + char *end; + unsigned int i; + + if (len == 0) + return -1; + + /* look for named values, skip 0 which is not a valid level */ + for (i = 1; i < RTE_DIM(levels); i++) { + if (strncmp(levels[i], level, len) == 0) + return i; + } + + /* not a string, maybe it is numeric */ + errno = 0; + tmp = strtoul(level, &end, 0); + + /* check for errors */ + if (errno != 0 || end == NULL || *end != '\0' || + tmp >= UINT32_MAX) + return -1; + + return tmp; +} + +static int +eal_parse_log_level(const char *arg) +{ + const char *pattern = NULL; + const char *regex = NULL; + char *str, *level; + int priority; + + str = strdup(arg); + if (str == NULL) + return -1; + + if ((level = strchr(str, ','))) { + regex = str; + *level++ = '\0'; + } else if ((level = strchr(str, ':'))) { + pattern = str; + *level++ = '\0'; + } else { + level = str; + } + + priority = eal_parse_log_priority(level); + if (priority < 0) { + fprintf(stderr, "invalid log priority: %s\n", level); + goto fail; + } + + if (regex) { + if (rte_log_set_level_regexp(regex, priority) < 0) { + fprintf(stderr, "cannot set log level %s,%d\n", + pattern, priority); + goto fail; + } + if (rte_log_save_regexp(regex, priority) < 0) + goto fail; + } else if (pattern) { + if (rte_log_set_level_pattern(pattern, priority) < 0) { + fprintf(stderr, "cannot set log level %s:%d\n", + pattern, priority); + goto fail; + } + if (rte_log_save_pattern(pattern, priority) < 0) + goto fail; + } else { + rte_log_set_global_level(priority); + } + + free(str); + return 0; + +fail: + free(str); + return -1; +} + +static enum rte_proc_type_t +eal_parse_proc_type(const char *arg) +{ + if (strncasecmp(arg, "primary", sizeof("primary")) == 0) + return RTE_PROC_PRIMARY; + if (strncasecmp(arg, "secondary", sizeof("secondary")) == 0) + return RTE_PROC_SECONDARY; + if (strncasecmp(arg, "auto", sizeof("auto")) == 0) + return RTE_PROC_AUTO; + + return RTE_PROC_INVALID; +} + +int +eal_parse_common_option(int opt, const char *optarg, + struct internal_config *conf) +{ + static int b_used; + static int w_used; + + switch (opt) { + /* blacklist */ + case 'b': + if (w_used) + goto bw_used; + if (eal_option_device_add(RTE_DEVTYPE_BLACKLISTED_PCI, + optarg) < 0) { + return -1; + } + b_used = 1; + break; + /* whitelist */ + case 'w': + if (b_used) + goto bw_used; + if (eal_option_device_add(RTE_DEVTYPE_WHITELISTED_PCI, + optarg) < 0) { + return -1; + } + w_used = 1; + break; + /* coremask */ + case 'c': + if (eal_parse_coremask(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid coremask\n"); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option -c is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_LST) ? "-l" : + (core_parsed == LCORE_OPT_MAP) ? 
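The log-level parser above accepts a bare level (named, abbreviated to a unique prefix, or numeric), a "regex,level" pair, or a "pattern:level" pair. The sketch below (illustrative values, not part of the patch) shows the public rte_log calls each form ultimately resolves to.

	/* Sketch: the three --log-level forms and their public-API equivalents. */
	#include <rte_log.h>

	static void
	log_level_examples(void)
	{
		/* "--log-level=notice" or "--log-level=6": global level */
		rte_log_set_global_level(RTE_LOG_NOTICE);

		/* "--log-level=pmd.net.*:debug": glob pattern, ':' separator */
		rte_log_set_level_pattern("pmd.net.*", RTE_LOG_DEBUG);

		/* "--log-level=lib\.eal.*,8": POSIX regex, ',' separator */
		rte_log_set_level_regexp("lib\\.eal.*", RTE_LOG_DEBUG);
	}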
"--lcore" : + "-c"); + return -1; + } + + core_parsed = LCORE_OPT_MSK; + break; + /* corelist */ + case 'l': + if (eal_parse_corelist(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid core list\n"); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option -l is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_MSK) ? "-c" : + (core_parsed == LCORE_OPT_MAP) ? "--lcore" : + "-l"); + return -1; + } + + core_parsed = LCORE_OPT_LST; + break; + /* service coremask */ + case 's': + if (eal_parse_service_coremask(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid service coremask\n"); + return -1; + } + break; + /* service corelist */ + case 'S': + if (eal_parse_service_corelist(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid service core list\n"); + return -1; + } + break; + /* size of memory */ + case 'm': + conf->memory = atoi(optarg); + conf->memory *= 1024ULL; + conf->memory *= 1024ULL; + mem_parsed = 1; + break; + /* force number of channels */ + case 'n': + conf->force_nchannel = atoi(optarg); + if (conf->force_nchannel == 0) { + RTE_LOG(ERR, EAL, "invalid channel number\n"); + return -1; + } + break; + /* force number of ranks */ + case 'r': + conf->force_nrank = atoi(optarg); + if (conf->force_nrank == 0 || + conf->force_nrank > 16) { + RTE_LOG(ERR, EAL, "invalid rank number\n"); + return -1; + } + break; + /* force loading of external driver */ + case 'd': + if (eal_plugin_add(optarg) == -1) + return -1; + break; + case 'v': + /* since message is explicitly requested by user, we + * write message at highest log level so it can always + * be seen + * even if info or warning messages are disabled */ + RTE_LOG(CRIT, EAL, "RTE Version: '%s'\n", rte_version()); + break; + + /* long options */ + case OPT_HUGE_UNLINK_NUM: + conf->hugepage_unlink = 1; + break; + + case OPT_NO_HUGE_NUM: + conf->no_hugetlbfs = 1; + /* no-huge is legacy mem */ + conf->legacy_mem = 1; + break; + + case OPT_NO_PCI_NUM: + conf->no_pci = 1; + break; + + case OPT_NO_HPET_NUM: + conf->no_hpet = 1; + break; + + case OPT_VMWARE_TSC_MAP_NUM: + conf->vmware_tsc_map = 1; + break; + + case OPT_NO_SHCONF_NUM: + conf->no_shconf = 1; + break; + + case OPT_IN_MEMORY_NUM: + conf->in_memory = 1; + /* in-memory is a superset of noshconf and huge-unlink */ + conf->no_shconf = 1; + conf->hugepage_unlink = 1; + break; + + case OPT_PROC_TYPE_NUM: + conf->process_type = eal_parse_proc_type(optarg); + break; + + case OPT_MASTER_LCORE_NUM: + if (eal_parse_master_lcore(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_MASTER_LCORE "\n"); + return -1; + } + break; + + case OPT_VDEV_NUM: + if (eal_option_device_add(RTE_DEVTYPE_VIRTUAL, + optarg) < 0) { + return -1; + } + break; + + case OPT_SYSLOG_NUM: + if (eal_parse_syslog(optarg, conf) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SYSLOG "\n"); + return -1; + } + break; + + case OPT_LOG_LEVEL_NUM: { + if (eal_parse_log_level(optarg) < 0) { + RTE_LOG(ERR, EAL, + "invalid parameters for --" + OPT_LOG_LEVEL "\n"); + return -1; + } + break; + } + case OPT_LCORES_NUM: + if (eal_parse_lcores(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_LCORES "\n"); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option --lcore is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_LST) ? "-l" : + (core_parsed == LCORE_OPT_MSK) ? 
"-c" : + "--lcore"); + return -1; + } + + core_parsed = LCORE_OPT_MAP; + break; + case OPT_LEGACY_MEM_NUM: + conf->legacy_mem = 1; + break; + case OPT_SINGLE_FILE_SEGMENTS_NUM: + conf->single_file_segments = 1; + break; + + /* don't know what to do, leave this to caller */ + default: + return 1; + + } + + return 0; +bw_used: + RTE_LOG(ERR, EAL, "Options blacklist (-b) and whitelist (-w) " + "cannot be used at the same time\n"); + return -1; +} + +static void +eal_auto_detect_cores(struct rte_config *cfg) +{ + unsigned int lcore_id; + unsigned int removed = 0; + rte_cpuset_t affinity_set; + pthread_t tid = pthread_self(); + + if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), + &affinity_set) < 0) + CPU_ZERO(&affinity_set); + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (cfg->lcore_role[lcore_id] == ROLE_RTE && + !CPU_ISSET(lcore_id, &affinity_set)) { + cfg->lcore_role[lcore_id] = ROLE_OFF; + removed++; + } + } + + cfg->lcore_count -= removed; +} + +int +eal_adjust_config(struct internal_config *internal_cfg) +{ + int i; + struct rte_config *cfg = rte_eal_get_configuration(); + + if (!core_parsed) + eal_auto_detect_cores(cfg); + + if (internal_config.process_type == RTE_PROC_AUTO) + internal_config.process_type = eal_proc_type_detect(); + + /* default master lcore is the first one */ + if (!master_lcore_parsed) { + cfg->master_lcore = rte_get_next_lcore(-1, 0, 0); + lcore_config[cfg->master_lcore].core_role = ROLE_RTE; + } + + /* if no memory amounts were requested, this will result in 0 and + * will be overridden later, right after eal_hugepage_info_init() */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->memory += internal_cfg->socket_mem[i]; + + return 0; +} + +int +eal_check_common_options(struct internal_config *internal_cfg) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (cfg->lcore_role[cfg->master_lcore] != ROLE_RTE) { + RTE_LOG(ERR, EAL, "Master lcore is not enabled for DPDK\n"); + return -1; + } + + if (internal_cfg->process_type == RTE_PROC_INVALID) { + RTE_LOG(ERR, EAL, "Invalid process type specified\n"); + return -1; + } + if (index(internal_cfg->hugefile_prefix, '%') != NULL) { + RTE_LOG(ERR, EAL, "Invalid char, '%%', in --"OPT_FILE_PREFIX" " + "option\n"); + return -1; + } + if (mem_parsed && internal_cfg->force_sockets == 1) { + RTE_LOG(ERR, EAL, "Options -m and --"OPT_SOCKET_MEM" cannot " + "be specified at the same time\n"); + return -1; + } + if (internal_cfg->no_hugetlbfs && internal_cfg->force_sockets == 1) { + RTE_LOG(ERR, EAL, "Option --"OPT_SOCKET_MEM" cannot " + "be specified together with --"OPT_NO_HUGE"\n"); + return -1; + } + if (internal_cfg->no_hugetlbfs && internal_cfg->hugepage_unlink && + !internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot " + "be specified together with --"OPT_NO_HUGE"\n"); + return -1; + } + if (internal_config.force_socket_limits && internal_config.legacy_mem) { + RTE_LOG(ERR, EAL, "Option --"OPT_SOCKET_LIMIT + " is only supported in non-legacy memory mode\n"); + } + if (internal_cfg->single_file_segments && + internal_cfg->hugepage_unlink) { + RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is " + "not compatible with neither --"OPT_IN_MEMORY" nor " + "--"OPT_HUGE_UNLINK"\n"); + return -1; + } + + return 0; +} + +void +eal_common_usage(void) +{ + printf("[options]\n\n" + "EAL common options:\n" + " -c COREMASK Hexadecimal bitmask of cores to run on\n" + " -l CORELIST List of cores to run on\n" + " The argument format is [-c2][,c3[-c4],...]\n" + 
" where c1, c2, etc are core indexes between 0 and %d\n" + " --"OPT_LCORES" COREMAP Map lcore set to physical cpu set\n" + " The argument format is\n" + " '[<,lcores[@cpus]>...]'\n" + " lcores and cpus list are grouped by '(' and ')'\n" + " Within the group, '-' is used for range separator,\n" + " ',' is used for single number separator.\n" + " '( )' can be omitted for single element group,\n" + " '@' can be omitted if cpus and lcores have the same value\n" + " -s SERVICE COREMASK Hexadecimal bitmask of cores to be used as service cores\n" + " --"OPT_MASTER_LCORE" ID Core ID that is used as master\n" + " --"OPT_MBUF_POOL_OPS_NAME" Pool ops name for mbuf to use\n" + " -n CHANNELS Number of memory channels\n" + " -m MB Memory to allocate (see also --"OPT_SOCKET_MEM")\n" + " -r RANKS Force number of memory ranks (don't detect)\n" + " -b, --"OPT_PCI_BLACKLIST" Add a PCI device in black list.\n" + " Prevent EAL from using this PCI device. The argument\n" + " format is .\n" + " -w, --"OPT_PCI_WHITELIST" Add a PCI device in white list.\n" + " Only use the specified PCI devices. The argument format\n" + " is <[domain:]bus:devid.func>. This option can be present\n" + " several times (once per device).\n" + " [NOTE: PCI whitelist cannot be used with -b option]\n" + " --"OPT_VDEV" Add a virtual device.\n" + " The argument format is [,key=val,...]\n" + " (ex: --vdev=net_pcap0,iface=eth2).\n" + " -d LIB.so|DIR Add a driver or driver directory\n" + " (can be used multiple times)\n" + " --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n" + " --"OPT_PROC_TYPE" Type of this process (primary|secondary|auto)\n" + " --"OPT_SYSLOG" Set syslog facility\n" + " --"OPT_LOG_LEVEL"= Set global log level\n" + " --"OPT_LOG_LEVEL"=:\n" + " Set specific log level\n" + " -v Display version information on startup\n" + " -h, --help This help\n" + " --"OPT_IN_MEMORY" Operate entirely in memory. This will\n" + " disable secondary process support\n" + "\nEAL options for DEBUG use only:\n" + " --"OPT_HUGE_UNLINK" Unlink hugepage files after init\n" + " --"OPT_NO_HUGE" Use malloc instead of hugetlbfs\n" + " --"OPT_NO_PCI" Disable PCI\n" + " --"OPT_NO_HPET" Disable HPET\n" + " --"OPT_NO_SHCONF" No shared config (mmap'd files)\n" + "\n", RTE_MAX_LCORE); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c new file mode 100644 index 00000000..9fcb9121 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c @@ -0,0 +1,1181 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_filesystem.h" +#include "eal_internal_cfg.h" + +static int mp_fd = -1; +static char mp_filter[PATH_MAX]; /* Filter for secondary process sockets */ +static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */ +static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER; + +struct action_entry { + TAILQ_ENTRY(action_entry) next; + char action_name[RTE_MP_MAX_NAME_LEN]; + rte_mp_t action; +}; + +/** Double linked list of actions. 
*/ +TAILQ_HEAD(action_entry_list, action_entry); + +static struct action_entry_list action_entry_list = + TAILQ_HEAD_INITIALIZER(action_entry_list); + +enum mp_type { + MP_MSG, /* Share message with peers, will not block */ + MP_REQ, /* Request for information, Will block for a reply */ + MP_REP, /* Response to previously-received request */ + MP_IGN, /* Response telling requester to ignore this response */ +}; + +struct mp_msg_internal { + int type; + struct rte_mp_msg msg; +}; + +struct async_request_param { + rte_mp_async_reply_t clb; + struct rte_mp_reply user_reply; + struct timespec end; + int n_responses_processed; +}; + +struct pending_request { + TAILQ_ENTRY(pending_request) next; + enum { + REQUEST_TYPE_SYNC, + REQUEST_TYPE_ASYNC + } type; + char dst[PATH_MAX]; + struct rte_mp_msg *request; + struct rte_mp_msg *reply; + int reply_received; + RTE_STD_C11 + union { + struct { + struct async_request_param *param; + } async; + struct { + pthread_cond_t cond; + } sync; + }; +}; + +TAILQ_HEAD(pending_request_list, pending_request); + +static struct { + struct pending_request_list requests; + pthread_mutex_t lock; +} pending_requests = { + .requests = TAILQ_HEAD_INITIALIZER(pending_requests.requests), + .lock = PTHREAD_MUTEX_INITIALIZER, + /**< used in async requests only */ +}; + +/* forward declarations */ +static int +mp_send(struct rte_mp_msg *msg, const char *peer, int type); + +/* for use with alarm callback */ +static void +async_reply_handle(void *arg); + +/* for use with process_msg */ +static struct pending_request * +async_reply_handle_thread_unsafe(void *arg); + +static void +trigger_async_action(struct pending_request *req); + +static struct pending_request * +find_pending_request(const char *dst, const char *act_name) +{ + struct pending_request *r; + + TAILQ_FOREACH(r, &pending_requests.requests, next) { + if (!strcmp(r->dst, dst) && + !strcmp(r->request->name, act_name)) + break; + } + + return r; +} + +static void +create_socket_path(const char *name, char *buf, int len) +{ + const char *prefix = eal_mp_socket_path(); + + if (strlen(name) > 0) + snprintf(buf, len, "%s_%s", prefix, name); + else + strlcpy(buf, prefix, len); +} + +int +rte_eal_primary_proc_alive(const char *config_file_path) +{ + int config_fd; + + if (config_file_path) + config_fd = open(config_file_path, O_RDONLY); + else { + const char *path; + + path = eal_runtime_config_path(); + config_fd = open(path, O_RDONLY); + } + if (config_fd < 0) + return 0; + + int ret = lockf(config_fd, F_TEST, 0); + close(config_fd); + + return !!ret; +} + +static struct action_entry * +find_action_entry_by_name(const char *name) +{ + struct action_entry *entry; + + TAILQ_FOREACH(entry, &action_entry_list, next) { + if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0) + break; + } + + return entry; +} + +static int +validate_action_name(const char *name) +{ + if (name == NULL) { + RTE_LOG(ERR, EAL, "Action name cannot be NULL\n"); + rte_errno = EINVAL; + return -1; + } + if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) { + RTE_LOG(ERR, EAL, "Length of action name is zero\n"); + rte_errno = EINVAL; + return -1; + } + if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) { + rte_errno = E2BIG; + return -1; + } + return 0; +} + +int __rte_experimental +rte_mp_action_register(const char *name, rte_mp_t action) +{ + struct action_entry *entry; + + if (validate_action_name(name)) + return -1; + + entry = malloc(sizeof(struct action_entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + return -1; + } + 
strlcpy(entry->action_name, name, sizeof(entry->action_name)); + entry->action = action; + + pthread_mutex_lock(&mp_mutex_action); + if (find_action_entry_by_name(name) != NULL) { + pthread_mutex_unlock(&mp_mutex_action); + rte_errno = EEXIST; + free(entry); + return -1; + } + TAILQ_INSERT_TAIL(&action_entry_list, entry, next); + pthread_mutex_unlock(&mp_mutex_action); + return 0; +} + +void __rte_experimental +rte_mp_action_unregister(const char *name) +{ + struct action_entry *entry; + + if (validate_action_name(name)) + return; + + pthread_mutex_lock(&mp_mutex_action); + entry = find_action_entry_by_name(name); + if (entry == NULL) { + pthread_mutex_unlock(&mp_mutex_action); + return; + } + TAILQ_REMOVE(&action_entry_list, entry, next); + pthread_mutex_unlock(&mp_mutex_action); + free(entry); +} + +static int +read_msg(struct mp_msg_internal *m, struct sockaddr_un *s) +{ + int msglen; + struct iovec iov; + struct msghdr msgh; + char control[CMSG_SPACE(sizeof(m->msg.fds))]; + struct cmsghdr *cmsg; + int buflen = sizeof(*m) - sizeof(m->msg.fds); + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = m; + iov.iov_len = buflen; + + msgh.msg_name = s; + msgh.msg_namelen = sizeof(*s); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + msglen = recvmsg(mp_fd, &msgh, 0); + if (msglen < 0) { + RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno)); + return -1; + } + + if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + RTE_LOG(ERR, EAL, "truncted msg\n"); + return -1; + } + + /* read auxiliary FDs if any */ + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds)); + break; + } + } + + return 0; +} + +static void +process_msg(struct mp_msg_internal *m, struct sockaddr_un *s) +{ + struct pending_request *pending_req; + struct action_entry *entry; + struct rte_mp_msg *msg = &m->msg; + rte_mp_t action = NULL; + + RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name); + + if (m->type == MP_REP || m->type == MP_IGN) { + struct pending_request *req = NULL; + + pthread_mutex_lock(&pending_requests.lock); + pending_req = find_pending_request(s->sun_path, msg->name); + if (pending_req) { + memcpy(pending_req->reply, msg, sizeof(*msg)); + /* -1 indicates that we've been asked to ignore */ + pending_req->reply_received = + m->type == MP_REP ? 1 : -1; + + if (pending_req->type == REQUEST_TYPE_SYNC) + pthread_cond_signal(&pending_req->sync.cond); + else if (pending_req->type == REQUEST_TYPE_ASYNC) + req = async_reply_handle_thread_unsafe( + pending_req); + } else + RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name); + pthread_mutex_unlock(&pending_requests.lock); + + if (req != NULL) + trigger_async_action(req); + return; + } + + pthread_mutex_lock(&mp_mutex_action); + entry = find_action_entry_by_name(msg->name); + if (entry != NULL) + action = entry->action; + pthread_mutex_unlock(&mp_mutex_action); + + if (!action) { + if (m->type == MP_REQ && !internal_config.init_complete) { + /* if this is a request, and init is not yet complete, + * and callback wasn't registered, we should tell the + * requester to ignore our existence because we're not + * yet ready to process this request. 
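rte_mp_action_register() above binds a name to a callback that process_msg() later invokes with the sender's socket path as the peer. A minimal peer-side sketch follows; the action name "app_hello" and the one-byte payload are invented for illustration, and the rte_mp_t callback signature is assumed from the public EAL header.

	/* Sketch: register an IPC action that answers every request with a
	 * one-byte status via rte_mp_reply(). */
	#include <string.h>
	#include <rte_eal.h>
	#include <rte_string_fns.h>

	static int
	hello_action(const struct rte_mp_msg *msg, const void *peer)
	{
		struct rte_mp_msg reply;

		memset(&reply, 0, sizeof(reply));
		strlcpy(reply.name, msg->name, sizeof(reply.name));
		reply.len_param = 1;
		reply.param[0] = 0;	/* status byte for the requester */

		return rte_mp_reply(&reply, (const char *)peer);
	}

	static int
	register_hello(void)
	{
		return rte_mp_action_register("app_hello", hello_action);
	}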
+ */ + struct rte_mp_msg dummy; + + memset(&dummy, 0, sizeof(dummy)); + strlcpy(dummy.name, msg->name, sizeof(dummy.name)); + mp_send(&dummy, s->sun_path, MP_IGN); + } else { + RTE_LOG(ERR, EAL, "Cannot find action: %s\n", + msg->name); + } + } else if (action(msg, s->sun_path) < 0) { + RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name); + } +} + +static void * +mp_handle(void *arg __rte_unused) +{ + struct mp_msg_internal msg; + struct sockaddr_un sa; + + while (1) { + if (read_msg(&msg, &sa) == 0) + process_msg(&msg, &sa); + } + + return NULL; +} + +static int +timespec_cmp(const struct timespec *a, const struct timespec *b) +{ + if (a->tv_sec < b->tv_sec) + return -1; + if (a->tv_sec > b->tv_sec) + return 1; + if (a->tv_nsec < b->tv_nsec) + return -1; + if (a->tv_nsec > b->tv_nsec) + return 1; + return 0; +} + +enum async_action { + ACTION_FREE, /**< free the action entry, but don't trigger callback */ + ACTION_TRIGGER /**< trigger callback, then free action entry */ +}; + +static enum async_action +process_async_request(struct pending_request *sr, const struct timespec *now) +{ + struct async_request_param *param; + struct rte_mp_reply *reply; + bool timeout, last_msg; + + param = sr->async.param; + reply = ¶m->user_reply; + + /* did we timeout? */ + timeout = timespec_cmp(¶m->end, now) <= 0; + + /* if we received a response, adjust relevant data and copy mesasge. */ + if (sr->reply_received == 1 && sr->reply) { + struct rte_mp_msg *msg, *user_msgs, *tmp; + + msg = sr->reply; + user_msgs = reply->msgs; + + tmp = realloc(user_msgs, sizeof(*msg) * + (reply->nb_received + 1)); + if (!tmp) { + RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n", + sr->dst, sr->request->name); + /* this entry is going to be removed and its message + * dropped, but we don't want to leak memory, so + * continue. + */ + } else { + user_msgs = tmp; + reply->msgs = user_msgs; + memcpy(&user_msgs[reply->nb_received], + msg, sizeof(*msg)); + reply->nb_received++; + } + + /* mark this request as processed */ + param->n_responses_processed++; + } else if (sr->reply_received == -1) { + /* we were asked to ignore this process */ + reply->nb_sent--; + } else if (timeout) { + /* count it as processed response, but don't increment + * nb_received. + */ + param->n_responses_processed++; + } + + free(sr->reply); + + last_msg = param->n_responses_processed == reply->nb_sent; + + return last_msg ? ACTION_TRIGGER : ACTION_FREE; +} + +static void +trigger_async_action(struct pending_request *sr) +{ + struct async_request_param *param; + struct rte_mp_reply *reply; + + param = sr->async.param; + reply = ¶m->user_reply; + + param->clb(sr->request, reply); + + /* clean up */ + free(sr->async.param->user_reply.msgs); + free(sr->async.param); + free(sr->request); + free(sr); +} + +static struct pending_request * +async_reply_handle_thread_unsafe(void *arg) +{ + struct pending_request *req = (struct pending_request *)arg; + enum async_action action; + struct timespec ts_now; + struct timeval now; + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + goto no_trigger; + } + ts_now.tv_nsec = now.tv_usec * 1000; + ts_now.tv_sec = now.tv_sec; + + action = process_async_request(req, &ts_now); + + TAILQ_REMOVE(&pending_requests.requests, req, next); + + if (rte_eal_alarm_cancel(async_reply_handle, req) < 0) { + /* if we failed to cancel the alarm because it's already in + * progress, don't proceed because otherwise we will end up + * handling the same message twice. 
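The async timeout handling above compares gettimeofday()-derived timespecs against a precomputed deadline. The helper below (illustrative only, not part of the patch) isolates the nanosecond-carry arithmetic that the request code uses to build such a deadline from a relative timeout.

	/* Sketch: absolute deadline = now (struct timeval) + relative timeout,
	 * carrying nanosecond overflow into seconds. */
	#include <stdint.h>
	#include <sys/time.h>
	#include <time.h>

	static void
	deadline_from_timeout(const struct timeval *now,
		const struct timespec *timeout, struct timespec *end)
	{
		uint64_t nsec = (uint64_t)now->tv_usec * 1000 + timeout->tv_nsec;

		end->tv_nsec = nsec % 1000000000;
		end->tv_sec = now->tv_sec + timeout->tv_sec + nsec / 1000000000;
	}
	/* e.g. now = {100, 900000} (100.9 s) plus timeout = {0, 300000000}
	 * (0.3 s) gives end = {101, 200000000}, i.e. 101.2 s. */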
+ */ + if (rte_errno == EINPROGRESS) { + RTE_LOG(DEBUG, EAL, "Request handling is already in progress\n"); + goto no_trigger; + } + RTE_LOG(ERR, EAL, "Failed to cancel alarm\n"); + } + + if (action == ACTION_TRIGGER) + return req; +no_trigger: + free(req); + return NULL; +} + +static void +async_reply_handle(void *arg) +{ + struct pending_request *req; + + pthread_mutex_lock(&pending_requests.lock); + req = async_reply_handle_thread_unsafe(arg); + pthread_mutex_unlock(&pending_requests.lock); + + if (req != NULL) + trigger_async_action(req); +} + +static int +open_socket_fd(void) +{ + char peer_name[PATH_MAX] = {0}; + struct sockaddr_un un; + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + snprintf(peer_name, sizeof(peer_name), + "%d_%"PRIx64, getpid(), rte_rdtsc()); + + mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0); + if (mp_fd < 0) { + RTE_LOG(ERR, EAL, "failed to create unix socket\n"); + return -1; + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + + create_socket_path(peer_name, un.sun_path, sizeof(un.sun_path)); + + unlink(un.sun_path); /* May still exist since last run */ + + if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + RTE_LOG(ERR, EAL, "failed to bind %s: %s\n", + un.sun_path, strerror(errno)); + close(mp_fd); + return -1; + } + + RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path); + return mp_fd; +} + +static int +unlink_sockets(const char *filter) +{ + int dir_fd; + DIR *mp_dir; + struct dirent *ent; + + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path); + return -1; + } + dir_fd = dirfd(mp_dir); + + while ((ent = readdir(mp_dir))) { + if (fnmatch(filter, ent->d_name, 0) == 0) + unlinkat(dir_fd, ent->d_name, 0); + } + + closedir(mp_dir); + return 0; +} + +int +rte_mp_channel_init(void) +{ + char path[PATH_MAX]; + int dir_fd; + pthread_t mp_handle_tid; + + /* in no shared files mode, we do not have secondary processes support, + * so no need to initialize IPC. + */ + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC will be disabled\n"); + return 0; + } + + /* create filter path */ + create_socket_path("*", path, sizeof(path)); + strlcpy(mp_filter, basename(path), sizeof(mp_filter)); + + /* path may have been modified, so recreate it */ + create_socket_path("*", path, sizeof(path)); + strlcpy(mp_dir_path, dirname(path), sizeof(mp_dir_path)); + + /* lock the directory */ + dir_fd = open(mp_dir_path, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "failed to open %s: %s\n", + mp_dir_path, strerror(errno)); + return -1; + } + + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "failed to lock %s: %s\n", + mp_dir_path, strerror(errno)); + close(dir_fd); + return -1; + } + + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + unlink_sockets(mp_filter)) { + RTE_LOG(ERR, EAL, "failed to unlink mp sockets\n"); + close(dir_fd); + return -1; + } + + if (open_socket_fd() < 0) { + close(dir_fd); + return -1; + } + + if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle", + NULL, mp_handle, NULL) < 0) { + RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", + strerror(errno)); + close(mp_fd); + close(dir_fd); + mp_fd = -1; + return -1; + } + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + close(dir_fd); + + return 0; +} + +/** + * Return -1, as fail to send message and it's caused by the local side. + * Return 0, as fail to send message and it's caused by the remote side. + * Return 1, as succeed to send message. 
+ * + */ +static int +send_msg(const char *dst_path, struct rte_mp_msg *msg, int type) +{ + int snd; + struct iovec iov; + struct msghdr msgh; + struct cmsghdr *cmsg; + struct sockaddr_un dst; + struct mp_msg_internal m; + int fd_size = msg->num_fds * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + + m.type = type; + memcpy(&m.msg, msg, sizeof(*msg)); + + memset(&dst, 0, sizeof(dst)); + dst.sun_family = AF_UNIX; + strlcpy(dst.sun_path, dst_path, sizeof(dst.sun_path)); + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = &m; + iov.iov_len = sizeof(m) - sizeof(msg->fds); + + msgh.msg_name = &dst; + msgh.msg_namelen = sizeof(dst); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), msg->fds, fd_size); + + do { + snd = sendmsg(mp_fd, &msgh, 0); + } while (snd < 0 && errno == EINTR); + + if (snd < 0) { + rte_errno = errno; + /* Check if it caused by peer process exits */ + if (errno == ECONNREFUSED && + rte_eal_process_type() == RTE_PROC_PRIMARY) { + unlink(dst_path); + return 0; + } + if (errno == ENOBUFS) { + RTE_LOG(ERR, EAL, "Peer cannot receive message %s\n", + dst_path); + return 0; + } + RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n", + dst_path, strerror(errno)); + return -1; + } + + return 1; +} + +static int +mp_send(struct rte_mp_msg *msg, const char *peer, int type) +{ + int dir_fd, ret = 0; + DIR *mp_dir; + struct dirent *ent; + + if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY)) + peer = eal_mp_socket_path(); + + if (peer) { + if (send_msg(peer, msg, type) < 0) + return -1; + else + return 0; + } + + /* broadcast to all secondary processes */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", + mp_dir_path); + rte_errno = errno; + return -1; + } + + dir_fd = dirfd(mp_dir); + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + closedir(mp_dir); + return -1; + } + + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + if (send_msg(path, msg, type) < 0) + ret = -1; + } + /* unlock the dir */ + flock(dir_fd, LOCK_UN); + + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + return ret; +} + +static bool +check_input(const struct rte_mp_msg *msg) +{ + if (msg == NULL) { + RTE_LOG(ERR, EAL, "Msg cannot be NULL\n"); + rte_errno = EINVAL; + return false; + } + + if (validate_action_name(msg->name)) + return false; + + if (msg->len_param > RTE_MP_MAX_PARAM_LEN) { + RTE_LOG(ERR, EAL, "Message data is too long\n"); + rte_errno = E2BIG; + return false; + } + + if (msg->num_fds > RTE_MP_MAX_FD_NUM) { + RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", + RTE_MP_MAX_FD_NUM); + rte_errno = E2BIG; + return false; + } + + return true; +} + +int __rte_experimental +rte_mp_sendmsg(struct rte_mp_msg *msg) +{ + if (!check_input(msg)) + return -1; + + RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name); + return mp_send(msg, NULL, MP_MSG); +} + +static int +mp_request_async(const char *dst, struct rte_mp_msg *req, + struct async_request_param *param, const struct timespec *ts) 
+{ + struct rte_mp_msg *reply_msg; + struct pending_request *pending_req, *exist; + int ret; + + pending_req = calloc(1, sizeof(*pending_req)); + reply_msg = calloc(1, sizeof(*reply_msg)); + if (pending_req == NULL || reply_msg == NULL) { + RTE_LOG(ERR, EAL, "Could not allocate space for sync request\n"); + rte_errno = ENOMEM; + ret = -1; + goto fail; + } + + pending_req->type = REQUEST_TYPE_ASYNC; + strlcpy(pending_req->dst, dst, sizeof(pending_req->dst)); + pending_req->request = req; + pending_req->reply = reply_msg; + pending_req->async.param = param; + + /* queue already locked by caller */ + + exist = find_pending_request(dst, req->name); + if (exist) { + RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name); + rte_errno = EEXIST; + ret = -1; + goto fail; + } + + ret = send_msg(dst, req, MP_REQ); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n", + dst, req->name); + ret = -1; + goto fail; + } else if (ret == 0) { + ret = 0; + goto fail; + } + TAILQ_INSERT_TAIL(&pending_requests.requests, pending_req, next); + + param->user_reply.nb_sent++; + + if (rte_eal_alarm_set(ts->tv_sec * 1000000 + ts->tv_nsec / 1000, + async_reply_handle, pending_req) < 0) { + RTE_LOG(ERR, EAL, "Fail to set alarm for request %s:%s\n", + dst, req->name); + rte_panic("Fix the above shit to properly free all memory\n"); + } + + return 0; +fail: + free(pending_req); + free(reply_msg); + return ret; +} + +static int +mp_request_sync(const char *dst, struct rte_mp_msg *req, + struct rte_mp_reply *reply, const struct timespec *ts) +{ + int ret; + struct rte_mp_msg msg, *tmp; + struct pending_request pending_req, *exist; + + pending_req.type = REQUEST_TYPE_SYNC; + pending_req.reply_received = 0; + strlcpy(pending_req.dst, dst, sizeof(pending_req.dst)); + pending_req.request = req; + pending_req.reply = &msg; + pthread_cond_init(&pending_req.sync.cond, NULL); + + exist = find_pending_request(dst, req->name); + if (exist) { + RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name); + rte_errno = EEXIST; + return -1; + } + + ret = send_msg(dst, req, MP_REQ); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n", + dst, req->name); + return -1; + } else if (ret == 0) + return 0; + + TAILQ_INSERT_TAIL(&pending_requests.requests, &pending_req, next); + + reply->nb_sent++; + + do { + ret = pthread_cond_timedwait(&pending_req.sync.cond, + &pending_requests.lock, ts); + } while (ret != 0 && ret != ETIMEDOUT); + + TAILQ_REMOVE(&pending_requests.requests, &pending_req, next); + + if (pending_req.reply_received == 0) { + RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n", + dst, req->name); + rte_errno = ETIMEDOUT; + return -1; + } + if (pending_req.reply_received == -1) { + RTE_LOG(DEBUG, EAL, "Asked to ignore response\n"); + /* not receiving this message is not an error, so decrement + * number of sent messages + */ + reply->nb_sent--; + return 0; + } + + tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1)); + if (!tmp) { + RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n", + dst, req->name); + rte_errno = ENOMEM; + return -1; + } + memcpy(&tmp[reply->nb_received], &msg, sizeof(msg)); + reply->msgs = tmp; + reply->nb_received++; + return 0; +} + +int __rte_experimental +rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, + const struct timespec *ts) +{ + int dir_fd, ret = 0; + DIR *mp_dir; + struct dirent *ent; + struct timeval now; + struct timespec end; + + RTE_LOG(DEBUG, EAL, "request: %s\n", req->name); + + if 
(check_input(req) == false) + return -1; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + return 0; + } + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Faile to get current time\n"); + rte_errno = errno; + return -1; + } + + end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000; + end.tv_sec = now.tv_sec + ts->tv_sec + + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; + + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + + /* for secondary process, send request to the primary process only */ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + pthread_mutex_lock(&pending_requests.lock); + ret = mp_request_sync(eal_mp_socket_path(), req, reply, &end); + pthread_mutex_unlock(&pending_requests.lock); + return ret; + } + + /* for primary process, broadcast request, and collect reply 1 by 1 */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path); + rte_errno = errno; + return -1; + } + + dir_fd = dirfd(mp_dir); + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + closedir(mp_dir); + rte_errno = errno; + return -1; + } + + pthread_mutex_lock(&pending_requests.lock); + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + + /* unlocks the mutex while waiting for response, + * locks on receive + */ + if (mp_request_sync(path, req, reply, &end)) + ret = -1; + } + pthread_mutex_unlock(&pending_requests.lock); + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + return ret; +} + +int __rte_experimental +rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, + rte_mp_async_reply_t clb) +{ + struct rte_mp_msg *copy; + struct pending_request *dummy; + struct async_request_param *param; + struct rte_mp_reply *reply; + int dir_fd, ret = 0; + DIR *mp_dir; + struct dirent *ent; + struct timeval now; + struct timespec *end; + bool dummy_used = false; + + RTE_LOG(DEBUG, EAL, "request: %s\n", req->name); + + if (check_input(req) == false) + return -1; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + return 0; + } + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Faile to get current time\n"); + rte_errno = errno; + return -1; + } + copy = calloc(1, sizeof(*copy)); + dummy = calloc(1, sizeof(*dummy)); + param = calloc(1, sizeof(*param)); + if (copy == NULL || dummy == NULL || param == NULL) { + RTE_LOG(ERR, EAL, "Failed to allocate memory for async reply\n"); + rte_errno = ENOMEM; + goto fail; + } + + /* copy message */ + memcpy(copy, req, sizeof(*copy)); + + param->n_responses_processed = 0; + param->clb = clb; + end = ¶m->end; + reply = ¶m->user_reply; + + end->tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000; + end->tv_sec = now.tv_sec + ts->tv_sec + + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + + /* we have to lock the request queue here, as we will be adding a bunch + * of requests to the queue at once, and some of the replies may arrive + * before we add all of the requests to the queue. 
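rte_mp_request_sync() above broadcasts the request (or, from a secondary process, sends it to the primary only) and blocks until every peer has answered or the relative timeout expires; the reply array it allocates belongs to the caller. A minimal caller sketch, with an invented action name and no payload, follows.

	/* Sketch: synchronous IPC request with a 2-second timeout. The peers are
	 * assumed to have registered an "app_hello" action (see the earlier
	 * callback sketch). */
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>
	#include <rte_eal.h>
	#include <rte_string_fns.h>

	static int
	send_hello_request(void)
	{
		struct rte_mp_msg req;
		struct rte_mp_reply reply;
		struct timespec timeout = { .tv_sec = 2, .tv_nsec = 0 };
		int i, ret;

		memset(&req, 0, sizeof(req));
		memset(&reply, 0, sizeof(reply));
		strlcpy(req.name, "app_hello", sizeof(req.name));

		ret = rte_mp_request_sync(&req, &reply, &timeout);
		if (ret < 0)
			return -1;

		for (i = 0; i < reply.nb_received; i++) {
			/* reply.msgs[i].param[0] holds the peer's status byte */
		}

		free(reply.msgs);	/* reply array is allocated by the EAL */
		return 0;
	}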
+ */ + pthread_mutex_lock(&pending_requests.lock); + + /* we have to ensure that callback gets triggered even if we don't send + * anything, therefore earlier we have allocated a dummy request. fill + * it, and put it on the queue if we don't send any requests. + */ + dummy->type = REQUEST_TYPE_ASYNC; + dummy->request = copy; + dummy->reply = NULL; + dummy->async.param = param; + dummy->reply_received = 1; /* short-circuit the timeout */ + + /* for secondary process, send request to the primary process only */ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + ret = mp_request_async(eal_mp_socket_path(), copy, param, ts); + + /* if we didn't send anything, put dummy request on the queue */ + if (ret == 0 && reply->nb_sent == 0) { + TAILQ_INSERT_TAIL(&pending_requests.requests, dummy, + next); + dummy_used = true; + } + + pthread_mutex_unlock(&pending_requests.lock); + + /* if we couldn't send anything, clean up */ + if (ret != 0) + goto fail; + return 0; + } + + /* for primary process, broadcast request */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path); + rte_errno = errno; + goto unlock_fail; + } + dir_fd = dirfd(mp_dir); + + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + goto closedir_fail; + } + + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + + if (mp_request_async(path, copy, param, ts)) + ret = -1; + } + /* if we didn't send anything, put dummy request on the queue */ + if (ret == 0 && reply->nb_sent == 0) { + TAILQ_INSERT_HEAD(&pending_requests.requests, dummy, next); + dummy_used = true; + } + + /* finally, unlock the queue */ + pthread_mutex_unlock(&pending_requests.lock); + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + + /* if dummy was unused, free it */ + if (!dummy_used) + free(dummy); + + return ret; +closedir_fail: + closedir(mp_dir); +unlock_fail: + pthread_mutex_unlock(&pending_requests.lock); +fail: + free(dummy); + free(param); + free(copy); + return -1; +} + +int __rte_experimental +rte_mp_reply(struct rte_mp_msg *msg, const char *peer) +{ + RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name); + + if (check_input(msg) == false) + return -1; + + if (peer == NULL) { + RTE_LOG(ERR, EAL, "peer is not specified\n"); + rte_errno = EINVAL; + return -1; + } + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + return 0; + } + + return mp_send(msg, peer, MP_REP); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c new file mode 100644 index 00000000..6ac5f828 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include + +#include + +/* split string into tokens */ +int +rte_strsplit(char *string, int stringlen, + char **tokens, int maxtokens, char delim) +{ + int i, tok = 0; + int tokstart = 1; /* first token is right at start of string */ + + if (string == NULL || tokens == NULL) + goto einval_error; + + for (i = 0; i < stringlen; i++) 
{ + if (string[i] == '\0' || tok >= maxtokens) + break; + if (tokstart) { + tokstart = 0; + tokens[tok++] = &string[i]; + } + if (string[i] == delim) { + string[i] = '\0'; + tokstart = 1; + } + } + return tok; + +einval_error: + errno = EINVAL; + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c new file mode 100644 index 00000000..babd3b30 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +TAILQ_HEAD(rte_tailq_elem_head, rte_tailq_elem); +/* local tailq list */ +static struct rte_tailq_elem_head rte_tailq_elem_head = + TAILQ_HEAD_INITIALIZER(rte_tailq_elem_head); + +/* number of tailqs registered, -1 before call to rte_eal_tailqs_init */ +static int rte_tailqs_count = -1; + +struct rte_tailq_head * +rte_eal_tailq_lookup(const char *name) +{ + unsigned i; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + if (name == NULL) + return NULL; + + for (i = 0; i < RTE_MAX_TAILQ; i++) { + if (!strncmp(name, mcfg->tailq_head[i].name, + RTE_TAILQ_NAMESIZE-1)) + return &mcfg->tailq_head[i]; + } + + return NULL; +} + +void +rte_dump_tailq(FILE *f) +{ + struct rte_mem_config *mcfg; + unsigned i = 0; + + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_read_lock(&mcfg->qlock); + for (i = 0; i < RTE_MAX_TAILQ; i++) { + const struct rte_tailq_head *tailq = &mcfg->tailq_head[i]; + const struct rte_tailq_entry_head *head = &tailq->tailq_head; + + fprintf(f, "Tailq %u: qname:<%s>, tqh_first:%p, tqh_last:%p\n", + i, tailq->name, head->tqh_first, head->tqh_last); + } + rte_rwlock_read_unlock(&mcfg->qlock); +} + +static struct rte_tailq_head * +rte_eal_tailq_create(const char *name) +{ + struct rte_tailq_head *head = NULL; + + if (!rte_eal_tailq_lookup(name) && + (rte_tailqs_count + 1 < RTE_MAX_TAILQ)) { + struct rte_mem_config *mcfg; + + mcfg = rte_eal_get_configuration()->mem_config; + head = &mcfg->tailq_head[rte_tailqs_count]; + snprintf(head->name, sizeof(head->name) - 1, "%s", name); + TAILQ_INIT(&head->tailq_head); + rte_tailqs_count++; + } + + return head; +} + +/* local register, used to store "early" tailqs before rte_eal_init() and to + * ensure secondary process only registers tailqs once. 
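rte_strsplit() above tokenizes in place, overwriting each delimiter with '\0' and returning the number of tokens found; a short usage sketch (illustrative input) is below.

	/* Sketch: split a comma-separated list in place with rte_strsplit(). */
	#include <stdio.h>
	#include <rte_string_fns.h>

	static void
	strsplit_example(void)
	{
		char line[] = "eth0,eth1,eth2";
		char *tok[4];
		int i, n;

		n = rte_strsplit(line, sizeof(line), tok, 4, ',');
		/* n == 3; line now holds "eth0\0eth1\0eth2" */
		for (i = 0; i < n; i++)
			printf("token %d: %s\n", i, tok[i]);
	}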
*/ +static int +rte_eal_tailq_local_register(struct rte_tailq_elem *t) +{ + struct rte_tailq_elem *temp; + + TAILQ_FOREACH(temp, &rte_tailq_elem_head, next) { + if (!strncmp(t->name, temp->name, sizeof(temp->name))) + return -1; + } + + TAILQ_INSERT_TAIL(&rte_tailq_elem_head, t, next); + return 0; +} + +static void +rte_eal_tailq_update(struct rte_tailq_elem *t) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* primary process is the only one that creates */ + t->head = rte_eal_tailq_create(t->name); + } else { + t->head = rte_eal_tailq_lookup(t->name); + } +} + +int +rte_eal_tailq_register(struct rte_tailq_elem *t) +{ + if (rte_eal_tailq_local_register(t) < 0) { + RTE_LOG(ERR, EAL, + "%s tailq is already registered\n", t->name); + goto error; + } + + /* if a register happens after rte_eal_tailqs_init(), then we can update + * tailq head */ + if (rte_tailqs_count >= 0) { + rte_eal_tailq_update(t); + if (t->head == NULL) { + RTE_LOG(ERR, EAL, + "Cannot initialize tailq: %s\n", t->name); + TAILQ_REMOVE(&rte_tailq_elem_head, t, next); + goto error; + } + } + + return 0; + +error: + t->head = NULL; + return -1; +} + +int +rte_eal_tailqs_init(void) +{ + struct rte_tailq_elem *t; + + rte_tailqs_count = 0; + + TAILQ_FOREACH(t, &rte_tailq_elem_head, next) { + /* second part of register job for "early" tailqs, see + * rte_eal_tailq_register and EAL_REGISTER_TAILQ */ + rte_eal_tailq_update(t); + if (t->head == NULL) { + RTE_LOG(ERR, EAL, + "Cannot initialize tailq: %s\n", t->name); + /* TAILQ_REMOVE not needed, error is already fatal */ + goto fail; + } + } + + return 0; + +fail: + rte_dump_tailq(stderr); + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c new file mode 100644 index 00000000..48ef4d6d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DECLARE_PER_LCORE(unsigned , _socket_id); + +unsigned rte_socket_id(void) +{ + return RTE_PER_LCORE(_socket_id); +} + +int +rte_lcore_has_role(unsigned int lcore_id, enum rte_lcore_role_t role) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (lcore_id >= RTE_MAX_LCORE) + return -EINVAL; + + return cfg->lcore_role[lcore_id] == role; +} + +int eal_cpuset_socket_id(rte_cpuset_t *cpusetp) +{ + unsigned cpu = 0; + int socket_id = SOCKET_ID_ANY; + int sid; + + if (cpusetp == NULL) + return SOCKET_ID_ANY; + + do { + if (!CPU_ISSET(cpu, cpusetp)) + continue; + + if (socket_id == SOCKET_ID_ANY) + socket_id = eal_cpu_socket_id(cpu); + + sid = eal_cpu_socket_id(cpu); + if (socket_id != sid) { + socket_id = SOCKET_ID_ANY; + break; + } + + } while (++cpu < RTE_MAX_LCORE); + + return socket_id; +} + +int +rte_thread_set_affinity(rte_cpuset_t *cpusetp) +{ + int s; + unsigned lcore_id; + pthread_t tid; + + tid = pthread_self(); + + s = pthread_setaffinity_np(tid, sizeof(rte_cpuset_t), cpusetp); + if (s != 0) { + RTE_LOG(ERR, EAL, "pthread_setaffinity_np failed\n"); + return -1; + } + + /* store socket_id in TLS for quick access */ + RTE_PER_LCORE(_socket_id) = + eal_cpuset_socket_id(cpusetp); + + /* store cpuset in TLS for quick access */ + memmove(&RTE_PER_LCORE(_cpuset), cpusetp, + sizeof(rte_cpuset_t)); + + lcore_id = rte_lcore_id(); + 
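rte_eal_tailq_register() above is normally reached through the EAL_REGISTER_TAILQ constructor macro; the sketch below shows the usual library-side registration pattern. The "RTE_FOO" name is a placeholder, and EAL_REGISTER_TAILQ, RTE_TAILQ_CAST and struct rte_tailq_elem are assumed to come from rte_tailq.h, which is added elsewhere in this patch rather than in this hunk.

	/* Sketch: how a library typically registers and uses a shared tailq.
	 * Entries on the shared list are struct rte_tailq_entry nodes whose
	 * ->data member points at the real object. */
	#include <rte_tailq.h>

	TAILQ_HEAD(foo_list, rte_tailq_entry);

	static struct rte_tailq_elem rte_foo_tailq = {
		.name = "RTE_FOO",
	};
	EAL_REGISTER_TAILQ(rte_foo_tailq)

	/* After rte_eal_init(), the process-shared head is reachable as:
	 *	struct foo_list *list =
	 *		RTE_TAILQ_CAST(rte_foo_tailq.head, foo_list);
	 * and is manipulated under mem_config's qlock. */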
if (lcore_id != (unsigned)LCORE_ID_ANY) { + /* EAL thread will update lcore_config */ + lcore_config[lcore_id].socket_id = RTE_PER_LCORE(_socket_id); + memmove(&lcore_config[lcore_id].cpuset, cpusetp, + sizeof(rte_cpuset_t)); + } + + return 0; +} + +void +rte_thread_get_affinity(rte_cpuset_t *cpusetp) +{ + assert(cpusetp); + memmove(cpusetp, &RTE_PER_LCORE(_cpuset), + sizeof(rte_cpuset_t)); +} + +int +eal_thread_dump_affinity(char *str, unsigned size) +{ + rte_cpuset_t cpuset; + unsigned cpu; + int ret; + unsigned int out = 0; + + rte_thread_get_affinity(&cpuset); + + for (cpu = 0; cpu < RTE_MAX_LCORE; cpu++) { + if (!CPU_ISSET(cpu, &cpuset)) + continue; + + ret = snprintf(str + out, + size - out, "%u,", cpu); + if (ret < 0 || (unsigned)ret >= size - out) { + /* string will be truncated */ + ret = -1; + goto exit; + } + + out += ret; + } + + ret = 0; +exit: + /* remove the last separator */ + if (out > 0) + str[out - 1] = '\0'; + + return ret; +} + + +struct rte_thread_ctrl_params { + void *(*start_routine)(void *); + void *arg; + pthread_barrier_t configured; +}; + +static void *rte_thread_init(void *arg) +{ + int ret; + struct rte_thread_ctrl_params *params = arg; + void *(*start_routine)(void *) = params->start_routine; + void *routine_arg = params->arg; + + ret = pthread_barrier_wait(¶ms->configured); + if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + + return start_routine(routine_arg); +} + +__rte_experimental int +rte_ctrl_thread_create(pthread_t *thread, const char *name, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) +{ + struct rte_thread_ctrl_params *params; + unsigned int lcore_id; + rte_cpuset_t cpuset; + int cpu_found, ret; + + params = malloc(sizeof(*params)); + if (!params) + return -ENOMEM; + + params->start_routine = start_routine; + params->arg = arg; + + pthread_barrier_init(¶ms->configured, NULL, 2); + + ret = pthread_create(thread, attr, rte_thread_init, (void *)params); + if (ret != 0) { + free(params); + return -ret; + } + + if (name != NULL) { + ret = rte_thread_setname(*thread, name); + if (ret < 0) + RTE_LOG(DEBUG, EAL, + "Cannot set name for ctrl thread\n"); + } + + cpu_found = 0; + CPU_ZERO(&cpuset); + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (eal_cpu_detected(lcore_id) && + rte_lcore_has_role(lcore_id, ROLE_OFF)) { + CPU_SET(lcore_id, &cpuset); + cpu_found = 1; + } + } + /* if no detected cpu is off, use master core */ + if (!cpu_found) + CPU_SET(rte_get_master_lcore(), &cpuset); + + ret = pthread_setaffinity_np(*thread, sizeof(cpuset), &cpuset); + if (ret < 0) + goto fail; + + ret = pthread_barrier_wait(¶ms->configured); + if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + + return 0; + +fail: + if (PTHREAD_BARRIER_SERIAL_THREAD == + pthread_barrier_wait(¶ms->configured)) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + pthread_cancel(*thread); + pthread_join(*thread, NULL); + return -ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c new file mode 100644 index 00000000..2e2b770f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "eal_private.h" + 
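rte_ctrl_thread_create() above pins the new thread to CPUs that are not EAL lcores (falling back to the master lcore), names it, and only returns once that setup is done. A minimal usage sketch, with an invented monitor body, follows.

	/* Sketch: spawn a background control thread that will not compete with
	 * EAL worker lcores for CPU time. */
	#include <pthread.h>
	#include <unistd.h>
	#include <rte_lcore.h>

	static void *
	monitor_loop(void *arg)
	{
		(void)arg;
		for (;;)
			sleep(1);	/* poll counters, watchdogs, ... */
		return NULL;
	}

	static int
	start_monitor(pthread_t *tid)
	{
		/* NULL attr: default attributes; affinity and name are applied
		 * by the EAL before this call returns */
		return rte_ctrl_thread_create(tid, "app-monitor", NULL,
				monitor_loop, NULL);
	}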
+/* The frequency of the RDTSC timer resolution */ +static uint64_t eal_tsc_resolution_hz; + +/* Pointer to user delay function */ +void (*rte_delay_us)(unsigned int) = NULL; + +void +rte_delay_us_block(unsigned int us) +{ + const uint64_t start = rte_get_timer_cycles(); + const uint64_t ticks = (uint64_t)us * rte_get_timer_hz() / 1E6; + while ((rte_get_timer_cycles() - start) < ticks) + rte_pause(); +} + +uint64_t +rte_get_tsc_hz(void) +{ + return eal_tsc_resolution_hz; +} + +static uint64_t +estimate_tsc_freq(void) +{ + RTE_LOG(WARNING, EAL, "WARNING: TSC frequency estimated roughly" + " - clock timings may be less accurate.\n"); + /* assume that the sleep(1) will sleep for 1 second */ + uint64_t start = rte_rdtsc(); + sleep(1); + return rte_rdtsc() - start; +} + +void +set_tsc_freq(void) +{ + uint64_t freq; + + freq = get_tsc_freq_arch(); + if (!freq) + freq = get_tsc_freq(); + if (!freq) + freq = estimate_tsc_freq(); + + RTE_LOG(DEBUG, EAL, "TSC frequency is ~%" PRIu64 " KHz\n", freq / 1000); + eal_tsc_resolution_hz = freq; +} + +void rte_delay_us_callback_register(void (*userfunc)(unsigned int)) +{ + rte_delay_us = userfunc; +} + +RTE_INIT(rte_timer_init) +{ + /* set rte_delay_us_block as a delay function */ + rte_delay_us_callback_register(rte_delay_us_block); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c new file mode 100644 index 00000000..1b93c5b3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
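rte_delay_us above is a replaceable function pointer, and the RTE_INIT constructor installs the busy-wait rte_delay_us_block() as the default. The sketch below (illustrative, not part of the patch) swaps in a sleeping implementation through the registration hook, which can be preferable when burning cycles is undesirable.

	/* Sketch: replace the default busy-wait delay with a nanosleep()-based
	 * one via rte_delay_us_callback_register(). */
	#include <errno.h>
	#include <time.h>
	#include <rte_cycles.h>

	static void
	sleeping_delay_us(unsigned int us)
	{
		struct timespec ts = {
			.tv_sec = us / 1000000,
			.tv_nsec = (long)(us % 1000000) * 1000,
		};

		while (nanosleep(&ts, &ts) != 0 && errno == EINTR)
			;	/* restart if interrupted by a signal */
	}

	static void
	use_sleeping_delay(void)
	{
		rte_delay_us_callback_register(sleeping_delay_us);
		rte_delay_us(100);	/* now sleeps instead of spinning */
	}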
+ */ + +#include +#include +#include +#include +#include + +#include + +/* UUID packed form */ +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint16_t clock_seq; + uint8_t node[6]; +}; + +static void uuid_pack(const struct uuid *uu, rte_uuid_t ptr) +{ + uint32_t tmp; + uint8_t *out = ptr; + + tmp = uu->time_low; + out[3] = (uint8_t) tmp; + tmp >>= 8; + out[2] = (uint8_t) tmp; + tmp >>= 8; + out[1] = (uint8_t) tmp; + tmp >>= 8; + out[0] = (uint8_t) tmp; + + tmp = uu->time_mid; + out[5] = (uint8_t) tmp; + tmp >>= 8; + out[4] = (uint8_t) tmp; + + tmp = uu->time_hi_and_version; + out[7] = (uint8_t) tmp; + tmp >>= 8; + out[6] = (uint8_t) tmp; + + tmp = uu->clock_seq; + out[9] = (uint8_t) tmp; + tmp >>= 8; + out[8] = (uint8_t) tmp; + + memcpy(out+10, uu->node, 6); +} + +static void uuid_unpack(const rte_uuid_t in, struct uuid *uu) +{ + const uint8_t *ptr = in; + uint32_t tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_low = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_mid = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_hi_and_version = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->clock_seq = tmp; + + memcpy(uu->node, ptr, 6); +} + +bool rte_uuid_is_null(const rte_uuid_t uu) +{ + const uint8_t *cp = uu; + int i; + + for (i = 0; i < 16; i++) + if (*cp++) + return false; + return true; +} + +/* + * rte_uuid_compare() - compare two UUIDs. + */ +int rte_uuid_compare(const rte_uuid_t uu1, const rte_uuid_t uu2) +{ + struct uuid uuid1, uuid2; + + uuid_unpack(uu1, &uuid1); + uuid_unpack(uu2, &uuid2); + +#define UUCMP(u1, u2) \ + do { if (u1 != u2) return (u1 < u2) ? -1 : 1; } while (0) + + UUCMP(uuid1.time_low, uuid2.time_low); + UUCMP(uuid1.time_mid, uuid2.time_mid); + UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version); + UUCMP(uuid1.clock_seq, uuid2.clock_seq); +#undef UUCMP + + return memcmp(uuid1.node, uuid2.node, 6); +} + +int rte_uuid_parse(const char *in, rte_uuid_t uu) +{ + struct uuid uuid; + int i; + const char *cp; + char buf[3]; + + if (strlen(in) != 36) + return -1; + + for (i = 0, cp = in; i <= 36; i++, cp++) { + if ((i == 8) || (i == 13) || (i == 18) || + (i == 23)) { + if (*cp == '-') + continue; + else + return -1; + } + if (i == 36) + if (*cp == 0) + continue; + if (!isxdigit(*cp)) + return -1; + } + + uuid.time_low = strtoul(in, NULL, 16); + uuid.time_mid = strtoul(in+9, NULL, 16); + uuid.time_hi_and_version = strtoul(in+14, NULL, 16); + uuid.clock_seq = strtoul(in+19, NULL, 16); + cp = in+24; + buf[2] = 0; + + for (i = 0; i < 6; i++) { + buf[0] = *cp++; + buf[1] = *cp++; + uuid.node[i] = strtoul(buf, NULL, 16); + } + + uuid_pack(&uuid, uu); + return 0; +} + +void rte_uuid_unparse(const rte_uuid_t uu, char *out, size_t len) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + + snprintf(out, len, + "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, + uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, + uuid.node[0], uuid.node[1], uuid.node[2], + uuid.node[3], uuid.node[4], uuid.node[5]); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h b/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h new file mode 100644 index 00000000..de05febf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +/** + * @file + * Stores 
functions and path defines for files and directories + * on the filesystem for Linux, that are used by the Linux EAL. + */ + +#ifndef EAL_FILESYSTEM_H +#define EAL_FILESYSTEM_H + +/** Path of rte config file. */ + +#include +#include +#include +#include + +#include +#include "eal_internal_cfg.h" + +/* sets up platform-specific runtime data dir */ +int +eal_create_runtime_dir(void); + +/* returns runtime dir */ +const char * +eal_get_runtime_dir(void); + +#define RUNTIME_CONFIG_FNAME "config" +static inline const char * +eal_runtime_config_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + RUNTIME_CONFIG_FNAME); + return buffer; +} + +/** Path of primary/secondary communication unix socket file. */ +#define MP_SOCKET_FNAME "mp_socket" +static inline const char * +eal_mp_socket_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + MP_SOCKET_FNAME); + return buffer; +} + +#define FBARRAY_NAME_FMT "%s/fbarray_%s" +static inline const char * +eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) { + snprintf(buffer, buflen, FBARRAY_NAME_FMT, eal_get_runtime_dir(), name); + return buffer; +} + +/** Path of hugepage info file. */ +#define HUGEPAGE_INFO_FNAME "hugepage_info" +static inline const char * +eal_hugepage_info_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + HUGEPAGE_INFO_FNAME); + return buffer; +} + +/** Path of hugepage data file. */ +#define HUGEPAGE_DATA_FNAME "hugepage_data" +static inline const char * +eal_hugepage_data_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + HUGEPAGE_DATA_FNAME); + return buffer; +} + +/** String format for hugepage map files. */ +#define HUGEFILE_FMT "%s/%smap_%d" +static inline const char * +eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id) +{ + snprintf(buffer, buflen, HUGEFILE_FMT, hugedir, + internal_config.hugefile_prefix, f_id); + buffer[buflen - 1] = '\0'; + return buffer; +} + +/** String format for hugepage map lock files. */ +#define HUGEFILE_LOCK_FMT "%s/map_%d.lock" +static inline const char * +eal_get_hugefile_lock_path(char *buffer, size_t buflen, int f_id) +{ + snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, eal_get_runtime_dir(), + f_id); + buffer[buflen - 1] = '\0'; + return buffer; +} + +/** define the default filename prefix for the %s values above */ +#define HUGEFILE_PREFIX_DEFAULT "rte" + +/** Function to read a single numeric value from a file on the filesystem. + * Used to read information from files on /sys */ +int eal_parse_sysfs_value(const char *filename, unsigned long *val); + +#endif /* EAL_FILESYSTEM_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h b/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h new file mode 100644 index 00000000..4582f19c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_HUGEPAGES_H +#define EAL_HUGEPAGES_H + +#include +#include +#include + +#define MAX_HUGEPAGE_PATH PATH_MAX + +/** + * Structure used to store informations about hugepages that we mapped + * through the files in hugetlbfs. 
+ */ +struct hugepage_file { + void *orig_va; /**< virtual addr of first mmap() */ + void *final_va; /**< virtual addr of 2nd mmap() */ + uint64_t physaddr; /**< physical addr */ + size_t size; /**< the page size */ + int socket_id; /**< NUMA socket ID */ + int file_id; /**< the '%d' in HUGEFILE_FMT */ + char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */ +}; + +/** + * Read the information on what hugepages are available for the EAL to use, + * clearing out any unused ones. + */ +int eal_hugepage_info_init(void); + +/** + * Read whatever information primary process has shared about hugepages into + * secondary process. + */ +int eal_hugepage_info_read(void); + +#endif /* EAL_HUGEPAGES_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h b/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h new file mode 100644 index 00000000..00ee6e06 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Holds the structures for the eal internal configuration + */ + +#ifndef EAL_INTERNAL_CFG_H +#define EAL_INTERNAL_CFG_H + +#include +#include + +#define MAX_HUGEPAGE_SIZES 3 /**< support up to 3 page sizes */ + +/* + * internal configuration structure for the number, size and + * mount points of hugepages + */ +struct hugepage_info { + uint64_t hugepage_sz; /**< size of a huge page */ + char hugedir[PATH_MAX]; /**< dir where hugetlbfs is mounted */ + uint32_t num_pages[RTE_MAX_NUMA_NODES]; + /**< number of hugepages of that size on each socket */ + int lock_descriptor; /**< file descriptor for hugepage dir */ +}; + +/** + * internal configuration + */ +struct internal_config { + volatile size_t memory; /**< amount of asked memory */ + volatile unsigned force_nchannel; /**< force number of channels */ + volatile unsigned force_nrank; /**< force number of ranks */ + volatile unsigned no_hugetlbfs; /**< true to disable hugetlbfs */ + unsigned hugepage_unlink; /**< true to unlink backing files */ + volatile unsigned no_pci; /**< true to disable PCI */ + volatile unsigned no_hpet; /**< true to disable HPET */ + volatile unsigned vmware_tsc_map; /**< true to use VMware TSC mapping + * instead of native TSC */ + volatile unsigned no_shconf; /**< true if there is no shared config */ + volatile unsigned in_memory; + /**< true if DPDK should operate entirely in-memory and not create any + * shared files or runtime data. + */ + volatile unsigned create_uio_dev; /**< true to create /dev/uioX devices */ + volatile enum rte_proc_type_t process_type; /**< multi-process proc type */ + /** true to try allocating memory on specific sockets */ + volatile unsigned force_sockets; + volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */ + volatile unsigned force_socket_limits; + volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES]; /**< limit amount of memory per socket */ + uintptr_t base_virtaddr; /**< base address to try and reserve memory from */ + volatile unsigned legacy_mem; + /**< true to enable legacy memory behavior (no dynamic allocation, + * IOVA-contiguous segments). + */ + volatile unsigned single_file_segments; + /**< true if storing all pages within single files (per-page-size, + * per-node) non-legacy mode only. 
+ */ + volatile int syslog_facility; /**< facility passed to openlog() */ + /** default interrupt mode for VFIO */ + volatile enum rte_intr_mode vfio_intr_mode; + const char *hugefile_prefix; /**< the base filename of hugetlbfs files */ + const char *hugepage_dir; /**< specific hugetlbfs directory to use */ + const char *user_mbuf_pool_ops_name; + /**< user defined mbuf pool ops name */ + unsigned num_hugepage_sizes; /**< how many sizes on this system */ + struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES]; + volatile unsigned int init_complete; + /**< indicates whether EAL has completed initialization */ +}; +extern struct internal_config internal_config; /**< Global EAL configuration. */ + +void eal_reset_internal_config(struct internal_config *internal_cfg); + +#endif /* EAL_INTERNAL_CFG_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h b/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h new file mode 100644 index 00000000..36bb1a02 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef EAL_MEMALLOC_H +#define EAL_MEMALLOC_H + +#include + +#include +#include + +/* + * Allocate segment of specified page size. + */ +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket); + +/* + * Allocate `n_segs` segments. + * + * Note: `ms` can be NULL. + * + * Note: it is possible to request best-effort allocation by setting `exact` to + * `false`, in which case allocator will return however many pages it managed to + * allocate successfully. + */ +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact); + +/* + * Deallocate segment + */ +int +eal_memalloc_free_seg(struct rte_memseg *ms); + +/* + * Deallocate `n_segs` segments. Returns 0 on successful deallocation of all + * segments, returns -1 on error. Any segments that could have been deallocated, + * will be deallocated even in case of error. + */ +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs); + +/* + * Check if memory pointed to by `start` and of `length` that resides in + * memseg list `msl` is IOVA-contiguous. + */ +bool +eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start, + size_t len); + +/* synchronize local memory map to primary process */ +int +eal_memalloc_sync_with_primary(void); + +int +eal_memalloc_mem_event_callback_register(const char *name, + rte_mem_event_callback_t clb, void *arg); + +int +eal_memalloc_mem_event_callback_unregister(const char *name, void *arg); + +void +eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start, + size_t len); + +int +eal_memalloc_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit); + +int +eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id); + +int +eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len); + +int +eal_memalloc_init(void); + +#endif /* EAL_MEMALLOC_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_options.h b/src/spdk/dpdk/lib/librte_eal/common/eal_options.h new file mode 100644 index 00000000..96e16678 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_options.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2014 6WIND S.A. 
+ */ + +#ifndef EAL_OPTIONS_H +#define EAL_OPTIONS_H + +enum { + /* long options mapped to a short option */ +#define OPT_HELP "help" + OPT_HELP_NUM = 'h', +#define OPT_PCI_BLACKLIST "pci-blacklist" + OPT_PCI_BLACKLIST_NUM = 'b', +#define OPT_PCI_WHITELIST "pci-whitelist" + OPT_PCI_WHITELIST_NUM = 'w', + + /* first long only option value must be >= 256, so that we won't + * conflict with short options */ + OPT_LONG_MIN_NUM = 256, +#define OPT_BASE_VIRTADDR "base-virtaddr" + OPT_BASE_VIRTADDR_NUM, +#define OPT_CREATE_UIO_DEV "create-uio-dev" + OPT_CREATE_UIO_DEV_NUM, +#define OPT_FILE_PREFIX "file-prefix" + OPT_FILE_PREFIX_NUM, +#define OPT_HUGE_DIR "huge-dir" + OPT_HUGE_DIR_NUM, +#define OPT_HUGE_UNLINK "huge-unlink" + OPT_HUGE_UNLINK_NUM, +#define OPT_LCORES "lcores" + OPT_LCORES_NUM, +#define OPT_LOG_LEVEL "log-level" + OPT_LOG_LEVEL_NUM, +#define OPT_MASTER_LCORE "master-lcore" + OPT_MASTER_LCORE_NUM, +#define OPT_MBUF_POOL_OPS_NAME "mbuf-pool-ops-name" + OPT_MBUF_POOL_OPS_NAME_NUM, +#define OPT_PROC_TYPE "proc-type" + OPT_PROC_TYPE_NUM, +#define OPT_NO_HPET "no-hpet" + OPT_NO_HPET_NUM, +#define OPT_NO_HUGE "no-huge" + OPT_NO_HUGE_NUM, +#define OPT_NO_PCI "no-pci" + OPT_NO_PCI_NUM, +#define OPT_NO_SHCONF "no-shconf" + OPT_NO_SHCONF_NUM, +#define OPT_IN_MEMORY "in-memory" + OPT_IN_MEMORY_NUM, +#define OPT_SOCKET_MEM "socket-mem" + OPT_SOCKET_MEM_NUM, +#define OPT_SOCKET_LIMIT "socket-limit" + OPT_SOCKET_LIMIT_NUM, +#define OPT_SYSLOG "syslog" + OPT_SYSLOG_NUM, +#define OPT_VDEV "vdev" + OPT_VDEV_NUM, +#define OPT_VFIO_INTR "vfio-intr" + OPT_VFIO_INTR_NUM, +#define OPT_VMWARE_TSC_MAP "vmware-tsc-map" + OPT_VMWARE_TSC_MAP_NUM, +#define OPT_LEGACY_MEM "legacy-mem" + OPT_LEGACY_MEM_NUM, +#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments" + OPT_SINGLE_FILE_SEGMENTS_NUM, + OPT_LONG_MAX_NUM +}; + +extern const char eal_short_options[]; +extern const struct option eal_long_options[]; + +int eal_parse_common_option(int opt, const char *argv, + struct internal_config *conf); +int eal_option_device_parse(void); +int eal_adjust_config(struct internal_config *internal_cfg); +int eal_check_common_options(struct internal_config *internal_cfg); +void eal_common_usage(void); +enum rte_proc_type_t eal_proc_type_detect(void); +int eal_plugins_init(void); + +#endif /* EAL_OPTIONS_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_private.h b/src/spdk/dpdk/lib/librte_eal/common/eal_private.h new file mode 100644 index 00000000..4f809a83 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_private.h @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _EAL_PRIVATE_H_ +#define _EAL_PRIVATE_H_ + +#include +#include +#include + +#include + +/** + * Initialize the memzone subsystem (private to eal). + * + * @return + * - 0 on success + * - Negative on error + */ +int rte_eal_memzone_init(void); + +/** + * Common log initialization function (private to eal). Determines + * where log data is written when no call to rte_openlog_stream is + * in effect. + * + * @param default_log + * The default log stream to be used. + * @return + * - 0 on success + * - Negative on error + */ +void eal_log_set_default(FILE *default_log); + +/** + * Fill configuration with number of physical and logical processors + * + * This function is private to EAL. + * + * Parse /proc/cpuinfo to get the number of physical and logical + * processors on the machine. 
+ * + * @return + * 0 on success, negative on error + */ +int rte_eal_cpu_init(void); + +/** + * Create memseg lists + * + * This function is private to EAL. + * + * Preallocate virtual memory. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_memseg_init(void); + +/** + * Map memory + * + * This function is private to EAL. + * + * Fill configuration structure with these infos, and return 0 on success. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_memory_init(void); + +/** + * Configure timers + * + * This function is private to EAL. + * + * Mmap memory areas used by HPET (high precision event timer) that will + * provide our time reference, and configure the TSC frequency also for it + * to be used as a reference. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_timer_init(void); + +/** + * Init the default log stream + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_log_init(const char *id, int facility); + +/** + * Save the log regexp for later + */ +int rte_log_save_regexp(const char *type, int priority); +int rte_log_save_pattern(const char *pattern, int priority); + +/** + * Init tail queues for non-EAL library structures. This is to allow + * the rings, mempools, etc. lists to be shared among multiple processes + * + * This function is private to EAL + * + * @return + * 0 on success, negative on error + */ +int rte_eal_tailqs_init(void); + +/** + * Init interrupt handling. + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_intr_init(void); + +/** + * Init alarm mechanism. This is to allow a callback be called after + * specific time. + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_alarm_init(void); + +/** + * Function is to check if the kernel module(like, vfio, vfio_iommu_type1, + * etc.) loaded. + * + * @param module_name + * The module's name which need to be checked + * + * @return + * -1 means some error happens(NULL pointer or open failure) + * 0 means the module not loaded + * 1 means the module loaded + */ +int rte_eal_check_module(const char *module_name); + +/** + * Get virtual area of specified size from the OS. + * + * This function is private to the EAL. + * + * @param requested_addr + * Address where to request address space. + * @param size + * Size of requested area. + * @param page_sz + * Page size on which to align requested virtual area. + * @param flags + * EAL_VIRTUAL_AREA_* flags. + * @param mmap_flags + * Extra flags passed directly to mmap(). + * + * @return + * Virtual area address if successful. + * NULL if unsuccessful. + */ + +#define EAL_VIRTUAL_AREA_ADDR_IS_HINT (1 << 0) +/**< don't fail if cannot get exact requested address. */ +#define EAL_VIRTUAL_AREA_ALLOW_SHRINK (1 << 1) +/**< try getting smaller sized (decrement by page size) virtual areas if cannot + * get area of requested size. + */ +#define EAL_VIRTUAL_AREA_UNMAP (1 << 2) +/**< immediately unmap reserved virtual area. */ +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags); + +/** + * Get cpu core_id. + * + * This function is private to the EAL. + */ +unsigned eal_cpu_core_id(unsigned lcore_id); + +/** + * Check if cpu is present. + * + * This function is private to the EAL. 
+ */ +int eal_cpu_detected(unsigned lcore_id); + +/** + * Set TSC frequency from precise value or estimation + * + * This function is private to the EAL. + */ +void set_tsc_freq(void); + +/** + * Get precise TSC frequency from system + * + * This function is private to the EAL. + */ +uint64_t get_tsc_freq(void); + +/** + * Get TSC frequency if the architecture supports. + * + * This function is private to the EAL. + * + * @return + * The number of TSC cycles in one second. + * Returns zero if the architecture support is not available. + */ +uint64_t get_tsc_freq_arch(void); + +/** + * Prepare physical memory mapping + * i.e. hugepages on Linux and + * contigmem on BSD. + * + * This function is private to the EAL. + */ +int rte_eal_hugepage_init(void); + +/** + * Creates memory mapping in secondary process + * i.e. hugepages on Linux and + * contigmem on BSD. + * + * This function is private to the EAL. + */ +int rte_eal_hugepage_attach(void); + +/** + * Find a bus capable of identifying a device. + * + * @param str + * A device identifier (PCI address, virtual PMD name, ...). + * + * @return + * A valid bus handle if found. + * NULL if no bus is able to parse this device. + */ +struct rte_bus *rte_bus_find_by_device_name(const char *str); + +/** + * Create the unix channel for primary/secondary communication. + * + * @return + * 0 on success; + * (<0) on failure. + */ + +int rte_mp_channel_init(void); + +/** + * Internal Executes all the user application registered callbacks for + * the specific device. It is for DPDK internal user only. User + * application should not call it directly. + * + * @param device_name + * The device name. + * @param event + * the device event type. + */ +void dev_callback_process(char *device_name, enum rte_dev_event_type event); + +/** + * @internal + * Parse a device string and store its information in an + * rte_devargs structure. + * + * A device description is split by layers of abstraction of the device: + * bus, class and driver. Each layer will offer a set of properties that + * can be applied either to configure or recognize a device. + * + * This function will parse those properties and prepare the rte_devargs + * to be given to each layers for processing. + * + * Note: if the "data" field of the devargs points to devstr, + * then no dynamic allocation is performed and the rte_devargs + * can be safely discarded. + * + * Otherwise ``data`` will hold a workable copy of devstr, that will be + * used by layers descriptors within rte_devargs. In this case, + * any rte_devargs should be cleaned-up before being freed. + * + * @param da + * rte_devargs structure to fill. + * + * @param devstr + * Device string. + * + * @return + * 0 on success. + * Negative errno values on error (rte_errno is set). + */ +int +rte_devargs_layers_parse(struct rte_devargs *devargs, + const char *devstr); + +#endif /* _EAL_PRIVATE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h b/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h new file mode 100644 index 00000000..2d30b19b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_THREAD_H +#define EAL_THREAD_H + +#include + +/** + * basic loop of thread, called for each thread by eal_init(). 
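+ * The loop never returns: each worker lcore blocks inside it until the
+ * master lcore hands it a function to execute (e.g. through
+ * rte_eal_remote_launch()), runs that function, then waits again.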
+ * + * @param arg + * opaque pointer + */ +__attribute__((noreturn)) void *eal_thread_loop(void *arg); + +/** + * Init per-lcore info for master thread + * + * @param lcore_id + * identifier of master lcore + */ +void eal_thread_init_master(unsigned lcore_id); + +/** + * Get the NUMA socket id from cpu id. + * This function is private to EAL. + * + * @param cpu_id + * The logical process id. + * @return + * socket_id or SOCKET_ID_ANY + */ +unsigned eal_cpu_socket_id(unsigned cpu_id); + +/** + * Get the NUMA socket id from cpuset. + * This function is private to EAL. + * + * @param cpusetp + * The point to a valid cpu set. + * @return + * socket_id or SOCKET_ID_ANY + */ +int eal_cpuset_socket_id(rte_cpuset_t *cpusetp); + +/** + * Default buffer size to use with eal_thread_dump_affinity() + */ +#define RTE_CPU_AFFINITY_STR_LEN 256 + +/** + * Dump the current pthread cpuset. + * This function is private to EAL. + * + * Note: + * If the dump size is greater than the size of given buffer, + * the string will be truncated and with '\0' at the end. + * + * @param str + * The string buffer the cpuset will dump to. + * @param size + * The string buffer size. + * @return + * 0 for success, -1 if truncation happens. + */ +int +eal_thread_dump_affinity(char *str, unsigned size); + +#endif /* EAL_THREAD_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/meson.build b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/meson.build new file mode 100644 index 00000000..77893fa3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/meson.build @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation. + +install_headers( + 'rte_atomic_32.h', + 'rte_atomic_64.h', + 'rte_atomic.h', + 'rte_byteorder.h', + 'rte_cpuflags_32.h', + 'rte_cpuflags_64.h', + 'rte_cpuflags.h', + 'rte_cycles_32.h', + 'rte_cycles_64.h', + 'rte_cycles.h', + 'rte_io_64.h', + 'rte_io.h', + 'rte_memcpy_32.h', + 'rte_memcpy_64.h', + 'rte_memcpy.h', + 'rte_pause_32.h', + 'rte_pause_64.h', + 'rte_pause.h', + 'rte_prefetch_32.h', + 'rte_prefetch_64.h', + 'rte_prefetch.h', + 'rte_rwlock.h', + 'rte_spinlock.h', + 'rte_vect.h', + subdir: get_option('include_subdir_arch')) diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic.h new file mode 100644 index 00000000..40e14e56 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_ATOMIC_ARM_H_ +#define _RTE_ATOMIC_ARM_H_ + +#ifdef RTE_ARCH_64 +#include +#else +#include +#endif + +#endif /* _RTE_ATOMIC_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic_32.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic_32.h new file mode 100644 index 00000000..859562e5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic_32.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_ATOMIC_ARM32_H_ +#define _RTE_ATOMIC_ARM32_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_atomic.h" + +/** + * General memory barrier. 
+ * + * Guarantees that the LOAD and STORE operations generated before the + * barrier occur before the LOAD and STORE operations generated after. + */ +#define rte_mb() __sync_synchronize() + +/** + * Write memory barrier. + * + * Guarantees that the STORE operations generated before the barrier + * occur before the STORE operations generated after. + */ +#define rte_wmb() do { asm volatile ("dmb st" : : : "memory"); } while (0) + +/** + * Read memory barrier. + * + * Guarantees that the LOAD operations generated before the barrier + * occur before the LOAD operations generated after. + */ +#define rte_rmb() __sync_synchronize() + +#define rte_smp_mb() rte_mb() + +#define rte_smp_wmb() rte_wmb() + +#define rte_smp_rmb() rte_rmb() + +#define rte_io_mb() rte_mb() + +#define rte_io_wmb() rte_wmb() + +#define rte_io_rmb() rte_rmb() + +#define rte_cio_wmb() rte_wmb() + +#define rte_cio_rmb() rte_rmb() + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ATOMIC_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h new file mode 100644 index 00000000..97060e44 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_ATOMIC_ARM64_H_ +#define _RTE_ATOMIC_ARM64_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_atomic.h" + +#define dsb(opt) asm volatile("dsb " #opt : : : "memory") +#define dmb(opt) asm volatile("dmb " #opt : : : "memory") + +#define rte_mb() dsb(sy) + +#define rte_wmb() dsb(st) + +#define rte_rmb() dsb(ld) + +#define rte_smp_mb() dmb(ish) + +#define rte_smp_wmb() dmb(ishst) + +#define rte_smp_rmb() dmb(ishld) + +#define rte_io_mb() rte_mb() + +#define rte_io_wmb() rte_wmb() + +#define rte_io_rmb() rte_rmb() + +#define rte_cio_wmb() dmb(oshst) + +#define rte_cio_rmb() dmb(oshld) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ATOMIC_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_byteorder.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_byteorder.h new file mode 100644 index 00000000..9ec4a975 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_byteorder.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_BYTEORDER_ARM_H_ +#define _RTE_BYTEORDER_ARM_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include "generic/rte_byteorder.h" + +/* fix missing __builtin_bswap16 for gcc older then 4.8 */ +#if !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) + +static inline uint16_t rte_arch_bswap16(uint16_t _x) +{ + uint16_t x = _x; + + asm volatile ("rev16 %w0,%w1" + : "=r" (x) + : "r" (x) + ); + return x; +} + +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) +#endif + +/* ARM architecture is bi-endian (both big and little). 
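+ * For example, in a little-endian build rte_cpu_to_be_16(0x1234) expands to
+ * rte_bswap16() and yields 0x3412 while rte_cpu_to_le_16() is a no-op; in a
+ * big-endian build the two sets of macros swap roles.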
*/ +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + +#define rte_cpu_to_le_16(x) (x) +#define rte_cpu_to_le_32(x) (x) +#define rte_cpu_to_le_64(x) (x) + +#define rte_cpu_to_be_16(x) rte_bswap16(x) +#define rte_cpu_to_be_32(x) rte_bswap32(x) +#define rte_cpu_to_be_64(x) rte_bswap64(x) + +#define rte_le_to_cpu_16(x) (x) +#define rte_le_to_cpu_32(x) (x) +#define rte_le_to_cpu_64(x) (x) + +#define rte_be_to_cpu_16(x) rte_bswap16(x) +#define rte_be_to_cpu_32(x) rte_bswap32(x) +#define rte_be_to_cpu_64(x) rte_bswap64(x) + +#else /* RTE_BIG_ENDIAN */ + +#define rte_cpu_to_le_16(x) rte_bswap16(x) +#define rte_cpu_to_le_32(x) rte_bswap32(x) +#define rte_cpu_to_le_64(x) rte_bswap64(x) + +#define rte_cpu_to_be_16(x) (x) +#define rte_cpu_to_be_32(x) (x) +#define rte_cpu_to_be_64(x) (x) + +#define rte_le_to_cpu_16(x) rte_bswap16(x) +#define rte_le_to_cpu_32(x) rte_bswap32(x) +#define rte_le_to_cpu_64(x) rte_bswap64(x) + +#define rte_be_to_cpu_16(x) (x) +#define rte_be_to_cpu_32(x) (x) +#define rte_be_to_cpu_64(x) (x) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BYTEORDER_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags.h new file mode 100644 index 00000000..022e7da5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_CPUFLAGS_ARM_H_ +#define _RTE_CPUFLAGS_ARM_H_ + +#ifdef RTE_ARCH_64 +#include +#else +#include +#endif + +#endif /* _RTE_CPUFLAGS_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags_32.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags_32.h new file mode 100644 index 00000000..b5347be1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags_32.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_CPUFLAGS_ARM32_H_ +#define _RTE_CPUFLAGS_ARM32_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Enumeration of all CPU features supported + */ +enum rte_cpu_flag_t { + RTE_CPUFLAG_SWP = 0, + RTE_CPUFLAG_HALF, + RTE_CPUFLAG_THUMB, + RTE_CPUFLAG_A26BIT, + RTE_CPUFLAG_FAST_MULT, + RTE_CPUFLAG_FPA, + RTE_CPUFLAG_VFP, + RTE_CPUFLAG_EDSP, + RTE_CPUFLAG_JAVA, + RTE_CPUFLAG_IWMMXT, + RTE_CPUFLAG_CRUNCH, + RTE_CPUFLAG_THUMBEE, + RTE_CPUFLAG_NEON, + RTE_CPUFLAG_VFPv3, + RTE_CPUFLAG_VFPv3D16, + RTE_CPUFLAG_TLS, + RTE_CPUFLAG_VFPv4, + RTE_CPUFLAG_IDIVA, + RTE_CPUFLAG_IDIVT, + RTE_CPUFLAG_VFPD32, + RTE_CPUFLAG_LPAE, + RTE_CPUFLAG_EVTSTRM, + RTE_CPUFLAG_AES, + RTE_CPUFLAG_PMULL, + RTE_CPUFLAG_SHA1, + RTE_CPUFLAG_SHA2, + RTE_CPUFLAG_CRC32, + RTE_CPUFLAG_V7L, + /* The last item */ + RTE_CPUFLAG_NUMFLAGS,/**< This should always be the last! 
*/ +}; + +#include "generic/rte_cpuflags.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CPUFLAGS_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags_64.h new file mode 100644 index 00000000..95cc0147 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cpuflags_64.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_CPUFLAGS_ARM64_H_ +#define _RTE_CPUFLAGS_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Enumeration of all CPU features supported + */ +enum rte_cpu_flag_t { + RTE_CPUFLAG_FP = 0, + RTE_CPUFLAG_NEON, + RTE_CPUFLAG_EVTSTRM, + RTE_CPUFLAG_AES, + RTE_CPUFLAG_PMULL, + RTE_CPUFLAG_SHA1, + RTE_CPUFLAG_SHA2, + RTE_CPUFLAG_CRC32, + RTE_CPUFLAG_ATOMICS, + RTE_CPUFLAG_AARCH64, + /* The last item */ + RTE_CPUFLAG_NUMFLAGS,/**< This should always be the last! */ +}; + +#include "generic/rte_cpuflags.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CPUFLAGS_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles.h new file mode 100644 index 00000000..e8ffa894 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_CYCLES_ARM_H_ +#define _RTE_CYCLES_ARM_H_ + +#ifdef RTE_ARCH_64 +#include +#else +#include +#endif + +#endif /* _RTE_CYCLES_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h new file mode 100644 index 00000000..c4f974fe --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_CYCLES_ARM32_H_ +#define _RTE_CYCLES_ARM32_H_ + +/* ARM v7 does not have suitable source of clock signals. The only clock counter + available in the core is 32 bit wide. Therefore it is unsuitable as the + counter overlaps every few seconds and probably is not accessible by + userspace programs. Therefore we use clock_gettime(CLOCK_MONOTONIC_RAW) to + simulate counter running at 1GHz. +*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_cycles.h" + +/** + * Read the time base register. + * + * @return + * The time base for this lcore. + */ +#ifndef RTE_ARM_EAL_RDTSC_USE_PMU + +/** + * This call is easily portable to any ARM architecture, however, + * it may be damn slow and inprecise for some tasks. + */ +static inline uint64_t +__rte_rdtsc_syscall(void) +{ + struct timespec val; + uint64_t v; + + while (clock_gettime(CLOCK_MONOTONIC_RAW, &val) != 0) + /* no body */; + + v = (uint64_t) val.tv_sec * 1000000000LL; + v += (uint64_t) val.tv_nsec; + return v; +} +#define rte_rdtsc __rte_rdtsc_syscall + +#else + +/** + * This function requires to configure the PMCCNTR and enable + * userspace access to it: + * + * asm volatile("mcr p15, 0, %0, c9, c14, 0" : : "r"(1)); + * asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(29)); + * asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x8000000f)); + * + * which is possible only from the priviledged mode (kernel space). 
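+ * Note: the PMCR write above ("r"(29), binary 11101) also sets the cycle
+ * counter divider bit, so PMCCNTR advances once every 64 CPU clocks;
+ * __rte_rdtsc_pmccntr() below compensates with its left shift by 6.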
+ */ +static inline uint64_t +__rte_rdtsc_pmccntr(void) +{ + unsigned tsc; + uint64_t final_tsc; + + /* Read PMCCNTR */ + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(tsc)); + /* 1 tick = 64 clocks */ + final_tsc = ((uint64_t)tsc) << 6; + + return (uint64_t)final_tsc; +} +#define rte_rdtsc __rte_rdtsc_pmccntr + +#endif /* RTE_ARM_EAL_RDTSC_USE_PMU */ + +static inline uint64_t +rte_rdtsc_precise(void) +{ + rte_mb(); + return rte_rdtsc(); +} + +static inline uint64_t +rte_get_tsc_cycles(void) { return rte_rdtsc(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CYCLES_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_64.h new file mode 100644 index 00000000..68e7c733 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_64.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_CYCLES_ARM64_H_ +#define _RTE_CYCLES_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_cycles.h" + +/** + * Read the time base register. + * + * @return + * The time base for this lcore. + */ +#ifndef RTE_ARM_EAL_RDTSC_USE_PMU +/** + * This call is portable to any ARMv8 architecture, however, typically + * cntvct_el0 runs at <= 100MHz and it may be imprecise for some tasks. + */ +static inline uint64_t +rte_rdtsc(void) +{ + uint64_t tsc; + + asm volatile("mrs %0, cntvct_el0" : "=r" (tsc)); + return tsc; +} +#else +/** + * This is an alternative method to enable rte_rdtsc() with high resolution + * PMU cycles counter.The cycle counter runs at cpu frequency and this scheme + * uses ARMv8 PMU subsystem to get the cycle counter at userspace, However, + * access to PMU cycle counter from user space is not enabled by default in + * arm64 linux kernel. + * It is possible to enable cycle counter at user space access by configuring + * the PMU from the privileged mode (kernel space). 
+ * + * asm volatile("msr pmintenset_el1, %0" : : "r" ((u64)(0 << 31))); + * asm volatile("msr pmcntenset_el0, %0" :: "r" BIT(31)); + * asm volatile("msr pmuserenr_el0, %0" : : "r"(BIT(0) | BIT(2))); + * asm volatile("mrs %0, pmcr_el0" : "=r" (val)); + * val |= (BIT(0) | BIT(2)); + * isb(); + * asm volatile("msr pmcr_el0, %0" : : "r" (val)); + * + */ +static inline uint64_t +rte_rdtsc(void) +{ + uint64_t tsc; + + asm volatile("mrs %0, pmccntr_el0" : "=r"(tsc)); + return tsc; +} +#endif + +static inline uint64_t +rte_rdtsc_precise(void) +{ + rte_mb(); + return rte_rdtsc(); +} + +static inline uint64_t +rte_get_tsc_cycles(void) { return rte_rdtsc(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CYCLES_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_io.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_io.h new file mode 100644 index 00000000..f4e66e6b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_io.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_ARM_H_ +#define _RTE_IO_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef RTE_ARCH_64 +#include "rte_io_64.h" +#else +#include "generic/rte_io.h" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IO_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_io_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_io_64.h new file mode 100644 index 00000000..e5346240 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_io_64.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_ARM64_H_ +#define _RTE_IO_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#define RTE_OVERRIDE_IO_H + +#include "generic/rte_io.h" +#include "rte_atomic_64.h" + +static __rte_always_inline uint8_t +rte_read8_relaxed(const volatile void *addr) +{ + uint8_t val; + + asm volatile( + "ldrb %w[val], [%x[addr]]" + : [val] "=r" (val) + : [addr] "r" (addr)); + return val; +} + +static __rte_always_inline uint16_t +rte_read16_relaxed(const volatile void *addr) +{ + uint16_t val; + + asm volatile( + "ldrh %w[val], [%x[addr]]" + : [val] "=r" (val) + : [addr] "r" (addr)); + return val; +} + +static __rte_always_inline uint32_t +rte_read32_relaxed(const volatile void *addr) +{ + uint32_t val; + + asm volatile( + "ldr %w[val], [%x[addr]]" + : [val] "=r" (val) + : [addr] "r" (addr)); + return val; +} + +static __rte_always_inline uint64_t +rte_read64_relaxed(const volatile void *addr) +{ + uint64_t val; + + asm volatile( + "ldr %x[val], [%x[addr]]" + : [val] "=r" (val) + : [addr] "r" (addr)); + return val; +} + +static __rte_always_inline void +rte_write8_relaxed(uint8_t val, volatile void *addr) +{ + asm volatile( + "strb %w[val], [%x[addr]]" + : + : [val] "r" (val), [addr] "r" (addr)); +} + +static __rte_always_inline void +rte_write16_relaxed(uint16_t val, volatile void *addr) +{ + asm volatile( + "strh %w[val], [%x[addr]]" + : + : [val] "r" (val), [addr] "r" (addr)); +} + +static __rte_always_inline void +rte_write32_relaxed(uint32_t val, volatile void *addr) +{ + asm volatile( + "str %w[val], [%x[addr]]" + : + : [val] "r" (val), [addr] "r" (addr)); +} + +static __rte_always_inline void +rte_write64_relaxed(uint64_t val, volatile void *addr) +{ + asm volatile( + "str %x[val], [%x[addr]]" + : + : [val] "r" (val), [addr] "r" (addr)); +} + +static __rte_always_inline uint8_t 
+rte_read8(const volatile void *addr) +{ + uint8_t val; + val = rte_read8_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint16_t +rte_read16(const volatile void *addr) +{ + uint16_t val; + val = rte_read16_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint32_t +rte_read32(const volatile void *addr) +{ + uint32_t val; + val = rte_read32_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint64_t +rte_read64(const volatile void *addr) +{ + uint64_t val; + val = rte_read64_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline void +rte_write8(uint8_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write8_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write16(uint16_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write16_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write32(uint32_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write32_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write64(uint64_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write64_relaxed(value, addr); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IO_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy.h new file mode 100644 index 00000000..47dea9a8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_MEMCPY_ARM_H_ +#define _RTE_MEMCPY_ARM_H_ + +#ifdef RTE_ARCH_64 +#include +#else +#include +#endif + +#endif /* _RTE_MEMCPY_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h new file mode 100644 index 00000000..eb02c3b4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. 
+ */ + +#ifndef _RTE_MEMCPY_ARM32_H_ +#define _RTE_MEMCPY_ARM32_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_memcpy.h" + +#ifdef RTE_ARCH_ARM_NEON_MEMCPY + +#ifndef RTE_MACHINE_CPUFLAG_NEON +#error "Cannot optimize memcpy by NEON as the CPU seems to not support this" +#endif + +/* ARM NEON Intrinsics are used to copy data */ +#include + +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + vst1q_u8(dst, vld1q_u8(src)); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + asm volatile ( + "vld1.8 {d0-d3}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]\n\t" + : "+r" (src), "+r" (dst) + : : "memory", "d0", "d1", "d2", "d3"); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + asm volatile ( + "vld1.8 {d0-d3}, [%0]!\n\t" + "vld1.8 {d4-d5}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]!\n\t" + "vst1.8 {d4-d5}, [%1]\n\t" + : "+r" (src), "+r" (dst) + : + : "memory", "d0", "d1", "d2", "d3", "d4", "d5"); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + asm volatile ( + "vld1.8 {d0-d3}, [%0]!\n\t" + "vld1.8 {d4-d7}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]!\n\t" + "vst1.8 {d4-d7}, [%1]\n\t" + : "+r" (src), "+r" (dst) + : + : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + asm volatile ("pld [%0, #64]" : : "r" (src)); + asm volatile ( + "vld1.8 {d0-d3}, [%0]!\n\t" + "vld1.8 {d4-d7}, [%0]!\n\t" + "vld1.8 {d8-d11}, [%0]!\n\t" + "vld1.8 {d12-d15}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]!\n\t" + "vst1.8 {d4-d7}, [%1]!\n\t" + "vst1.8 {d8-d11}, [%1]!\n\t" + "vst1.8 {d12-d15}, [%1]\n\t" + : "+r" (src), "+r" (dst) + : + : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + asm volatile ("pld [%0, #64]" : : "r" (src)); + asm volatile ("pld [%0, #128]" : : "r" (src)); + asm volatile ("pld [%0, #192]" : : "r" (src)); + asm volatile ("pld [%0, #256]" : : "r" (src)); + asm volatile ("pld [%0, #320]" : : "r" (src)); + asm volatile ("pld [%0, #384]" : : "r" (src)); + asm volatile ("pld [%0, #448]" : : "r" (src)); + asm volatile ( + "vld1.8 {d0-d3}, [%0]!\n\t" + "vld1.8 {d4-d7}, [%0]!\n\t" + "vld1.8 {d8-d11}, [%0]!\n\t" + "vld1.8 {d12-d15}, [%0]!\n\t" + "vld1.8 {d16-d19}, [%0]!\n\t" + "vld1.8 {d20-d23}, [%0]!\n\t" + "vld1.8 {d24-d27}, [%0]!\n\t" + "vld1.8 {d28-d31}, [%0]\n\t" + "vst1.8 {d0-d3}, [%1]!\n\t" + "vst1.8 {d4-d7}, [%1]!\n\t" + "vst1.8 {d8-d11}, [%1]!\n\t" + "vst1.8 {d12-d15}, [%1]!\n\t" + "vst1.8 {d16-d19}, [%1]!\n\t" + "vst1.8 {d20-d23}, [%1]!\n\t" + "vst1.8 {d24-d27}, [%1]!\n\t" + "vst1.8 {d28-d31}, [%1]!\n\t" + : "+r" (src), "+r" (dst) + : + : "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"); +} + +#define rte_memcpy(dst, src, n) \ + __extension__ ({ \ + (__builtin_constant_p(n)) ? \ + memcpy((dst), (src), (n)) : \ + rte_memcpy_func((dst), (src), (n)); }) + +static inline void * +rte_memcpy_func(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* We can't copy < 16 bytes using XMM registers so do it manually. 
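+ * For example, n = 7 (binary 111) is copied as one 1-byte, one 2-byte and
+ * one 4-byte scalar store, while n = 8 uses two 4-byte stores because
+ * ARMv7 cannot access an unaligned uint64_t.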
*/ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + dst = (uint8_t *)dst + 1; + src = (const uint8_t *)src + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + dst = (uint16_t *)dst + 1; + src = (const uint16_t *)src + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + dst = (uint32_t *)dst + 1; + src = (const uint32_t *)src + 1; + } + if (n & 0x08) { + /* ARMv7 can not handle unaligned access to long long + * (uint64_t). Therefore two uint32_t operations are + * used. + */ + *(uint32_t *)dst = *(const uint32_t *)src; + dst = (uint32_t *)dst + 1; + src = (const uint32_t *)src + 1; + *(uint32_t *)dst = *(const uint32_t *)src; + } + return ret; + } + + /* Special fast cases for <= 128 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + + if (n <= 128) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + + /* + * For large copies > 128 bytes. This combination of 256, 64 and 16 byte + * copies was found to be faster than doing 128 and 32 byte copies as + * well. + */ + for ( ; n >= 256; n -= 256) { + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 256; + src = (const uint8_t *)src + 256; + } + + /* + * We split the remaining bytes (which will be less than 256) into + * 64byte (2^6) chunks. + * Using incrementing integers in the case labels of a switch statement + * encourages the compiler to use a jump table. To get incrementing + * integers, we shift the 2 relevant bits to the LSB position to first + * get decrementing integers, and then subtract. + */ + switch (3 - (n >> 6)) { + case 0x00: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x01: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x02: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + default: + break; + } + + /* + * We split the remaining bytes (which will be less than 64) into + * 16byte (2^4) chunks, using the same switch structure as above. 
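+ * For example, with n = 40 bytes remaining, 3 - (40 >> 4) = 1: execution
+ * enters case 0x01 and falls through case 0x02, copying two 16-byte chunks,
+ * and the overlapping rte_mov16() after the switch picks up the last 8 bytes.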
+ */ + switch (3 - (n >> 4)) { + case 0x00: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x01: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x02: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + default: + break; + } + + /* Copy any remaining bytes, without going beyond end of buffers */ + if (n != 0) + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; +} + +#else + +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 16); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 32); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 48); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 64); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 128); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 256); +} + +static inline void * +rte_memcpy(void *dst, const void *src, size_t n) +{ + return memcpy(dst, src, n); +} + +#endif /* RTE_ARCH_ARM_NEON_MEMCPY */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h new file mode 100644 index 00000000..beb97a71 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h @@ -0,0 +1,372 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_MEMCPY_ARM64_H_ +#define _RTE_MEMCPY_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "generic/rte_memcpy.h" + +#ifdef RTE_ARCH_ARM64_MEMCPY +#include +#include + +/* + * The memory copy performance differs on different AArch64 micro-architectures. + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy() + * performance compared to old glibc versions. It's always suggested to use a + * more recent glibc if possible, from which the entire system can get benefit. + * + * This implementation improves memory copy on some aarch64 micro-architectures, + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not + * always providing better performance than memcpy() so users need to run unit + * test "memcpy_perf_autotest" and customize parameters in customization section + * below for best performance. + * + * Compiler version will also impact the rte_memcpy() performance. It's observed + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can + * provide better performance than GCC 4.8.5 compiled binaries. 
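+ * As a purely illustrative starting point, a platform might set
+ * RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048 and
+ * RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512 after measuring with
+ * memcpy_perf_autotest; suitable values are entirely platform specific.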
+ */ + +/************************************** + * Beginning of customization section + **************************************/ +#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK +#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1) +#endif + +#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN +/* Only src unalignment will be treated as unaligned copy */ +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \ + ((uintptr_t)(src) & RTE_ARM64_MEMCPY_ALIGN_MASK) +#else +/* Both dst and src unalignment will be treated as unaligned copy */ +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \ + (((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK) +#endif + + +/* + * If copy size is larger than threshold, memcpy() will be used. + * Run "memcpy_perf_autotest" to determine the proper threshold. + */ +#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD +#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \ +(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \ +n <= (size_t)RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) +#else +#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \ +(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src)) +#endif +#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD +#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \ +(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \ +n <= (size_t)RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD) +#else +#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \ +(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src)) +#endif +/* + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform. + */ +#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \ +|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD) +#define USE_RTE_MEMCPY(dst, src, n) \ +(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n)) +#else +#define USE_RTE_MEMCPY(dst, src, n) (1) +#endif +/************************************** + * End of customization section + **************************************/ + + +#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK) +#if (GCC_VERSION < 50400) +#warning "The GCC version is quite old, which may result in sub-optimal \ +performance of the compiled code. It is suggested that at least GCC 5.4.0 \ +be used." 
+#endif +#endif + +static __rte_always_inline +void rte_mov16(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + *dst128 = *src128; +} + +static __rte_always_inline +void rte_mov32(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + const __uint128_t x0 = src128[0], x1 = src128[1]; + dst128[0] = x0; + dst128[1] = x1; +} + +static __rte_always_inline +void rte_mov48(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2]; + dst128[0] = x0; + dst128[1] = x1; + dst128[2] = x2; +} + +static __rte_always_inline +void rte_mov64(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + const __uint128_t + x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3]; + dst128[0] = x0; + dst128[1] = x1; + dst128[2] = x2; + dst128[3] = x3; +} + +static __rte_always_inline +void rte_mov128(uint8_t *dst, const uint8_t *src) +{ + __uint128_t *dst128 = (__uint128_t *)dst; + const __uint128_t *src128 = (const __uint128_t *)src; + /* Keep below declaration & copy sequence for optimized instructions */ + const __uint128_t + x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3]; + dst128[0] = x0; + __uint128_t x4 = src128[4]; + dst128[1] = x1; + __uint128_t x5 = src128[5]; + dst128[2] = x2; + __uint128_t x6 = src128[6]; + dst128[3] = x3; + __uint128_t x7 = src128[7]; + dst128[4] = x4; + dst128[5] = x5; + dst128[6] = x6; + dst128[7] = x7; +} + +static __rte_always_inline +void rte_mov256(uint8_t *dst, const uint8_t *src) +{ + rte_mov128(dst, src); + rte_mov128(dst + 128, src + 128); +} + +static __rte_always_inline void +rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n) +{ + if (n & 0x08) { + /* copy 8 ~ 15 bytes */ + *(uint64_t *)dst = *(const uint64_t *)src; + *(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n); + } else if (n & 0x04) { + /* copy 4 ~ 7 bytes */ + *(uint32_t *)dst = *(const uint32_t *)src; + *(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n); + } else if (n & 0x02) { + /* copy 2 ~ 3 bytes */ + *(uint16_t *)dst = *(const uint16_t *)src; + *(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n); + } else if (n & 0x01) { + /* copy 1 byte */ + *dst = *src; + } +} + +static __rte_always_inline +void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n) +{ + if (n < 64) { + if (n == 16) { + rte_mov16(dst, src); + } else if (n <= 32) { + rte_mov16(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } else if (n <= 48) { + rte_mov32(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } else { + rte_mov48(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } + } else { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + if (n > 48 + 64) + rte_mov64(dst - 64 + n, src - 64 + n); + else if (n > 32 + 64) + rte_mov48(dst - 48 + n, src - 48 + n); + else if (n > 16 + 64) + rte_mov32(dst - 32 + n, src - 32 + n); + else if (n > 64) + rte_mov16(dst - 16 + n, src - 16 + n); + } +} + +static __rte_always_inline +void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n) +{ + do { + rte_mov128(dst, src); + src += 128; + dst += 128; + n -= 128; + } while (likely(n >= 128)); + + if (likely(n)) { + if (n <= 16) + rte_mov16(dst - 16 + n, src - 16 
+ n); + else if (n <= 32) + rte_mov32(dst - 32 + n, src - 32 + n); + else if (n <= 48) + rte_mov48(dst - 48 + n, src - 48 + n); + else if (n <= 64) + rte_mov64(dst - 64 + n, src - 64 + n); + else + rte_memcpy_ge16_lt128(dst, src, n); + } +} + +static __rte_always_inline +void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n) +{ + if (n == 16) { + rte_mov16(dst, src); + } else if (n <= 32) { + rte_mov16(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } else if (n <= 48) { + rte_mov32(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } else { + rte_mov48(dst, src); + rte_mov16(dst - 16 + n, src - 16 + n); + } +} + +static __rte_always_inline +void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n) +{ + do { + rte_mov64(dst, src); + src += 64; + dst += 64; + n -= 64; + } while (likely(n >= 64)); + + if (likely(n)) { + if (n <= 16) + rte_mov16(dst - 16 + n, src - 16 + n); + else if (n <= 32) + rte_mov32(dst - 32 + n, src - 32 + n); + else if (n <= 48) + rte_mov48(dst - 48 + n, src - 48 + n); + else + rte_mov64(dst - 64 + n, src - 64 + n); + } +} + +#if RTE_CACHE_LINE_SIZE >= 128 +static __rte_always_inline +void *rte_memcpy(void *dst, const void *src, size_t n) +{ + if (n < 16) { + rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } + if (n < 128) { + rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } + __builtin_prefetch(src, 0, 0); + __builtin_prefetch(dst, 1, 0); + if (likely(USE_RTE_MEMCPY(dst, src, n))) { + rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } else + return memcpy(dst, src, n); +} + +#else +static __rte_always_inline +void *rte_memcpy(void *dst, const void *src, size_t n) +{ + if (n < 16) { + rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } + if (n < 64) { + rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } + __builtin_prefetch(src, 0, 0); + __builtin_prefetch(dst, 1, 0); + if (likely(USE_RTE_MEMCPY(dst, src, n))) { + rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n); + return dst; + } else + return memcpy(dst, src, n); +} +#endif /* RTE_CACHE_LINE_SIZE >= 128 */ + +#else +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 16); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 32); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 48); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 64); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 128); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 256); +} + +#define rte_memcpy(d, s, n) memcpy((d), (s), (n)) + +#endif /* RTE_ARCH_ARM64_MEMCPY */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_ARM_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause.h new file mode 100644 index 00000000..6c7002ad --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_ARM_H_ +#define _RTE_PAUSE_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef RTE_ARCH_64 +#include +#else +#include +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_ARM_H_ */ diff 
--git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause_32.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause_32.h new file mode 100644 index 00000000..d4768c7a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause_32.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_ARM32_H_ +#define _RTE_PAUSE_ARM32_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "generic/rte_pause.h" + +static inline void rte_pause(void) +{ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause_64.h new file mode 100644 index 00000000..93895d3e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_pause_64.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_ARM64_H_ +#define _RTE_PAUSE_ARM64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "generic/rte_pause.h" + +static inline void rte_pause(void) +{ + asm volatile("yield" ::: "memory"); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch.h new file mode 100644 index 00000000..27870c2a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. + */ + +#ifndef _RTE_PREFETCH_ARM_H_ +#define _RTE_PREFETCH_ARM_H_ + +#ifdef RTE_ARCH_64 +#include +#else +#include +#endif + +#endif /* _RTE_PREFETCH_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h new file mode 100644 index 00000000..e53420a0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. 
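rte_pause() compiles to a single yield on aarch64 and to nothing on 32-bit ARM; it is meant to sit inside busy-wait loops so a spinning core hints the hardware that it is not doing useful work. A hedged sketch of the intended call pattern, with an illustrative stand-in for rte_pause() and a hypothetical ready flag:

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative stand-in for rte_pause(): yield on aarch64, no-op elsewhere. */
static inline void cpu_relax(void)
{
#if defined(__aarch64__)
        asm volatile("yield" ::: "memory");
#endif
}

/* Spin until another thread publishes 'ready' (hypothetical flag). */
static inline void wait_until_ready(atomic_bool *ready)
{
        while (!atomic_load_explicit(ready, memory_order_acquire))
                cpu_relax();
}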
+ */ + +#ifndef _RTE_PREFETCH_ARM32_H_ +#define _RTE_PREFETCH_ARM32_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "generic/rte_prefetch.h" + +static inline void rte_prefetch0(const volatile void *p) +{ + asm volatile ("pld [%0]" : : "r" (p)); +} + +static inline void rte_prefetch1(const volatile void *p) +{ + asm volatile ("pld [%0]" : : "r" (p)); +} + +static inline void rte_prefetch2(const volatile void *p) +{ + asm volatile ("pld [%0]" : : "r" (p)); +} + +static inline void rte_prefetch_non_temporal(const volatile void *p) +{ + /* non-temporal version not available, fallback to rte_prefetch0 */ + rte_prefetch0(p); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PREFETCH_ARM32_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch_64.h new file mode 100644 index 00000000..fc2b391a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_prefetch_64.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_PREFETCH_ARM_64_H_ +#define _RTE_PREFETCH_ARM_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "generic/rte_prefetch.h" + +static inline void rte_prefetch0(const volatile void *p) +{ + asm volatile ("PRFM PLDL1KEEP, [%0]" : : "r" (p)); +} + +static inline void rte_prefetch1(const volatile void *p) +{ + asm volatile ("PRFM PLDL2KEEP, [%0]" : : "r" (p)); +} + +static inline void rte_prefetch2(const volatile void *p) +{ + asm volatile ("PRFM PLDL3KEEP, [%0]" : : "r" (p)); +} + +static inline void rte_prefetch_non_temporal(const volatile void *p) +{ + asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PREFETCH_ARM_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_rwlock.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_rwlock.h new file mode 100644 index 00000000..18bb37b0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_rwlock.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause + */ +/* copied from ppc_64 */ + +#ifndef _RTE_RWLOCK_ARM_H_ +#define _RTE_RWLOCK_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_rwlock.h" + +static inline void +rte_rwlock_read_lock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_read_lock(rwl); +} + +static inline void +rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_read_unlock(rwl); +} + +static inline void +rte_rwlock_write_lock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_write_lock(rwl); +} + +static inline void +rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_write_unlock(rwl); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RWLOCK_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_spinlock.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_spinlock.h new file mode 100644 index 00000000..1a6916b6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_spinlock.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. 
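The pld/PRFM hints above only help if they are issued well before the data is actually needed. A portable sketch of the usual pattern, prefetching a few cache lines ahead of the element currently being processed; __builtin_prefetch stands in for rte_prefetch0(), and the look-ahead distance of 4 lines is an arbitrary illustration, not a DPDK constant:

#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE      64
#define PREFETCH_AHEAD  4       /* tuning knob, chosen here only for illustration */

static uint64_t sum_with_prefetch(const uint64_t *data, size_t n)
{
        uint64_t sum = 0;

        for (size_t i = 0; i < n; i++) {
                /* Hint the data a few lines ahead into cache while we work
                 * on data[i]; out-of-range prefetches are harmless hints. */
                __builtin_prefetch((const char *)&data[i] +
                                   PREFETCH_AHEAD * CACHE_LINE, 0 /* read */, 3);
                sum += data[i];
        }
        return sum;
}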
+ */ + +#ifndef _RTE_SPINLOCK_ARM_H_ +#define _RTE_SPINLOCK_ARM_H_ + +#ifndef RTE_FORCE_INTRINSICS +# error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "generic/rte_spinlock.h" + +static inline int rte_tm_supported(void) +{ + return 0; +} + +static inline void +rte_spinlock_lock_tm(rte_spinlock_t *sl) +{ + rte_spinlock_lock(sl); /* fall-back */ +} + +static inline int +rte_spinlock_trylock_tm(rte_spinlock_t *sl) +{ + return rte_spinlock_trylock(sl); +} + +static inline void +rte_spinlock_unlock_tm(rte_spinlock_t *sl) +{ + rte_spinlock_unlock(sl); +} + +static inline void +rte_spinlock_recursive_lock_tm(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_recursive_lock(slr); /* fall-back */ +} + +static inline void +rte_spinlock_recursive_unlock_tm(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_recursive_unlock(slr); +} + +static inline int +rte_spinlock_recursive_trylock_tm(rte_spinlock_recursive_t *slr) +{ + return rte_spinlock_recursive_trylock(slr); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_SPINLOCK_ARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_vect.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_vect.h new file mode 100644 index 00000000..2a18a685 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/arm/rte_vect.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_VECT_ARM_H_ +#define _RTE_VECT_ARM_H_ + +#include +#include "generic/rte_vect.h" +#include "rte_debug.h" +#include "arm_neon.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int32x4_t xmm_t; + +#define XMM_SIZE (sizeof(xmm_t)) +#define XMM_MASK (XMM_SIZE - 1) + +typedef union rte_xmm { + xmm_t x; + uint8_t u8[XMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[XMM_SIZE / sizeof(uint64_t)]; + double pd[XMM_SIZE / sizeof(double)]; +} __attribute__((aligned(16))) rte_xmm_t; + +#ifdef RTE_ARCH_ARM +/* NEON intrinsic vqtbl1q_u8() is not supported in ARMv7-A(AArch32) */ +static __inline uint8x16_t +vqtbl1q_u8(uint8x16_t a, uint8x16_t b) +{ + uint8_t i, pos; + rte_xmm_t rte_a, rte_b, rte_ret; + + vst1q_u8(rte_a.u8, a); + vst1q_u8(rte_b.u8, b); + + for (i = 0; i < 16; i++) { + pos = rte_b.u8[i]; + if (pos < 16) + rte_ret.u8[i] = rte_a.u8[pos]; + else + rte_ret.u8[i] = 0; + } + + return vld1q_u8(rte_ret.u8); +} + +static inline uint16_t +vaddvq_u16(uint16x8_t a) +{ + uint32x4_t m = vpaddlq_u16(a); + uint64x2_t n = vpaddlq_u32(m); + uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); + + return vget_lane_u32((uint32x2_t)o, 0); +} + +#endif + +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000) +static inline uint32x4_t +vcopyq_laneq_u32(uint32x4_t a, const int lane_a, + uint32x4_t b, const int lane_b) +{ + return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a); +} +#endif + +#if defined(RTE_ARCH_ARM64) +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000) + +#if (GCC_VERSION < 40900) +typedef uint64_t poly64_t; +typedef uint64x2_t poly64x2_t; +typedef uint8_t poly128_t __attribute__((vector_size(16), aligned(16))); + +static inline uint32x4_t +vceqzq_u32(uint32x4_t a) +{ + return (a == 0); +} +#endif + +/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */ +static inline uint64x2_t +vreinterpretq_u64_p128(poly128_t x) +{ + return (uint64x2_t)x; +} + +/* NEON intrinsic vreinterpretq_p64_u64() 
is supported since GCC version 7 */ +static inline poly64x2_t +vreinterpretq_p64_u64(uint64x2_t x) +{ + return (poly64x2_t)x; +} + +/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */ +static inline poly64_t +vgetq_lane_p64(poly64x2_t x, const int lane) +{ + RTE_ASSERT(lane >= 0 && lane <= 1); + + poly64_t *p = (poly64_t *)&x; + + return p[lane]; +} +#endif +#endif + +/* + * If (0 <= index <= 15), then call the ASIMD ext instruction on the + * 128 bit regs v0 and v1 with the appropriate index. + * + * Else returns a zero vector. + */ +static inline uint8x16_t +vextract(uint8x16_t v0, uint8x16_t v1, const int index) +{ + switch (index) { + case 0: return vextq_u8(v0, v1, 0); + case 1: return vextq_u8(v0, v1, 1); + case 2: return vextq_u8(v0, v1, 2); + case 3: return vextq_u8(v0, v1, 3); + case 4: return vextq_u8(v0, v1, 4); + case 5: return vextq_u8(v0, v1, 5); + case 6: return vextq_u8(v0, v1, 6); + case 7: return vextq_u8(v0, v1, 7); + case 8: return vextq_u8(v0, v1, 8); + case 9: return vextq_u8(v0, v1, 9); + case 10: return vextq_u8(v0, v1, 10); + case 11: return vextq_u8(v0, v1, 11); + case 12: return vextq_u8(v0, v1, 12); + case 13: return vextq_u8(v0, v1, 13); + case 14: return vextq_u8(v0, v1, 14); + case 15: return vextq_u8(v0, v1, 15); + } + return vdupq_n_u8(0); +} + +/** + * Shifts right 128 bit register by specified number of bytes + * + * Value of shift parameter must be in range 0 - 16 + */ +static inline uint64x2_t +vshift_bytes_right(uint64x2_t reg, const unsigned int shift) +{ + return vreinterpretq_u64_u8(vextract( + vreinterpretq_u8_u64(reg), + vdupq_n_u8(0), + shift)); +} + +/** + * Shifts left 128 bit register by specified number of bytes + * + * Value of shift parameter must be in range 0 - 16 + */ +static inline uint64x2_t +vshift_bytes_left(uint64x2_t reg, const unsigned int shift) +{ + return vreinterpretq_u64_u8(vextract( + vdupq_n_u8(0), + vreinterpretq_u8_u64(reg), + 16 - shift)); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h new file mode 100644 index 00000000..ce38350b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h @@ -0,0 +1,470 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2014. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
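Because the immediate of the ext instruction must be a compile-time constant, vextract() above dispatches a runtime shift count through that switch. A scalar reference model of vshift_bytes_left()/vshift_bytes_right() is handy for unit-testing the NEON path on any machine; the sketch below matches both helpers for shift counts 1..16 (the degenerate shift of 0 is left aside, since vshift_bytes_left(reg, 0) hits the out-of-range branch of vextract() and returns zero):

#include <stdint.h>
#include <string.h>

/* Reference model: shift a 16-byte register left by 'shift' bytes
 * (towards higher byte indexes), filling vacated bytes with zero. */
static void shift_bytes_left_ref(uint8_t out[16], const uint8_t in[16],
                                 unsigned int shift)
{
        memset(out, 0, 16);
        memcpy(out + shift, in, 16 - shift);
}

/* Reference model: shift right by 'shift' bytes (towards lower byte
 * indexes), again filling with zero. */
static void shift_bytes_right_ref(uint8_t out[16], const uint8_t in[16],
                                  unsigned int shift)
{
        memset(out, 0, 16);
        memcpy(out, in + shift, 16 - shift);
}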
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * Inspired from FreeBSD src/sys/powerpc/include/atomic.h + * Copyright (c) 2008 Marcel Moolenaar + * Copyright (c) 2001 Benno Rice + * Copyright (c) 2001 David E. O'Brien + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + */ + +#ifndef _RTE_ATOMIC_PPC_64_H_ +#define _RTE_ATOMIC_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "generic/rte_atomic.h" + +/** + * General memory barrier. + * + * Guarantees that the LOAD and STORE operations generated before the + * barrier occur before the LOAD and STORE operations generated after. + */ +#define rte_mb() asm volatile("sync" : : : "memory") + +/** + * Write memory barrier. + * + * Guarantees that the STORE operations generated before the barrier + * occur before the STORE operations generated after. + */ +#ifdef RTE_ARCH_64 +#define rte_wmb() asm volatile("lwsync" : : : "memory") +#else +#define rte_wmb() asm volatile("sync" : : : "memory") +#endif + +/** + * Read memory barrier. + * + * Guarantees that the LOAD operations generated before the barrier + * occur before the LOAD operations generated after. + */ +#ifdef RTE_ARCH_64 +#define rte_rmb() asm volatile("lwsync" : : : "memory") +#else +#define rte_rmb() asm volatile("sync" : : : "memory") +#endif + +#define rte_smp_mb() rte_mb() + +#define rte_smp_wmb() rte_wmb() + +#define rte_smp_rmb() rte_rmb() + +#define rte_io_mb() rte_mb() + +#define rte_io_wmb() rte_wmb() + +#define rte_io_rmb() rte_rmb() + +#define rte_cio_wmb() rte_wmb() + +#define rte_cio_rmb() rte_rmb() + +/*------------------------- 16 bit atomic operations -------------------------*/ +/* To be compatible with Power7, use GCC built-in functions for 16 bit + * operations */ + +#ifndef RTE_FORCE_INTRINSICS +static inline int +rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) +{ + return __atomic_compare_exchange(dst, &exp, &src, 0, __ATOMIC_ACQUIRE, + __ATOMIC_ACQUIRE) ? 
1 : 0; +} + +static inline int rte_atomic16_test_and_set(rte_atomic16_t *v) +{ + return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1); +} + +static inline void +rte_atomic16_inc(rte_atomic16_t *v) +{ + __atomic_add_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE); +} + +static inline void +rte_atomic16_dec(rte_atomic16_t *v) +{ + __atomic_sub_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE); +} + +static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v) +{ + return __atomic_add_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE) == 0; +} + +static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v) +{ + return __atomic_sub_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE) == 0; +} + +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ + return __atomic_exchange_2(dst, val, __ATOMIC_SEQ_CST); +} + +/*------------------------- 32 bit atomic operations -------------------------*/ + +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) +{ + unsigned int ret = 0; + + asm volatile( + "\tlwsync\n" + "1:\tlwarx %[ret], 0, %[dst]\n" + "cmplw %[exp], %[ret]\n" + "bne 2f\n" + "stwcx. %[src], 0, %[dst]\n" + "bne- 1b\n" + "li %[ret], 1\n" + "b 3f\n" + "2:\n" + "stwcx. %[ret], 0, %[dst]\n" + "li %[ret], 0\n" + "3:\n" + "isync\n" + : [ret] "=&r" (ret), "=m" (*dst) + : [dst] "r" (dst), + [exp] "r" (exp), + [src] "r" (src), + "m" (*dst) + : "cc", "memory"); + + return ret; +} + +static inline int rte_atomic32_test_and_set(rte_atomic32_t *v) +{ + return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1); +} + +static inline void +rte_atomic32_inc(rte_atomic32_t *v) +{ + int t; + + asm volatile( + "1: lwarx %[t],0,%[cnt]\n" + "addic %[t],%[t],1\n" + "stwcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "=m" (v->cnt) + : [cnt] "r" (&v->cnt), "m" (v->cnt) + : "cc", "xer", "memory"); +} + +static inline void +rte_atomic32_dec(rte_atomic32_t *v) +{ + int t; + + asm volatile( + "1: lwarx %[t],0,%[cnt]\n" + "addic %[t],%[t],-1\n" + "stwcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "=m" (v->cnt) + : [cnt] "r" (&v->cnt), "m" (v->cnt) + : "cc", "xer", "memory"); +} + +static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v) +{ + int ret; + + asm volatile( + "\n\tlwsync\n" + "1: lwarx %[ret],0,%[cnt]\n" + "addic %[ret],%[ret],1\n" + "stwcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [cnt] "r" (&v->cnt) + : "cc", "xer", "memory"); + + return ret == 0; +} + +static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v) +{ + int ret; + + asm volatile( + "\n\tlwsync\n" + "1: lwarx %[ret],0,%[cnt]\n" + "addic %[ret],%[ret],-1\n" + "stwcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [cnt] "r" (&v->cnt) + : "cc", "xer", "memory"); + + return ret == 0; +} + +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ + return __atomic_exchange_4(dst, val, __ATOMIC_SEQ_CST); +} + +/*------------------------- 64 bit atomic operations -------------------------*/ + +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) +{ + unsigned int ret = 0; + + asm volatile ( + "\tlwsync\n" + "1: ldarx %[ret], 0, %[dst]\n" + "cmpld %[exp], %[ret]\n" + "bne 2f\n" + "stdcx. %[src], 0, %[dst]\n" + "bne- 1b\n" + "li %[ret], 1\n" + "b 3f\n" + "2:\n" + "stdcx. 
%[ret], 0, %[dst]\n" + "li %[ret], 0\n" + "3:\n" + "isync\n" + : [ret] "=&r" (ret), "=m" (*dst) + : [dst] "r" (dst), + [exp] "r" (exp), + [src] "r" (src), + "m" (*dst) + : "cc", "memory"); + return ret; +} + +static inline void +rte_atomic64_init(rte_atomic64_t *v) +{ + v->cnt = 0; +} + +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v) +{ + long ret; + + asm volatile("ld%U1%X1 %[ret],%[cnt]" + : [ret] "=r"(ret) + : [cnt] "m"(v->cnt)); + + return ret; +} + +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value) +{ + asm volatile("std%U0%X0 %[new_value],%[cnt]" + : [cnt] "=m"(v->cnt) + : [new_value] "r"(new_value)); +} + +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc) +{ + long t; + + asm volatile( + "1: ldarx %[t],0,%[cnt]\n" + "add %[t],%[inc],%[t]\n" + "stdcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "=m" (v->cnt) + : [cnt] "r" (&v->cnt), [inc] "r" (inc), "m" (v->cnt) + : "cc", "memory"); +} + +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec) +{ + long t; + + asm volatile( + "1: ldarx %[t],0,%[cnt]\n" + "subf %[t],%[dec],%[t]\n" + "stdcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "+m" (v->cnt) + : [cnt] "r" (&v->cnt), [dec] "r" (dec), "m" (v->cnt) + : "cc", "memory"); +} + +static inline void +rte_atomic64_inc(rte_atomic64_t *v) +{ + long t; + + asm volatile( + "1: ldarx %[t],0,%[cnt]\n" + "addic %[t],%[t],1\n" + "stdcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "+m" (v->cnt) + : [cnt] "r" (&v->cnt), "m" (v->cnt) + : "cc", "xer", "memory"); +} + +static inline void +rte_atomic64_dec(rte_atomic64_t *v) +{ + long t; + + asm volatile( + "1: ldarx %[t],0,%[cnt]\n" + "addic %[t],%[t],-1\n" + "stdcx. %[t],0,%[cnt]\n" + "bne- 1b\n" + : [t] "=&r" (t), "+m" (v->cnt) + : [cnt] "r" (&v->cnt), "m" (v->cnt) + : "cc", "xer", "memory"); +} + +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc) +{ + long ret; + + asm volatile( + "\n\tlwsync\n" + "1: ldarx %[ret],0,%[cnt]\n" + "add %[ret],%[inc],%[ret]\n" + "stdcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [inc] "r" (inc), [cnt] "r" (&v->cnt) + : "cc", "memory"); + + return ret; +} + +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec) +{ + long ret; + + asm volatile( + "\n\tlwsync\n" + "1: ldarx %[ret],0,%[cnt]\n" + "subf %[ret],%[dec],%[ret]\n" + "stdcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [dec] "r" (dec), [cnt] "r" (&v->cnt) + : "cc", "memory"); + + return ret; +} + +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v) +{ + long ret; + + asm volatile( + "\n\tlwsync\n" + "1: ldarx %[ret],0,%[cnt]\n" + "addic %[ret],%[ret],1\n" + "stdcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [cnt] "r" (&v->cnt) + : "cc", "xer", "memory"); + + return ret == 0; +} + +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v) +{ + long ret; + + asm volatile( + "\n\tlwsync\n" + "1: ldarx %[ret],0,%[cnt]\n" + "addic %[ret],%[ret],-1\n" + "stdcx. %[ret],0,%[cnt]\n" + "bne- 1b\n" + "isync\n" + : [ret] "=&r" (ret) + : [cnt] "r" (&v->cnt) + : "cc", "xer", "memory"); + + return ret == 0; +} + +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) +{ + return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); +} +/** + * Atomically set a 64-bit counter to 0. + * + * @param v + * A pointer to the atomic counter. 
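Everything above that is not a plain add or subtract is built as a retry loop around the lwarx/stwcx. (or ldarx/stdcx.) compare-and-set. As an illustration of that layering, an atomic "raise to at least this value" helper could be sketched as below; __atomic_compare_exchange_n stands in for rte_atomic32_cmpset() so the snippet stays self-contained, and it mirrors the acquire ordering used by the 16-bit cmpset above:

#include <stdint.h>

/* Illustrative stand-in for rte_atomic32_cmpset(): non-zero if *dst was
 * equal to 'exp' and has been replaced by 'src'. */
static inline int
cmpset32(volatile uint32_t *dst, uint32_t exp, uint32_t src)
{
        return __atomic_compare_exchange_n(dst, &exp, src, 0,
                                           __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
}

/* Atomically raise *dst to at least 'val' (a typical CAS retry loop). */
static inline void
atomic32_max(volatile uint32_t *dst, uint32_t val)
{
        uint32_t cur;

        do {
                cur = *dst;
                if (cur >= val)
                        return;                 /* nothing to do */
        } while (!cmpset32(dst, cur, val));     /* lost the race: retry */
}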
+ */
+static inline void rte_atomic64_clear(rte_atomic64_t *v)
+{
+ v->cnt = 0;
+}
+
+static inline uint64_t
+rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val)
+{
+ return __atomic_exchange_8(dst, val, __ATOMIC_SEQ_CST);
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_ATOMIC_PPC_64_H_ */
diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_byteorder.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_byteorder.h
new file mode 100644
index 00000000..544de3c2
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_byteorder.h
@@ -0,0 +1,150 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) IBM Corporation 2014.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of IBM Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Inspired from FreeBSD src/sys/powerpc/include/endian.h
+ * Copyright (c) 1987, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+*/
+
+#ifndef _RTE_BYTEORDER_PPC_64_H_
+#define _RTE_BYTEORDER_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+#include "generic/rte_byteorder.h"
+
+/*
+ * An architecture-optimized byte swap for a 16-bit value.
+ *
+ * Do not use this function directly. The preferred function is rte_bswap16().
+ */
+static inline uint16_t rte_arch_bswap16(uint16_t _x)
+{
+ return (_x >> 8) | ((_x << 8) & 0xff00);
+}
+
+/*
+ * An architecture-optimized byte swap for a 32-bit value.
+ *
+ * Do not use this function directly. The preferred function is rte_bswap32().
+ */
+static inline uint32_t rte_arch_bswap32(uint32_t _x)
+{
+ return (_x >> 24) | ((_x >> 8) & 0xff00) | ((_x << 8) & 0xff0000) |
+ ((_x << 24) & 0xff000000);
+}
+
+/*
+ * An architecture-optimized byte swap for a 64-bit value.
+ *
+ * Do not use this function directly. The preferred function is rte_bswap64().
+ */ +/* 64-bit mode */ +static inline uint64_t rte_arch_bswap64(uint64_t _x) +{ + return (_x >> 56) | ((_x >> 40) & 0xff00) | ((_x >> 24) & 0xff0000) | + ((_x >> 8) & 0xff000000) | ((_x << 8) & (0xffULL << 32)) | + ((_x << 24) & (0xffULL << 40)) | + ((_x << 40) & (0xffULL << 48)) | ((_x << 56)); +} + +#ifndef RTE_FORCE_INTRINSICS +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) + +#define rte_bswap32(x) ((uint32_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap32(x) : \ + rte_arch_bswap32(x))) + +#define rte_bswap64(x) ((uint64_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap64(x) : \ + rte_arch_bswap64(x))) +#else +/* + * __builtin_bswap16 is only available gcc 4.8 and upwards + */ +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) +#endif +#endif + +/* Power 8 have both little endian and big endian mode + * Power 7 only support big endian + */ +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + +#define rte_cpu_to_le_16(x) (x) +#define rte_cpu_to_le_32(x) (x) +#define rte_cpu_to_le_64(x) (x) + +#define rte_cpu_to_be_16(x) rte_bswap16(x) +#define rte_cpu_to_be_32(x) rte_bswap32(x) +#define rte_cpu_to_be_64(x) rte_bswap64(x) + +#define rte_le_to_cpu_16(x) (x) +#define rte_le_to_cpu_32(x) (x) +#define rte_le_to_cpu_64(x) (x) + +#define rte_be_to_cpu_16(x) rte_bswap16(x) +#define rte_be_to_cpu_32(x) rte_bswap32(x) +#define rte_be_to_cpu_64(x) rte_bswap64(x) + +#else /* RTE_BIG_ENDIAN */ + +#define rte_cpu_to_le_16(x) rte_bswap16(x) +#define rte_cpu_to_le_32(x) rte_bswap32(x) +#define rte_cpu_to_le_64(x) rte_bswap64(x) + +#define rte_cpu_to_be_16(x) (x) +#define rte_cpu_to_be_32(x) (x) +#define rte_cpu_to_be_64(x) (x) + +#define rte_le_to_cpu_16(x) rte_bswap16(x) +#define rte_le_to_cpu_32(x) rte_bswap32(x) +#define rte_le_to_cpu_64(x) rte_bswap64(x) + +#define rte_be_to_cpu_16(x) (x) +#define rte_be_to_cpu_32(x) (x) +#define rte_be_to_cpu_64(x) (x) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BYTEORDER_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_cpuflags.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_cpuflags.h new file mode 100644 index 00000000..7cc2b3c5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_cpuflags.h @@ -0,0 +1,88 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2014. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
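A typical use of the byte-order macros above is writing wire-format (big-endian) fields once and letting the swap compile away on big-endian builds. The sketch below shows the same logic with compiler builtins so it does not depend on the DPDK headers; the function names are made up for the example:

#include <stdint.h>
#include <string.h>

/* Store a 32-bit value into a network-order (big-endian) wire field. */
static inline void put_be32(uint8_t *wire, uint32_t host_val)
{
        uint32_t be;

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        be = __builtin_bswap32(host_val);       /* swap on little endian */
#else
        be = host_val;                          /* already big endian */
#endif
        memcpy(wire, &be, sizeof(be));
}

/* Read the field back into host order. */
static inline uint32_t get_be32(const uint8_t *wire)
{
        uint32_t be;

        memcpy(&be, wire, sizeof(be));
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        return __builtin_bswap32(be);
#else
        return be;
#endif
}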
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _RTE_CPUFLAGS_PPC_64_H_ +#define _RTE_CPUFLAGS_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Enumeration of all CPU features supported + */ +enum rte_cpu_flag_t { + RTE_CPUFLAG_PPC_LE = 0, + RTE_CPUFLAG_TRUE_LE, + RTE_CPUFLAG_PSERIES_PERFMON_COMPAT, + RTE_CPUFLAG_VSX, + RTE_CPUFLAG_ARCH_2_06, + RTE_CPUFLAG_POWER6_EXT, + RTE_CPUFLAG_DFP, + RTE_CPUFLAG_PA6T, + RTE_CPUFLAG_ARCH_2_05, + RTE_CPUFLAG_ICACHE_SNOOP, + RTE_CPUFLAG_SMT, + RTE_CPUFLAG_BOOKE, + RTE_CPUFLAG_CELLBE, + RTE_CPUFLAG_POWER5_PLUS, + RTE_CPUFLAG_POWER5, + RTE_CPUFLAG_POWER4, + RTE_CPUFLAG_NOTB, + RTE_CPUFLAG_EFP_DOUBLE, + RTE_CPUFLAG_EFP_SINGLE, + RTE_CPUFLAG_SPE, + RTE_CPUFLAG_UNIFIED_CACHE, + RTE_CPUFLAG_4xxMAC, + RTE_CPUFLAG_MMU, + RTE_CPUFLAG_FPU, + RTE_CPUFLAG_ALTIVEC, + RTE_CPUFLAG_PPC601, + RTE_CPUFLAG_PPC64, + RTE_CPUFLAG_PPC32, + RTE_CPUFLAG_TAR, + RTE_CPUFLAG_LSEL, + RTE_CPUFLAG_EBB, + RTE_CPUFLAG_DSCR, + RTE_CPUFLAG_HTM, + RTE_CPUFLAG_ARCH_2_07, + /* The last item */ + RTE_CPUFLAG_NUMFLAGS,/**< This should always be the last! */ +}; + +#include "generic/rte_cpuflags.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CPUFLAGS_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_cycles.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_cycles.h new file mode 100644 index 00000000..8fa6fc60 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_cycles.h @@ -0,0 +1,96 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2014. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
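The enum above only names the feature bits; whether a flag is actually set on the running CPU is queried through the generic rte_cpuflags API declared next to it. A short sketch of the usual pattern of selecting a code path during initialization (the Altivec flag and the variable name are just examples, and the exact return-value convention of the query is an assumption):

#include <rte_cpuflags.h>

static int use_altivec_path;    /* hypothetical dispatch switch */

static void select_copy_path(void)
{
        /* rte_cpu_get_flag_enabled() is assumed to return a positive
         * value when the feature is present on this CPU. */
        use_altivec_path = rte_cpu_get_flag_enabled(RTE_CPUFLAG_ALTIVEC) > 0;
}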
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _RTE_CYCLES_PPC_64_H_ +#define _RTE_CYCLES_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_cycles.h" + +#include +#include + +/** + * Read the time base register. + * + * @return + * The time base for this lcore. + */ +static inline uint64_t +rte_rdtsc(void) +{ + union { + uint64_t tsc_64; + RTE_STD_C11 + struct { +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + uint32_t hi_32; + uint32_t lo_32; +#else + uint32_t lo_32; + uint32_t hi_32; +#endif + }; + } tsc; + uint32_t tmp; + + asm volatile( + "0:\n" + "mftbu %[hi32]\n" + "mftb %[lo32]\n" + "mftbu %[tmp]\n" + "cmpw %[tmp],%[hi32]\n" + "bne 0b\n" + : [hi32] "=r"(tsc.hi_32), [lo32] "=r"(tsc.lo_32), + [tmp] "=r"(tmp) + ); + return tsc.tsc_64; +} + +static inline uint64_t +rte_rdtsc_precise(void) +{ + rte_mb(); + return rte_rdtsc(); +} + +static inline uint64_t +rte_get_tsc_cycles(void) { return rte_rdtsc(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CYCLES_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_io.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_io.h new file mode 100644 index 00000000..01455065 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_io.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_PPC_64_H_ +#define _RTE_IO_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_io.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IO_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h new file mode 100644 index 00000000..75f74897 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h @@ -0,0 +1,226 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2014. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
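rte_rdtsc() above reads the time base twice around the low word so that a carry between mftbu and mftb cannot produce a torn value; callers treat the result as a monotonic tick counter. The usual measurement pattern is sketched below with a read_cycles() stand-in; the GCC builtins used for it are assumptions about the toolchain, and converting ticks to seconds would need the timebase/TSC frequency, which is not shown:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for rte_rdtsc(): any monotonic cycle/tick source will do here. */
static inline uint64_t read_cycles(void)
{
#if defined(__powerpc64__)
        return __builtin_ppc_get_timebase();    /* assumed GCC builtin for the time base */
#elif defined(__x86_64__)
        return __builtin_ia32_rdtsc();          /* assumed GCC builtin for RDTSC */
#else
        return 0;                               /* placeholder on other targets */
#endif
}

static void measure(void (*fn)(void))
{
        uint64_t start = read_cycles();

        fn();
        printf("took %llu ticks\n",
               (unsigned long long)(read_cycles() - start));
}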
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _RTE_MEMCPY_PPC_64_H_ +#define _RTE_MEMCPY_PPC_64_H_ + +#include +#include +/*To include altivec.h, GCC version must >= 4.8 */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_memcpy.h" + +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); + vec_vsx_st(vec_vsx_ld(64, src), 64, dst); + vec_vsx_st(vec_vsx_ld(80, src), 80, dst); + vec_vsx_st(vec_vsx_ld(96, src), 96, dst); + vec_vsx_st(vec_vsx_ld(112, src), 112, dst); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + rte_mov128(dst, src); + rte_mov128(dst + 128, src + 128); +} + +#define rte_memcpy(dst, src, n) \ + __extension__ ({ \ + (__builtin_constant_p(n)) ? \ + memcpy((dst), (src), (n)) : \ + rte_memcpy_func((dst), (src), (n)); }) + +static inline void * +rte_memcpy_func(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* We can't copy < 16 bytes using XMM registers so do it manually. */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + dst = (uint8_t *)dst + 1; + src = (const uint8_t *)src + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + dst = (uint16_t *)dst + 1; + src = (const uint16_t *)src + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + dst = (uint32_t *)dst + 1; + src = (const uint32_t *)src + 1; + } + if (n & 0x08) + *(uint64_t *)dst = *(const uint64_t *)src; + return ret; + } + + /* Special fast cases for <= 128 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + + if (n <= 128) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + + /* + * For large copies > 128 bytes. 
This combination of 256, 64 and 16 byte + * copies was found to be faster than doing 128 and 32 byte copies as + * well. + */ + for ( ; n >= 256; n -= 256) { + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 256; + src = (const uint8_t *)src + 256; + } + + /* + * We split the remaining bytes (which will be less than 256) into + * 64byte (2^6) chunks. + * Using incrementing integers in the case labels of a switch statement + * encourages the compiler to use a jump table. To get incrementing + * integers, we shift the 2 relevant bits to the LSB position to first + * get decrementing integers, and then subtract. + */ + switch (3 - (n >> 6)) { + case 0x00: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x01: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x02: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + default: + ; + } + + /* + * We split the remaining bytes (which will be less than 64) into + * 16byte (2^4) chunks, using the same switch structure as above. + */ + switch (3 - (n >> 4)) { + case 0x00: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x01: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x02: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + default: + ; + } + + /* Copy any remaining bytes, without going beyond end of buffers */ + if (n != 0) + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h new file mode 100644 index 00000000..8bd83576 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_PPC64_H_ +#define _RTE_PAUSE_PPC64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_pause.h" + +static inline void rte_pause(void) +{ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_PPC64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_prefetch.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_prefetch.h new file mode 100644 index 00000000..fd2e53b9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_prefetch.h @@ -0,0 +1,68 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2014. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
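The switch trick described in the comment above (shift the relevant size bits down so the case labels increment, and let fall-through do the work) is easier to see in a stripped-down form. The sketch below applies the same dispatch to a 64..255 byte copy and finishes with one overlapping 64-byte tail; it illustrates the technique only and is not the DPDK routine itself:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy 64 <= n < 256 bytes.  The switch dispatches on how many whole
 * 64-byte chunks there are; incrementing case labels with fall-through
 * encourage the compiler to emit a jump table.  The remainder is then
 * covered by one overlapping 64-byte copy anchored at the end. */
static void copy_64_to_255(uint8_t *dst, const uint8_t *src, size_t n)
{
        const size_t rem = n & 63;      /* bytes left after whole chunks */

        switch (3 - (n >> 6)) {
        case 0:                         /* three 64-byte chunks */
                memcpy(dst, src, 64);
                dst += 64; src += 64;
                /* fallthrough */
        case 1:                         /* two chunks */
                memcpy(dst, src, 64);
                dst += 64; src += 64;
                /* fallthrough */
        case 2:                         /* one chunk */
                memcpy(dst, src, 64);
                dst += 64; src += 64;
                /* fallthrough */
        default:
                break;
        }
        if (rem != 0)                   /* overlapping tail, ends at dst + n */
                memcpy(dst - 64 + rem, src - 64 + rem, 64);
}

The real function above keeps a second switch for 16-byte chunks before the final tail, which avoids re-copying up to 48 bytes; the dispatch mechanism is the same.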
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _RTE_PREFETCH_PPC_64_H_ +#define _RTE_PREFETCH_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "generic/rte_prefetch.h" + +static inline void rte_prefetch0(const volatile void *p) +{ + asm volatile ("dcbt 0,%[p],0" : : [p] "r" (p)); +} + +static inline void rte_prefetch1(const volatile void *p) +{ + asm volatile ("dcbt 0,%[p],0" : : [p] "r" (p)); +} + +static inline void rte_prefetch2(const volatile void *p) +{ + asm volatile ("dcbt 0,%[p],0" : : [p] "r" (p)); +} + +static inline void rte_prefetch_non_temporal(const volatile void *p) +{ + /* non-temporal version not available, fallback to rte_prefetch0 */ + rte_prefetch0(p); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PREFETCH_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_rwlock.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_rwlock.h new file mode 100644 index 00000000..9fadc040 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_rwlock.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + */ +#ifndef _RTE_RWLOCK_PPC_64_H_ +#define _RTE_RWLOCK_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_rwlock.h" + +static inline void +rte_rwlock_read_lock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_read_lock(rwl); +} + +static inline void +rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_read_unlock(rwl); +} + +static inline void +rte_rwlock_write_lock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_write_lock(rwl); +} + +static inline void +rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl) +{ + rte_rwlock_write_unlock(rwl); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RWLOCK_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_spinlock.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_spinlock.h new file mode 100644 index 00000000..39815d9e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_spinlock.h @@ -0,0 +1,115 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2014. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _RTE_SPINLOCK_PPC_64_H_ +#define _RTE_SPINLOCK_PPC_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include "generic/rte_spinlock.h" + +/* Fixme: Use intrinsics to implement the spinlock on Power architecture */ + +#ifndef RTE_FORCE_INTRINSICS + +static inline void +rte_spinlock_lock(rte_spinlock_t *sl) +{ + while (__sync_lock_test_and_set(&sl->locked, 1)) + while (sl->locked) + rte_pause(); +} + +static inline void +rte_spinlock_unlock(rte_spinlock_t *sl) +{ + __sync_lock_release(&sl->locked); +} + +static inline int +rte_spinlock_trylock(rte_spinlock_t *sl) +{ + return __sync_lock_test_and_set(&sl->locked, 1) == 0; +} + +#endif + +static inline int rte_tm_supported(void) +{ + return 0; +} + +static inline void +rte_spinlock_lock_tm(rte_spinlock_t *sl) +{ + rte_spinlock_lock(sl); /* fall-back */ +} + +static inline int +rte_spinlock_trylock_tm(rte_spinlock_t *sl) +{ + return rte_spinlock_trylock(sl); +} + +static inline void +rte_spinlock_unlock_tm(rte_spinlock_t *sl) +{ + rte_spinlock_unlock(sl); +} + +static inline void +rte_spinlock_recursive_lock_tm(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_recursive_lock(slr); /* fall-back */ +} + +static inline void +rte_spinlock_recursive_unlock_tm(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_recursive_unlock(slr); +} + +static inline int +rte_spinlock_recursive_trylock_tm(rte_spinlock_recursive_t *slr) +{ + return rte_spinlock_recursive_trylock(slr); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_SPINLOCK_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_vect.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_vect.h new file mode 100644 index 00000000..99586e58 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_vect.h @@ -0,0 +1,61 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2016. 
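With RTE_FORCE_INTRINSICS unset, the lock above is a test-and-test-and-set spinlock: one atomic swap to try to take it, then plain loads while it is held so the cache line is not written on every retry. A self-contained sketch of the same shape, with the type and names invented for the example (rte_pause() would go in the inner loop):

#include <stdbool.h>

typedef struct { volatile int locked; } tiny_spinlock_t;        /* 0 = free */

static inline void tiny_spin_lock(tiny_spinlock_t *sl)
{
        /* __sync_lock_test_and_set() returns the previous value: 0 means
         * we took the lock, 1 means someone else still holds it. */
        while (__sync_lock_test_and_set(&sl->locked, 1))
                while (sl->locked)
                        ;       /* spin on a plain read before retrying the swap */
}

static inline bool tiny_spin_trylock(tiny_spinlock_t *sl)
{
        return __sync_lock_test_and_set(&sl->locked, 1) == 0;
}

static inline void tiny_spin_unlock(tiny_spinlock_t *sl)
{
        __sync_lock_release(&sl->locked);       /* store 0 with release semantics */
}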
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _RTE_VECT_PPC_64_H_ +#define _RTE_VECT_PPC_64_H_ + +#include +#include "generic/rte_vect.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef vector signed int xmm_t; + +#define XMM_SIZE (sizeof(xmm_t)) +#define XMM_MASK (XMM_SIZE - 1) + +typedef union rte_xmm { + xmm_t x; + uint8_t u8[XMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[XMM_SIZE / sizeof(uint64_t)]; + double pd[XMM_SIZE / sizeof(double)]; +} __attribute__((aligned(16))) rte_xmm_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_VECT_PPC_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/meson.build b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/meson.build new file mode 100644 index 00000000..bc8ffea1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/meson.build @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +install_headers( + 'rte_atomic_32.h', + 'rte_atomic_64.h', + 'rte_atomic.h', + 'rte_byteorder_32.h', + 'rte_byteorder_64.h', + 'rte_byteorder.h', + 'rte_cpuflags.h', + 'rte_cycles.h', + 'rte_io.h', + 'rte_memcpy.h', + 'rte_prefetch.h', + 'rte_pause.h', + 'rte_rtm.h', + 'rte_rwlock.h', + 'rte_spinlock.h', + 'rte_vect.h', + subdir: get_option('include_subdir_arch')) diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic.h new file mode 100644 index 00000000..148398f5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ATOMIC_X86_H_ +#define _RTE_ATOMIC_X86_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include "generic/rte_atomic.h" + +#if RTE_MAX_LCORE == 1 +#define MPLOCKED /**< No need to insert MP lock prefix. 
*/
+#else
+#define MPLOCKED "lock ; " /**< Insert MP lock prefix. */
+#endif
+
+#define rte_mb() _mm_mfence()
+
+#define rte_wmb() _mm_sfence()
+
+#define rte_rmb() _mm_lfence()
+
+#define rte_smp_wmb() rte_compiler_barrier()
+
+#define rte_smp_rmb() rte_compiler_barrier()
+
+/*
+ * From Intel Software Development Manual; Vol 3;
+ * 8.2.2 Memory Ordering in P6 and More Recent Processor Families:
+ * ...
+ * . Reads are not reordered with other reads.
+ * . Writes are not reordered with older reads.
+ * . Writes to memory are not reordered with other writes,
+ * with the following exceptions:
+ * . streaming stores (writes) executed with the non-temporal move
+ * instructions (MOVNTI, MOVNTQ, MOVNTDQ, MOVNTPS, and MOVNTPD); and
+ * . string operations (see Section 8.2.4.1).
+ * ...
+ * . Reads may be reordered with older writes to different locations but not
+ * with older writes to the same location.
+ * . Reads or writes cannot be reordered with I/O instructions,
+ * locked instructions, or serializing instructions.
+ * . Reads cannot pass earlier LFENCE and MFENCE instructions.
+ * . Writes ... cannot pass earlier LFENCE, SFENCE, and MFENCE instructions.
+ * . LFENCE instructions cannot pass earlier reads.
+ * . SFENCE instructions cannot pass earlier writes ...
+ * . MFENCE instructions cannot pass earlier reads, writes ...
+ *
+ * As pointed out by the JVM folks, this makes it possible to use lock-prefixed
+ * instructions to get the same effect as mfence, and on most modern HW
+ * that gives better performance than using mfence:
+ * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ * The basic idea is to use a lock-prefixed add with some dummy memory location
+ * as the destination. From their experiments, 128B (2 cache lines) below
+ * the current stack pointer looks like a good candidate.
+ * So below we use that technique for the rte_smp_mb() implementation.
+ */ + +static __rte_always_inline void +rte_smp_mb(void) +{ +#ifdef RTE_ARCH_I686 + asm volatile("lock addl $0, -128(%%esp); " ::: "memory"); +#else + asm volatile("lock addl $0, -128(%%rsp); " ::: "memory"); +#endif +} + +#define rte_io_mb() rte_mb() + +#define rte_io_wmb() rte_compiler_barrier() + +#define rte_io_rmb() rte_compiler_barrier() + +#define rte_cio_wmb() rte_compiler_barrier() + +#define rte_cio_rmb() rte_compiler_barrier() + +/*------------------------- 16 bit atomic operations -------------------------*/ + +#ifndef RTE_FORCE_INTRINSICS +static inline int +rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) +{ + uint8_t res; + + asm volatile( + MPLOCKED + "cmpxchgw %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + return res; +} + +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ + asm volatile( + MPLOCKED + "xchgw %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + +static inline int rte_atomic16_test_and_set(rte_atomic16_t *v) +{ + return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1); +} + +static inline void +rte_atomic16_inc(rte_atomic16_t *v) +{ + asm volatile( + MPLOCKED + "incw %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline void +rte_atomic16_dec(rte_atomic16_t *v) +{ + asm volatile( + MPLOCKED + "decw %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v) +{ + uint8_t ret; + + asm volatile( + MPLOCKED + "incw %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} + +static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v) +{ + uint8_t ret; + + asm volatile(MPLOCKED + "decw %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} + +/*------------------------- 32 bit atomic operations -------------------------*/ + +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) +{ + uint8_t res; + + asm volatile( + MPLOCKED + "cmpxchgl %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + return res; +} + +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ + asm volatile( + MPLOCKED + "xchgl %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + +static inline int rte_atomic32_test_and_set(rte_atomic32_t *v) +{ + return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1); +} + +static inline void +rte_atomic32_inc(rte_atomic32_t *v) +{ + asm volatile( + MPLOCKED + "incl %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline void +rte_atomic32_dec(rte_atomic32_t *v) +{ + asm volatile( + MPLOCKED + "decl %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v) +{ + uint8_t ret; + + asm volatile( + MPLOCKED + "incl %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} + +static inline int 
rte_atomic32_dec_and_test(rte_atomic32_t *v) +{ + uint8_t ret; + + asm volatile(MPLOCKED + "decl %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} +#endif + +#ifdef RTE_ARCH_I686 +#include "rte_atomic_32.h" +#else +#include "rte_atomic_64.h" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ATOMIC_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic_32.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic_32.h new file mode 100644 index 00000000..a932f354 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic_32.h @@ -0,0 +1,243 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Inspired from FreeBSD src/sys/i386/include/atomic.h + * Copyright (c) 1998 Doug Rabson + * All rights reserved. 
+ */ + +#ifndef _RTE_ATOMIC_X86_H_ +#error do not include this file directly, use instead +#endif + +#ifndef _RTE_ATOMIC_I686_H_ +#define _RTE_ATOMIC_I686_H_ + +#include +#include +#include + +/*------------------------- 64 bit atomic operations -------------------------*/ + +#ifndef RTE_FORCE_INTRINSICS +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) +{ + uint8_t res; + RTE_STD_C11 + union { + struct { + uint32_t l32; + uint32_t h32; + }; + uint64_t u64; + } _exp, _src; + + _exp.u64 = exp; + _src.u64 = src; + +#ifndef __PIC__ + asm volatile ( + MPLOCKED + "cmpxchg8b (%[dst]);" + "setz %[res];" + : [res] "=a" (res) /* result in eax */ + : [dst] "S" (dst), /* esi */ + "b" (_src.l32), /* ebx */ + "c" (_src.h32), /* ecx */ + "a" (_exp.l32), /* eax */ + "d" (_exp.h32) /* edx */ + : "memory" ); /* no-clobber list */ +#else + asm volatile ( + "xchgl %%ebx, %%edi;\n" + MPLOCKED + "cmpxchg8b (%[dst]);" + "setz %[res];" + "xchgl %%ebx, %%edi;\n" + : [res] "=a" (res) /* result in eax */ + : [dst] "S" (dst), /* esi */ + "D" (_src.l32), /* ebx */ + "c" (_src.h32), /* ecx */ + "a" (_exp.l32), /* eax */ + "d" (_exp.h32) /* edx */ + : "memory" ); /* no-clobber list */ +#endif + + return res; +} + +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dest, uint64_t val) +{ + uint64_t old; + + do { + old = *dest; + } while (rte_atomic64_cmpset(dest, old, val) == 0); + + return old; +} + +static inline void +rte_atomic64_init(rte_atomic64_t *v) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, 0); + } +} + +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + /* replace the value by itself */ + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp); + } + return tmp; +} + +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, new_value); + } +} + +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp + inc); + } +} + +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp - dec); + } +} + +static inline void +rte_atomic64_inc(rte_atomic64_t *v) +{ + rte_atomic64_add(v, 1); +} + +static inline void +rte_atomic64_dec(rte_atomic64_t *v) +{ + rte_atomic64_sub(v, 1); +} + +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp + inc); + } + + return tmp + inc; +} + +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec) +{ + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp - dec); + } + + return tmp - dec; +} + +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v) +{ + return rte_atomic64_add_return(v, 1) == 0; +} + +static inline int 
rte_atomic64_dec_and_test(rte_atomic64_t *v) +{ + return rte_atomic64_sub_return(v, 1) == 0; +} + +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) +{ + return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); +} + +static inline void rte_atomic64_clear(rte_atomic64_t *v) +{ + rte_atomic64_set(v, 0); +} +#endif + +#endif /* _RTE_ATOMIC_I686_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h new file mode 100644 index 00000000..fd2ec9c5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h @@ -0,0 +1,211 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Inspired from FreeBSD src/sys/amd64/include/atomic.h + * Copyright (c) 1998 Doug Rabson + * All rights reserved. 
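As a usage note for the 32-bit (i686) implementation just above: every 64-bit operation there is built from rte_atomic64_cmpset() retry loops around cmpxchg8b, because plain 64-bit loads and stores are not atomic on that architecture. Callers use the same API either way; the sketch below is hypothetical (the counter and function names are made up).

#include <stdint.h>
#include <rte_atomic.h>

/* Hypothetical shared counter updated from several lcores.  Call
 * rte_atomic64_init(&pkt_bytes) once before use; on x86-64 the calls
 * below become single locked instructions, on i686 they fall back to
 * the compare-and-set loops shown above. */
static rte_atomic64_t pkt_bytes;

static void
account_rx(unsigned int bytes)
{
	rte_atomic64_add(&pkt_bytes, bytes);
}

static int64_t
read_rx_bytes(void)
{
	return rte_atomic64_read(&pkt_bytes);
}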
+ */ + +#ifndef _RTE_ATOMIC_X86_H_ +#error do not include this file directly, use instead +#endif + +#ifndef _RTE_ATOMIC_X86_64_H_ +#define _RTE_ATOMIC_X86_64_H_ + +#include +#include +#include + +/*------------------------- 64 bit atomic operations -------------------------*/ + +#ifndef RTE_FORCE_INTRINSICS +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) +{ + uint8_t res; + + + asm volatile( + MPLOCKED + "cmpxchgq %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + + return res; +} + +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val) +{ + asm volatile( + MPLOCKED + "xchgq %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + +static inline void +rte_atomic64_init(rte_atomic64_t *v) +{ + v->cnt = 0; +} + +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v) +{ + return v->cnt; +} + +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value) +{ + v->cnt = new_value; +} + +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc) +{ + asm volatile( + MPLOCKED + "addq %[inc], %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : [inc] "ir" (inc), /* input */ + "m" (v->cnt) + ); +} + +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec) +{ + asm volatile( + MPLOCKED + "subq %[dec], %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : [dec] "ir" (dec), /* input */ + "m" (v->cnt) + ); +} + +static inline void +rte_atomic64_inc(rte_atomic64_t *v) +{ + asm volatile( + MPLOCKED + "incq %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline void +rte_atomic64_dec(rte_atomic64_t *v) +{ + asm volatile( + MPLOCKED + "decq %[cnt]" + : [cnt] "=m" (v->cnt) /* output */ + : "m" (v->cnt) /* input */ + ); +} + +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc) +{ + int64_t prev = inc; + + asm volatile( + MPLOCKED + "xaddq %[prev], %[cnt]" + : [prev] "+r" (prev), /* output */ + [cnt] "=m" (v->cnt) + : "m" (v->cnt) /* input */ + ); + return prev + inc; +} + +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec) +{ + return rte_atomic64_add_return(v, -dec); +} + +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v) +{ + uint8_t ret; + + asm volatile( + MPLOCKED + "incq %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + + return ret != 0; +} + +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v) +{ + uint8_t ret; + + asm volatile( + MPLOCKED + "decq %[cnt] ; " + "sete %[ret]" + : [cnt] "+m" (v->cnt), /* output */ + [ret] "=qm" (ret) + ); + return ret != 0; +} + +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) +{ + return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); +} + +static inline void rte_atomic64_clear(rte_atomic64_t *v) +{ + v->cnt = 0; +} +#endif + +#endif /* _RTE_ATOMIC_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder.h new file mode 100644 index 00000000..a2dfecc1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_BYTEORDER_X86_H_ 
+#define _RTE_BYTEORDER_X86_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include "generic/rte_byteorder.h" + +#ifndef RTE_BYTE_ORDER +#define RTE_BYTE_ORDER RTE_LITTLE_ENDIAN +#endif + +/* + * An architecture-optimized byte swap for a 16-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap16(). + */ +static inline uint16_t rte_arch_bswap16(uint16_t _x) +{ + uint16_t x = _x; + asm volatile ("xchgb %b[x1],%h[x2]" + : [x1] "=Q" (x) + : [x2] "0" (x) + ); + return x; +} + +/* + * An architecture-optimized byte swap for a 32-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap32(). + */ +static inline uint32_t rte_arch_bswap32(uint32_t _x) +{ + uint32_t x = _x; + asm volatile ("bswap %[x]" + : [x] "+r" (x) + ); + return x; +} + +#ifndef RTE_FORCE_INTRINSICS +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) + +#define rte_bswap32(x) ((uint32_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap32(x) : \ + rte_arch_bswap32(x))) + +#define rte_bswap64(x) ((uint64_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap64(x) : \ + rte_arch_bswap64(x))) +#else +/* + * __builtin_bswap16 is only available gcc 4.8 and upwards + */ +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) +#define rte_bswap16(x) ((uint16_t)(__builtin_constant_p(x) ? \ + rte_constant_bswap16(x) : \ + rte_arch_bswap16(x))) +#endif +#endif + +#define rte_cpu_to_le_16(x) (x) +#define rte_cpu_to_le_32(x) (x) +#define rte_cpu_to_le_64(x) (x) + +#define rte_cpu_to_be_16(x) rte_bswap16(x) +#define rte_cpu_to_be_32(x) rte_bswap32(x) +#define rte_cpu_to_be_64(x) rte_bswap64(x) + +#define rte_le_to_cpu_16(x) (x) +#define rte_le_to_cpu_32(x) (x) +#define rte_le_to_cpu_64(x) (x) + +#define rte_be_to_cpu_16(x) rte_bswap16(x) +#define rte_be_to_cpu_32(x) rte_bswap32(x) +#define rte_be_to_cpu_64(x) rte_bswap64(x) + +#ifdef RTE_ARCH_I686 +#include "rte_byteorder_32.h" +#else +#include "rte_byteorder_64.h" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BYTEORDER_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder_32.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder_32.h new file mode 100644 index 00000000..d5a768e5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder_32.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_BYTEORDER_X86_H_ +#error do not include this file directly, use instead +#endif + +#ifndef _RTE_BYTEORDER_I686_H_ +#define _RTE_BYTEORDER_I686_H_ + +#include +#include + +/* + * An architecture-optimized byte swap for a 64-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap64(). + */ +/* Compat./Leg. 
mode */ +static inline uint64_t rte_arch_bswap64(uint64_t x) +{ + uint64_t ret = 0; + ret |= ((uint64_t)rte_arch_bswap32(x & 0xffffffffUL) << 32); + ret |= ((uint64_t)rte_arch_bswap32((x >> 32) & 0xffffffffUL)); + return ret; +} + +#endif /* _RTE_BYTEORDER_I686_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder_64.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder_64.h new file mode 100644 index 00000000..8c6cf285 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_byteorder_64.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_BYTEORDER_X86_H_ +#error do not include this file directly, use instead +#endif + +#ifndef _RTE_BYTEORDER_X86_64_H_ +#define _RTE_BYTEORDER_X86_64_H_ + +#include +#include + +/* + * An architecture-optimized byte swap for a 64-bit value. + * + * Do not use this function directly. The preferred function is rte_bswap64(). + */ +/* 64-bit mode */ +static inline uint64_t rte_arch_bswap64(uint64_t _x) +{ + uint64_t x = _x; + asm volatile ("bswap %[x]" + : [x] "+r" (x) + ); + return x; +} + +#endif /* _RTE_BYTEORDER_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h new file mode 100644 index 00000000..8315f6b6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_cpuflags.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_CPUFLAGS_X86_64_H_ +#define _RTE_CPUFLAGS_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +enum rte_cpu_flag_t { + /* (EAX 01h) ECX features*/ + RTE_CPUFLAG_SSE3 = 0, /**< SSE3 */ + RTE_CPUFLAG_PCLMULQDQ, /**< PCLMULQDQ */ + RTE_CPUFLAG_DTES64, /**< DTES64 */ + RTE_CPUFLAG_MONITOR, /**< MONITOR */ + RTE_CPUFLAG_DS_CPL, /**< DS_CPL */ + RTE_CPUFLAG_VMX, /**< VMX */ + RTE_CPUFLAG_SMX, /**< SMX */ + RTE_CPUFLAG_EIST, /**< EIST */ + RTE_CPUFLAG_TM2, /**< TM2 */ + RTE_CPUFLAG_SSSE3, /**< SSSE3 */ + RTE_CPUFLAG_CNXT_ID, /**< CNXT_ID */ + RTE_CPUFLAG_FMA, /**< FMA */ + RTE_CPUFLAG_CMPXCHG16B, /**< CMPXCHG16B */ + RTE_CPUFLAG_XTPR, /**< XTPR */ + RTE_CPUFLAG_PDCM, /**< PDCM */ + RTE_CPUFLAG_PCID, /**< PCID */ + RTE_CPUFLAG_DCA, /**< DCA */ + RTE_CPUFLAG_SSE4_1, /**< SSE4_1 */ + RTE_CPUFLAG_SSE4_2, /**< SSE4_2 */ + RTE_CPUFLAG_X2APIC, /**< X2APIC */ + RTE_CPUFLAG_MOVBE, /**< MOVBE */ + RTE_CPUFLAG_POPCNT, /**< POPCNT */ + RTE_CPUFLAG_TSC_DEADLINE, /**< TSC_DEADLINE */ + RTE_CPUFLAG_AES, /**< AES */ + RTE_CPUFLAG_XSAVE, /**< XSAVE */ + RTE_CPUFLAG_OSXSAVE, /**< OSXSAVE */ + RTE_CPUFLAG_AVX, /**< AVX */ + RTE_CPUFLAG_F16C, /**< F16C */ + RTE_CPUFLAG_RDRAND, /**< RDRAND */ + RTE_CPUFLAG_HYPERVISOR, /**< Running in a VM */ + + /* (EAX 01h) EDX features */ + RTE_CPUFLAG_FPU, /**< FPU */ + RTE_CPUFLAG_VME, /**< VME */ + RTE_CPUFLAG_DE, /**< DE */ + RTE_CPUFLAG_PSE, /**< PSE */ + RTE_CPUFLAG_TSC, /**< TSC */ + RTE_CPUFLAG_MSR, /**< MSR */ + RTE_CPUFLAG_PAE, /**< PAE */ + RTE_CPUFLAG_MCE, /**< MCE */ + RTE_CPUFLAG_CX8, /**< CX8 */ + RTE_CPUFLAG_APIC, /**< APIC */ + RTE_CPUFLAG_SEP, /**< SEP */ + RTE_CPUFLAG_MTRR, /**< MTRR */ + RTE_CPUFLAG_PGE, /**< PGE */ + RTE_CPUFLAG_MCA, /**< MCA */ + RTE_CPUFLAG_CMOV, /**< CMOV */ + RTE_CPUFLAG_PAT, /**< PAT */ + RTE_CPUFLAG_PSE36, /**< PSE36 */ + RTE_CPUFLAG_PSN, /**< PSN */ + RTE_CPUFLAG_CLFSH, /**< CLFSH */ + RTE_CPUFLAG_DS, /**< DS */ + RTE_CPUFLAG_ACPI, 
/**< ACPI */ + RTE_CPUFLAG_MMX, /**< MMX */ + RTE_CPUFLAG_FXSR, /**< FXSR */ + RTE_CPUFLAG_SSE, /**< SSE */ + RTE_CPUFLAG_SSE2, /**< SSE2 */ + RTE_CPUFLAG_SS, /**< SS */ + RTE_CPUFLAG_HTT, /**< HTT */ + RTE_CPUFLAG_TM, /**< TM */ + RTE_CPUFLAG_PBE, /**< PBE */ + + /* (EAX 06h) EAX features */ + RTE_CPUFLAG_DIGTEMP, /**< DIGTEMP */ + RTE_CPUFLAG_TRBOBST, /**< TRBOBST */ + RTE_CPUFLAG_ARAT, /**< ARAT */ + RTE_CPUFLAG_PLN, /**< PLN */ + RTE_CPUFLAG_ECMD, /**< ECMD */ + RTE_CPUFLAG_PTM, /**< PTM */ + + /* (EAX 06h) ECX features */ + RTE_CPUFLAG_MPERF_APERF_MSR, /**< MPERF_APERF_MSR */ + RTE_CPUFLAG_ACNT2, /**< ACNT2 */ + RTE_CPUFLAG_ENERGY_EFF, /**< ENERGY_EFF */ + + /* (EAX 07h, ECX 0h) EBX features */ + RTE_CPUFLAG_FSGSBASE, /**< FSGSBASE */ + RTE_CPUFLAG_BMI1, /**< BMI1 */ + RTE_CPUFLAG_HLE, /**< Hardware Lock elision */ + RTE_CPUFLAG_AVX2, /**< AVX2 */ + RTE_CPUFLAG_SMEP, /**< SMEP */ + RTE_CPUFLAG_BMI2, /**< BMI2 */ + RTE_CPUFLAG_ERMS, /**< ERMS */ + RTE_CPUFLAG_INVPCID, /**< INVPCID */ + RTE_CPUFLAG_RTM, /**< Transactional memory */ + RTE_CPUFLAG_AVX512F, /**< AVX512F */ + + /* (EAX 80000001h) ECX features */ + RTE_CPUFLAG_LAHF_SAHF, /**< LAHF_SAHF */ + RTE_CPUFLAG_LZCNT, /**< LZCNT */ + + /* (EAX 80000001h) EDX features */ + RTE_CPUFLAG_SYSCALL, /**< SYSCALL */ + RTE_CPUFLAG_XD, /**< XD */ + RTE_CPUFLAG_1GB_PG, /**< 1GB_PG */ + RTE_CPUFLAG_RDTSCP, /**< RDTSCP */ + RTE_CPUFLAG_EM64T, /**< EM64T */ + + /* (EAX 80000007h) EDX features */ + RTE_CPUFLAG_INVTSC, /**< INVTSC */ + + /* The last item */ + RTE_CPUFLAG_NUMFLAGS, /**< This should always be the last! */ +}; + +#include "generic/rte_cpuflags.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CPUFLAGS_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_cycles.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_cycles.h new file mode 100644 index 00000000..a461a4d7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_cycles.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2013 6WIND S.A. 
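The enum above only names the CPUID feature bits; the runtime query lives in the generic rte_cpuflags.h pulled in at the end of the header. A hedged sketch of the usual dispatch pattern follows; it assumes the generic rte_cpu_get_flag_enabled() API, and the copy_burst_* routines are placeholders.

#include <stddef.h>
#include <rte_cpuflags.h>

/* Placeholder implementations selected once at init time. */
extern void copy_burst_avx2(void *dst, const void *src, size_t n);
extern void copy_burst_sse(void *dst, const void *src, size_t n);

static void (*copy_burst)(void *dst, const void *src, size_t n);

static void
select_copy_routine(void)
{
	/* Pick the AVX2 path only if the running CPU reports the flag. */
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) > 0)
		copy_burst = copy_burst_avx2;
	else
		copy_burst = copy_burst_sse;
}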
+ */ + +#ifndef _RTE_CYCLES_X86_64_H_ +#define _RTE_CYCLES_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_cycles.h" + +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT +/* Global switch to use VMWARE mapping of TSC instead of RDTSC */ +extern int rte_cycles_vmware_tsc_map; +#include +#endif +#include +#include + +static inline uint64_t +rte_rdtsc(void) +{ + union { + uint64_t tsc_64; + RTE_STD_C11 + struct { + uint32_t lo_32; + uint32_t hi_32; + }; + } tsc; + +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + if (unlikely(rte_cycles_vmware_tsc_map)) { + /* ecx = 0x10000 corresponds to the physical TSC for VMware */ + asm volatile("rdpmc" : + "=a" (tsc.lo_32), + "=d" (tsc.hi_32) : + "c"(0x10000)); + return tsc.tsc_64; + } +#endif + + asm volatile("rdtsc" : + "=a" (tsc.lo_32), + "=d" (tsc.hi_32)); + return tsc.tsc_64; +} + +static inline uint64_t +rte_rdtsc_precise(void) +{ + rte_mb(); + return rte_rdtsc(); +} + +static inline uint64_t +rte_get_tsc_cycles(void) { return rte_rdtsc(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CYCLES_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_io.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_io.h new file mode 100644 index 00000000..2db71b1b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_io.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_X86_H_ +#define _RTE_IO_X86_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_io.h" + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IO_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_memcpy.h new file mode 100644 index 00000000..7b758094 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -0,0 +1,876 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MEMCPY_X86_64_H_ +#define _RTE_MEMCPY_X86_64_H_ + +/** + * @file + * + * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy(). + */ + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Copy bytes from one location to another. The locations must not overlap. + * + * @note This is implemented as a macro, so it's address should not be taken + * and care is needed as parameter expressions may be evaluated multiple times. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + * @param n + * Number of bytes to copy. + * @return + * Pointer to the destination data. + */ +static __rte_always_inline void * +rte_memcpy(void *dst, const void *src, size_t n); + +#ifdef RTE_MACHINE_CPUFLAG_AVX512F + +#define ALIGNMENT_MASK 0x3F + +/** + * AVX512 implementation below + */ + +/** + * Copy 16 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm0; + + xmm0 = _mm_loadu_si128((const __m128i *)src); + _mm_storeu_si128((__m128i *)dst, xmm0); +} + +/** + * Copy 32 bytes from one location to another, + * locations should not overlap. 
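rte_rdtsc() above reassembles the EDX:EAX halves of the counter through a union, and rte_rdtsc_precise() issues a full barrier first so earlier work cannot be reordered past the read. A small timing sketch, assuming rte_get_tsc_hz() from the generic rte_cycles.h; the helper name is made up.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <rte_cycles.h>

/* Hypothetical helper: report how long one unit of work takes. */
static void
time_one_call(void (*work)(void))
{
	uint64_t start, cycles;

	start = rte_rdtsc_precise();	/* rte_mb() + rdtsc */
	work();
	cycles = rte_rdtsc() - start;

	printf("work took %" PRIu64 " cycles (%.2f us)\n",
	       cycles, (double)cycles * 1e6 / rte_get_tsc_hz());
}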
+ */ +static __rte_always_inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + __m256i ymm0; + + ymm0 = _mm256_loadu_si256((const __m256i *)src); + _mm256_storeu_si256((__m256i *)dst, ymm0); +} + +/** + * Copy 64 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + __m512i zmm0; + + zmm0 = _mm512_loadu_si512((const void *)src); + _mm512_storeu_si512((void *)dst, zmm0); +} + +/** + * Copy 128 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + rte_mov64(dst + 0 * 64, src + 0 * 64); + rte_mov64(dst + 1 * 64, src + 1 * 64); +} + +/** + * Copy 256 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + rte_mov64(dst + 0 * 64, src + 0 * 64); + rte_mov64(dst + 1 * 64, src + 1 * 64); + rte_mov64(dst + 2 * 64, src + 2 * 64); + rte_mov64(dst + 3 * 64, src + 3 * 64); +} + +/** + * Copy 128-byte blocks from one location to another, + * locations should not overlap. + */ +static inline void +rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) +{ + __m512i zmm0, zmm1; + + while (n >= 128) { + zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64)); + n -= 128; + zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64)); + src = src + 128; + _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0); + _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1); + dst = dst + 128; + } +} + +/** + * Copy 512-byte blocks from one location to another, + * locations should not overlap. + */ +static inline void +rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) +{ + __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7; + + while (n >= 512) { + zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64)); + n -= 512; + zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64)); + zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64)); + zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64)); + zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64)); + zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64)); + zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64)); + zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64)); + src = src + 512; + _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0); + _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1); + _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2); + _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3); + _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4); + _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5); + _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6); + _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7); + dst = dst + 512; + } +} + +static inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) +{ + uintptr_t dstu = (uintptr_t)dst; + uintptr_t srcu = (uintptr_t)src; + void *ret = dst; + size_t dstofss; + size_t bits; + + /** + * Copy less than 16 bytes + */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dstu = *(const uint8_t *)srcu; + srcu = (uintptr_t)((const uint8_t *)srcu + 1); + dstu = (uintptr_t)((uint8_t *)dstu + 1); + } + if (n & 0x02) { + *(uint16_t *)dstu = *(const uint16_t *)srcu; + srcu = (uintptr_t)((const uint16_t *)srcu + 1); + dstu = (uintptr_t)((uint16_t *)dstu + 1); + } + if (n & 0x04) { + *(uint32_t *)dstu = *(const uint32_t *)srcu; + srcu = (uintptr_t)((const uint32_t *)srcu + 1); + dstu = 
(uintptr_t)((uint32_t *)dstu + 1); + } + if (n & 0x08) + *(uint64_t *)dstu = *(const uint64_t *)srcu; + return ret; + } + + /** + * Fast way when copy size doesn't exceed 512 bytes + */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + if (n <= 512) { + if (n >= 256) { + n -= 256; + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 256; + dst = (uint8_t *)dst + 256; + } + if (n >= 128) { + n -= 128; + rte_mov128((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 128; + dst = (uint8_t *)dst + 128; + } +COPY_BLOCK_128_BACK63: + if (n > 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + if (n > 0) + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + + /** + * Make store aligned when copy size exceeds 512 bytes + */ + dstofss = ((uintptr_t)dst & 0x3F); + if (dstofss > 0) { + dstofss = 64 - dstofss; + n -= dstofss; + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + dstofss; + dst = (uint8_t *)dst + dstofss; + } + + /** + * Copy 512-byte blocks. + * Use copy block function for better instruction order control, + * which is important when load is unaligned. + */ + rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n); + bits = n; + n = n & 511; + bits -= n; + src = (const uint8_t *)src + bits; + dst = (uint8_t *)dst + bits; + + /** + * Copy 128-byte blocks. + * Use copy block function for better instruction order control, + * which is important when load is unaligned. + */ + if (n >= 128) { + rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); + bits = n; + n = n & 127; + bits -= n; + src = (const uint8_t *)src + bits; + dst = (uint8_t *)dst + bits; + } + + /** + * Copy whatever left + */ + goto COPY_BLOCK_128_BACK63; +} + +#elif defined RTE_MACHINE_CPUFLAG_AVX2 + +#define ALIGNMENT_MASK 0x1F + +/** + * AVX2 implementation below + */ + +/** + * Copy 16 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm0; + + xmm0 = _mm_loadu_si128((const __m128i *)src); + _mm_storeu_si128((__m128i *)dst, xmm0); +} + +/** + * Copy 32 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + __m256i ymm0; + + ymm0 = _mm256_loadu_si256((const __m256i *)src); + _mm256_storeu_si256((__m256i *)dst, ymm0); +} + +/** + * Copy 64 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +} + +/** + * Copy 128 bytes from one location to another, + * locations should not overlap. 
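One pattern in rte_memcpy_generic() above deserves a call-out: for a size that is not a multiple of the vector width, the code copies one full vector from the start of the buffer and one full vector ending exactly at dst + n, letting the two stores overlap rather than finishing the tail byte by byte. A standalone sketch of the same idea for 16 to 32 byte copies, using plain SSE intrinsics (the function name is illustrative):

#include <stdint.h>
#include <stddef.h>
#include <emmintrin.h>

/* Copy n bytes, 16 <= n <= 32, with two possibly overlapping 16-byte
 * moves: one covering dst[0..15] and one covering dst[n-16..n-1].
 * Together they cover every byte exactly because n <= 32. */
static void
copy_16_to_32(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m128i head = _mm_loadu_si128((const __m128i *)src);
	__m128i tail = _mm_loadu_si128((const __m128i *)(src + n - 16));

	_mm_storeu_si128((__m128i *)dst, head);
	_mm_storeu_si128((__m128i *)(dst + n - 16), tail);
}

Some bytes may be written twice, which is harmless for non-overlapping memcpy semantics and avoids a branchy scalar tail.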
+ */ +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); + rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); + rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); +} + +/** + * Copy 128-byte blocks from one location to another, + * locations should not overlap. + */ +static inline void +rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) +{ + __m256i ymm0, ymm1, ymm2, ymm3; + + while (n >= 128) { + ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); + n -= 128; + ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); + ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32)); + ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32)); + src = (const uint8_t *)src + 128; + _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); + _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); + _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2); + _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3); + dst = (uint8_t *)dst + 128; + } +} + +static inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) +{ + uintptr_t dstu = (uintptr_t)dst; + uintptr_t srcu = (uintptr_t)src; + void *ret = dst; + size_t dstofss; + size_t bits; + + /** + * Copy less than 16 bytes + */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dstu = *(const uint8_t *)srcu; + srcu = (uintptr_t)((const uint8_t *)srcu + 1); + dstu = (uintptr_t)((uint8_t *)dstu + 1); + } + if (n & 0x02) { + *(uint16_t *)dstu = *(const uint16_t *)srcu; + srcu = (uintptr_t)((const uint16_t *)srcu + 1); + dstu = (uintptr_t)((uint16_t *)dstu + 1); + } + if (n & 0x04) { + *(uint32_t *)dstu = *(const uint32_t *)srcu; + srcu = (uintptr_t)((const uint32_t *)srcu + 1); + dstu = (uintptr_t)((uint32_t *)dstu + 1); + } + if (n & 0x08) { + *(uint64_t *)dstu = *(const uint64_t *)srcu; + } + return ret; + } + + /** + * Fast way when copy size doesn't exceed 256 bytes + */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 48) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + if (n <= 256) { + if (n >= 128) { + n -= 128; + rte_mov128((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 128; + dst = (uint8_t *)dst + 128; + } +COPY_BLOCK_128_BACK31: + if (n >= 64) { + n -= 64; + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 64; + dst = (uint8_t *)dst + 64; + } + if (n > 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + if (n > 0) { + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + } + return ret; + } + + /** + * Make store aligned when copy size exceeds 256 bytes + */ + dstofss = (uintptr_t)dst & 0x1F; + if (dstofss > 0) { + dstofss = 32 - dstofss; + n -= dstofss; + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 
dstofss; + dst = (uint8_t *)dst + dstofss; + } + + /** + * Copy 128-byte blocks + */ + rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); + bits = n; + n = n & 127; + bits -= n; + src = (const uint8_t *)src + bits; + dst = (uint8_t *)dst + bits; + + /** + * Copy whatever left + */ + goto COPY_BLOCK_128_BACK31; +} + +#else /* RTE_MACHINE_CPUFLAG */ + +#define ALIGNMENT_MASK 0x0F + +/** + * SSE & AVX implementation below + */ + +/** + * Copy 16 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm0; + + xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src); + _mm_storeu_si128((__m128i *)dst, xmm0); +} + +/** + * Copy 32 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +} + +/** + * Copy 64 bytes from one location to another, + * locations should not overlap. + */ +static __rte_always_inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); + rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); +} + +/** + * Copy 128 bytes from one location to another, + * locations should not overlap. + */ +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); + rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); + rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); + rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); + rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); + rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); +} + +/** + * Copy 256 bytes from one location to another, + * locations should not overlap. 
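The dstofss step in the generic paths above exists to align the destination for the block loops: it measures how far dst sits past an alignment boundary and copies just enough bytes up front so that the bulk copy starts on a boundary. A worked sketch of the arithmetic for the 32-byte (AVX2) case; the helper is illustrative only.

#include <stdint.h>
#include <stddef.h>

/* Example: dst ending in ...05 gives (dst & 0x1F) == 5, so 32 - 5 = 27
 * bytes are consumed by one unaligned 32-byte head copy, after which
 * dst + 27 is 32-byte aligned and the 128-byte block loop can run with
 * an aligned destination. */
static size_t
bytes_until_aligned32(const void *dst)
{
	size_t off = (uintptr_t)dst & 0x1F;

	return off ? 32 - off : 0;
}

The SSE path computes 16 - offset + 16 instead, so the head copy always covers more than 15 bytes, which is the backwards slack the unaligned-load helpers below rely on.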
+ */ +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); + rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); + rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); + rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); + rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); + rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); + rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); + rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); + rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); + rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); + rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); + rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); + rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); + rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); + rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); +} + +/** + * Macro for copying unaligned block from one location to another with constant load offset, + * 47 bytes leftover maximum, + * locations should not overlap. + * Requirements: + * - Store is aligned + * - Load offset is , which must be immediate value within [1, 15] + * - For , make sure bit backwards & <16 - offset> bit forwards are available for loading + * - , , must be variables + * - __m128i ~ must be pre-defined + */ +#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \ +__extension__ ({ \ + size_t tmp; \ + while (len >= 128 + 16 - offset) { \ + xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \ + len -= 128; \ + xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \ + xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \ + xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \ + xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \ + xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \ + xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \ + xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \ + xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \ + src = (const uint8_t *)src + 128; \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ + dst = (uint8_t *)dst + 128; \ + } \ + tmp = len; \ + len = ((len - 16 + offset) & 127) + 16 - offset; \ + tmp -= len; \ + src = (const uint8_t *)src + tmp; \ + dst = (uint8_t 
*)dst + tmp; \ + if (len >= 32 + 16 - offset) { \ + while (len >= 32 + 16 - offset) { \ + xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \ + len -= 32; \ + xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \ + xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \ + src = (const uint8_t *)src + 32; \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ + _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ + dst = (uint8_t *)dst + 32; \ + } \ + tmp = len; \ + len = ((len - 16 + offset) & 31) + 16 - offset; \ + tmp -= len; \ + src = (const uint8_t *)src + tmp; \ + dst = (uint8_t *)dst + tmp; \ + } \ +}) + +/** + * Macro for copying unaligned block from one location to another, + * 47 bytes leftover maximum, + * locations should not overlap. + * Use switch here because the aligning instruction requires immediate value for shift count. + * Requirements: + * - Store is aligned + * - Load offset is , which must be within [1, 15] + * - For , make sure bit backwards & <16 - offset> bit forwards are available for loading + * - , , must be variables + * - __m128i ~ used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined + */ +#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \ +__extension__ ({ \ + switch (offset) { \ + case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ + case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ + case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ + case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \ + case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \ + case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \ + case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \ + case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ + case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ + case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ + case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \ + case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \ + case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \ + case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \ + case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ + default:; \ + } \ +}) + +static inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) +{ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + uintptr_t dstu = (uintptr_t)dst; + uintptr_t srcu = (uintptr_t)src; + void *ret = dst; + size_t dstofss; + size_t srcofs; + + /** + * Copy less than 16 bytes + */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dstu = *(const uint8_t *)srcu; + srcu = (uintptr_t)((const uint8_t *)srcu + 1); + dstu = (uintptr_t)((uint8_t *)dstu + 1); + } + if (n & 0x02) { + *(uint16_t *)dstu = *(const uint16_t *)srcu; + srcu = (uintptr_t)((const uint16_t *)srcu + 1); + dstu = (uintptr_t)((uint16_t *)dstu + 1); + } + if (n & 0x04) { + *(uint32_t *)dstu = *(const uint32_t *)srcu; + srcu = (uintptr_t)((const uint32_t *)srcu + 1); + dstu = (uintptr_t)((uint32_t *)dstu + 1); + } + if (n & 0x08) { + *(uint64_t *)dstu = *(const uint64_t *)srcu; + } + return ret; + } + + /** + * Fast way when copy size doesn't exceed 512 bytes + */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + 
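	/* Editor's note (illustration, not part of the patched file): the
	 * MOVEUNALIGNED_LEFT47 machinery defined above is shaped by the fact
	 * that _mm_alignr_epi8() (PALIGNR) takes its byte-shift count as an
	 * immediate operand, so the runtime source misalignment has to be
	 * dispatched through a switch to one of fifteen constant-offset
	 * expansions.  Each step then does, for example with offset 5:
	 *
	 *     aligned_dst[i] = _mm_alignr_epi8(chunk[i + 1], chunk[i], 5);
	 *
	 * stitching 16 output bytes out of two neighbouring 16-byte loads
	 * taken from aligned source addresses and shifted right by the
	 * misalignment, so the stores to the aligned destination need no
	 * shuffling of their own. */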
return ret; + } + if (n <= 48) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 128) { + goto COPY_BLOCK_128_BACK15; + } + if (n <= 512) { + if (n >= 256) { + n -= 256; + rte_mov128((uint8_t *)dst, (const uint8_t *)src); + rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128); + src = (const uint8_t *)src + 256; + dst = (uint8_t *)dst + 256; + } +COPY_BLOCK_255_BACK15: + if (n >= 128) { + n -= 128; + rte_mov128((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 128; + dst = (uint8_t *)dst + 128; + } +COPY_BLOCK_128_BACK15: + if (n >= 64) { + n -= 64; + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 64; + dst = (uint8_t *)dst + 64; + } +COPY_BLOCK_64_BACK15: + if (n >= 32) { + n -= 32; + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + 32; + dst = (uint8_t *)dst + 32; + } + if (n > 16) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + return ret; + } + if (n > 0) { + rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + } + return ret; + } + + /** + * Make store aligned when copy size exceeds 512 bytes, + * and make sure the first 15 bytes are copied, because + * unaligned copy functions require up to 15 bytes + * backwards access. + */ + dstofss = (uintptr_t)dst & 0x0F; + if (dstofss > 0) { + dstofss = 16 - dstofss + 16; + n -= dstofss; + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + dstofss; + dst = (uint8_t *)dst + dstofss; + } + srcofs = ((uintptr_t)src & 0x0F); + + /** + * For aligned copy + */ + if (srcofs == 0) { + /** + * Copy 256-byte blocks + */ + for (; n >= 256; n -= 256) { + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 256; + src = (const uint8_t *)src + 256; + } + + /** + * Copy whatever left + */ + goto COPY_BLOCK_255_BACK15; + } + + /** + * For copy with unaligned load + */ + MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); + + /** + * Copy whatever left + */ + goto COPY_BLOCK_64_BACK15; +} + +#endif /* RTE_MACHINE_CPUFLAG */ + +static inline void * +rte_memcpy_aligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* Copy size <= 16 bytes */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + src = (const uint8_t *)src + 1; + dst = (uint8_t *)dst + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + src = (const uint16_t *)src + 1; + dst = (uint16_t *)dst + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + src = (const uint32_t *)src + 1; + dst = (uint32_t *)dst + 1; + } + if (n & 0x08) + *(uint64_t *)dst = *(const uint64_t *)src; + + return ret; + } + + /* Copy 16 <= size <= 32 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + + return ret; + } + + /* Copy 32 < size <= 64 bytes */ + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + + return ret; + } + + /* Copy 64 bytes blocks */ + for (; n >= 64; n -= 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); 
+ dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; + } + + /* Copy whatever left */ + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + + return ret; +} + +static inline void * +rte_memcpy(void *dst, const void *src, size_t n) +{ + if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) + return rte_memcpy_aligned(dst, src, n); + else + return rte_memcpy_generic(dst, src, n); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_pause.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_pause.h new file mode 100644 index 00000000..b4cf1df1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_pause.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_X86_H_ +#define _RTE_PAUSE_X86_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_pause.h" + +#include +static inline void rte_pause(void) +{ + _mm_pause(); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PAUSE_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_prefetch.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_prefetch.h new file mode 100644 index 00000000..384c6b3e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_prefetch.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef _RTE_PREFETCH_X86_64_H_ +#define _RTE_PREFETCH_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "generic/rte_prefetch.h" + +static inline void rte_prefetch0(const volatile void *p) +{ + asm volatile ("prefetcht0 %[p]" : : [p] "m" (*(const volatile char *)p)); +} + +static inline void rte_prefetch1(const volatile void *p) +{ + asm volatile ("prefetcht1 %[p]" : : [p] "m" (*(const volatile char *)p)); +} + +static inline void rte_prefetch2(const volatile void *p) +{ + asm volatile ("prefetcht2 %[p]" : : [p] "m" (*(const volatile char *)p)); +} + +static inline void rte_prefetch_non_temporal(const volatile void *p) +{ + asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char *)p)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PREFETCH_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_rtm.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_rtm.h new file mode 100644 index 00000000..ab099952 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_rtm.h @@ -0,0 +1,73 @@ +#ifndef _RTE_RTM_H_ +#define _RTE_RTM_H_ 1 + +/* + * Copyright (c) 2012,2013 Intel Corporation + * Author: Andi Kleen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that: (1) source code distributions + * retain the above copyright notice and this paragraph in its entirety, (2) + * distributions including binary code include the above copyright notice and + * this paragraph in its entirety in the documentation or other materials + * provided with the distribution + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +/* Official RTM intrinsics interface matching gcc/icc, but works + on older gcc compatible compilers and binutils. 
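The prefetch helpers above map one-to-one onto prefetcht0/t1/t2/prefetchnta; the usual pattern is to issue them an iteration or two ahead of the data they cover so the access latency overlaps with useful work. A hedged sketch (the packet structure and handler are hypothetical):

#include <rte_prefetch.h>

struct pkt {
	char data[2048];
};

/* Illustrative: prefetch element i + 1 into all cache levels while
 * element i is being processed. */
static void
process_all(struct pkt **pkts, unsigned int n, void (*handle)(struct pkt *))
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (i + 1 < n)
			rte_prefetch0(pkts[i + 1]->data);
		handle(pkts[i]);
	}
}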
*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +#define RTE_XBEGIN_STARTED (~0u) +#define RTE_XABORT_EXPLICIT (1 << 0) +#define RTE_XABORT_RETRY (1 << 1) +#define RTE_XABORT_CONFLICT (1 << 2) +#define RTE_XABORT_CAPACITY (1 << 3) +#define RTE_XABORT_DEBUG (1 << 4) +#define RTE_XABORT_NESTED (1 << 5) +#define RTE_XABORT_CODE(x) (((x) >> 24) & 0xff) + +static __attribute__((__always_inline__)) inline +unsigned int rte_xbegin(void) +{ + unsigned int ret = RTE_XBEGIN_STARTED; + + asm volatile(".byte 0xc7,0xf8 ; .long 0" : "+a" (ret) :: "memory"); + return ret; +} + +static __attribute__((__always_inline__)) inline +void rte_xend(void) +{ + asm volatile(".byte 0x0f,0x01,0xd5" ::: "memory"); +} + +/* not an inline function to workaround a clang bug with -O0 */ +#define rte_xabort(status) do { \ + asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory"); \ +} while (0) + +static __attribute__((__always_inline__)) inline +int rte_xtest(void) +{ + unsigned char out; + + asm volatile(".byte 0x0f,0x01,0xd6 ; setnz %0" : + "=r" (out) :: "memory"); + return out; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RTM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_rwlock.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_rwlock.h new file mode 100644 index 00000000..eec4c712 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_rwlock.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Intel Corporation + */ + +#ifndef _RTE_RWLOCK_X86_64_H_ +#define _RTE_RWLOCK_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_rwlock.h" +#include "rte_spinlock.h" + +static inline void +rte_rwlock_read_lock_tm(rte_rwlock_t *rwl) +{ + if (likely(rte_try_tm(&rwl->cnt))) + return; + rte_rwlock_read_lock(rwl); +} + +static inline void +rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl) +{ + if (unlikely(rwl->cnt)) + rte_rwlock_read_unlock(rwl); + else + rte_xend(); +} + +static inline void +rte_rwlock_write_lock_tm(rte_rwlock_t *rwl) +{ + if (likely(rte_try_tm(&rwl->cnt))) + return; + rte_rwlock_write_lock(rwl); +} + +static inline void +rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl) +{ + if (unlikely(rwl->cnt)) + rte_rwlock_write_unlock(rwl); + else + rte_xend(); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RWLOCK_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_spinlock.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_spinlock.h new file mode 100644 index 00000000..60321da0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_spinlock.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_SPINLOCK_X86_64_H_ +#define _RTE_SPINLOCK_X86_64_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_spinlock.h" +#include "rte_rtm.h" +#include "rte_cpuflags.h" +#include "rte_branch_prediction.h" +#include "rte_common.h" +#include "rte_pause.h" + +#define RTE_RTM_MAX_RETRIES (10) +#define RTE_XABORT_LOCK_BUSY (0xff) + +#ifndef RTE_FORCE_INTRINSICS +static inline void +rte_spinlock_lock(rte_spinlock_t *sl) +{ + int lock_val = 1; + asm volatile ( + "1:\n" + "xchg %[locked], %[lv]\n" + "test %[lv], %[lv]\n" + "jz 3f\n" + "2:\n" + "pause\n" + "cmpl $0, %[locked]\n" + "jnz 2b\n" + "jmp 1b\n" + "3:\n" + : [locked] "=m" (sl->locked), [lv] "=q" (lock_val) + : "[lv]" (lock_val) + : "memory"); +} + +static inline void +rte_spinlock_unlock 
(rte_spinlock_t *sl) +{ + int unlock_val = 0; + asm volatile ( + "xchg %[locked], %[ulv]\n" + : [locked] "=m" (sl->locked), [ulv] "=q" (unlock_val) + : "[ulv]" (unlock_val) + : "memory"); +} + +static inline int +rte_spinlock_trylock (rte_spinlock_t *sl) +{ + int lockval = 1; + + asm volatile ( + "xchg %[locked], %[lockval]" + : [locked] "=m" (sl->locked), [lockval] "=q" (lockval) + : "[lockval]" (lockval) + : "memory"); + + return lockval == 0; +} +#endif + +extern uint8_t rte_rtm_supported; + +static inline int rte_tm_supported(void) +{ + return rte_rtm_supported; +} + +static inline int +rte_try_tm(volatile int *lock) +{ + int retries; + + if (!rte_rtm_supported) + return 0; + + retries = RTE_RTM_MAX_RETRIES; + + while (likely(retries--)) { + + unsigned int status = rte_xbegin(); + + if (likely(RTE_XBEGIN_STARTED == status)) { + if (unlikely(*lock)) + rte_xabort(RTE_XABORT_LOCK_BUSY); + else + return 1; + } + while (*lock) + rte_pause(); + + if ((status & RTE_XABORT_EXPLICIT) && + (RTE_XABORT_CODE(status) == RTE_XABORT_LOCK_BUSY)) + continue; + + if ((status & RTE_XABORT_RETRY) == 0) /* do not retry */ + break; + } + return 0; +} + +static inline void +rte_spinlock_lock_tm(rte_spinlock_t *sl) +{ + if (likely(rte_try_tm(&sl->locked))) + return; + + rte_spinlock_lock(sl); /* fall-back */ +} + +static inline int +rte_spinlock_trylock_tm(rte_spinlock_t *sl) +{ + if (likely(rte_try_tm(&sl->locked))) + return 1; + + return rte_spinlock_trylock(sl); +} + +static inline void +rte_spinlock_unlock_tm(rte_spinlock_t *sl) +{ + if (unlikely(sl->locked)) + rte_spinlock_unlock(sl); + else + rte_xend(); +} + +static inline void +rte_spinlock_recursive_lock_tm(rte_spinlock_recursive_t *slr) +{ + if (likely(rte_try_tm(&slr->sl.locked))) + return; + + rte_spinlock_recursive_lock(slr); /* fall-back */ +} + +static inline void +rte_spinlock_recursive_unlock_tm(rte_spinlock_recursive_t *slr) +{ + if (unlikely(slr->sl.locked)) + rte_spinlock_recursive_unlock(slr); + else + rte_xend(); +} + +static inline int +rte_spinlock_recursive_trylock_tm(rte_spinlock_recursive_t *slr) +{ + if (likely(rte_try_tm(&slr->sl.locked))) + return 1; + + return rte_spinlock_recursive_trylock(slr); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_SPINLOCK_X86_64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_vect.h b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_vect.h new file mode 100644 index 00000000..cf4e9db3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/arch/x86/rte_vect.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef _RTE_VECT_X86_H_ +#define _RTE_VECT_X86_H_ + +/** + * @file + * + * RTE SSE/AVX related header. 
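The *_tm lock variants above first try to elide the lock inside an RTM transaction (rte_try_tm() retries up to RTE_RTM_MAX_RETRIES times and aborts explicitly if the lock word is already set) and only fall back to the real lock when that fails. A usage sketch; the statistics counter is made up and RTE_SPINLOCK_INITIALIZER is assumed from the generic rte_spinlock.h.

#include <stdint.h>
#include <rte_spinlock.h>

static rte_spinlock_t stats_lock = RTE_SPINLOCK_INITIALIZER;
static uint64_t stats_bytes;

/* On RTM-capable CPUs the uncontended case commits a transaction without
 * ever writing the lock word; otherwise this behaves exactly like a
 * normal spinlock critical section. */
static void
account_bytes(uint64_t n)
{
	rte_spinlock_lock_tm(&stats_lock);
	stats_bytes += n;
	rte_spinlock_unlock_tm(&stats_lock);
}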
+ */ + +#include +#include +#include "generic/rte_vect.h" + +#if (defined(__ICC) || (__GNUC__ == 4 && __GNUC_MINOR__ < 4)) + +#include /* SSE4 */ + +#if defined(__AVX__) +#include +#endif + +#else + +#include + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef __m128i xmm_t; + +#define XMM_SIZE (sizeof(xmm_t)) +#define XMM_MASK (XMM_SIZE - 1) + +typedef union rte_xmm { + xmm_t x; + uint8_t u8[XMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[XMM_SIZE / sizeof(uint64_t)]; + double pd[XMM_SIZE / sizeof(double)]; +} rte_xmm_t; + +#ifdef __AVX__ + +typedef __m256i ymm_t; + +#define YMM_SIZE (sizeof(ymm_t)) +#define YMM_MASK (YMM_SIZE - 1) + +typedef union rte_ymm { + ymm_t y; + xmm_t x[YMM_SIZE / sizeof(xmm_t)]; + uint8_t u8[YMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[YMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[YMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[YMM_SIZE / sizeof(uint64_t)]; + double pd[YMM_SIZE / sizeof(double)]; +} rte_ymm_t; + +#endif /* __AVX__ */ + +#ifdef RTE_ARCH_I686 +#define _mm_cvtsi128_si64(a) \ +__extension__ ({ \ + rte_xmm_t m; \ + m.x = (a); \ + (m.u64[0]); \ +}) +#endif + +/* + * Prior to version 12.1 icc doesn't support _mm_set_epi64x. + */ +#if (defined(__ICC) && __ICC < 1210) +#define _mm_set_epi64x(a, b) \ +__extension__ ({ \ + rte_xmm_t m; \ + m.u64[0] = b; \ + m.u64[1] = a; \ + (m.x); \ +}) +#endif /* (defined(__ICC) && __ICC < 1210) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_VECT_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_atomic.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_atomic.h new file mode 100644 index 00000000..b99ba468 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_atomic.h @@ -0,0 +1,1085 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ATOMIC_H_ +#define _RTE_ATOMIC_H_ + +/** + * @file + * Atomic Operations + * + * This file defines a generic API for atomic operations. + */ + +#include +#include + +#ifdef __DOXYGEN__ + +/** @name Memory Barrier + */ +///@{ +/** + * General memory barrier. + * + * Guarantees that the LOAD and STORE operations generated before the + * barrier occur before the LOAD and STORE operations generated after. + * This function is architecture dependent. + */ +static inline void rte_mb(void); + +/** + * Write memory barrier. + * + * Guarantees that the STORE operations generated before the barrier + * occur before the STORE operations generated after. + * This function is architecture dependent. + */ +static inline void rte_wmb(void); + +/** + * Read memory barrier. + * + * Guarantees that the LOAD operations generated before the barrier + * occur before the LOAD operations generated after. + * This function is architecture dependent. + */ +static inline void rte_rmb(void); +///@} + +/** @name SMP Memory Barrier + */ +///@{ +/** + * General memory barrier between lcores + * + * Guarantees that the LOAD and STORE operations that precede the + * rte_smp_mb() call are globally visible across the lcores + * before the LOAD and STORE operations that follows it. + */ +static inline void rte_smp_mb(void); + +/** + * Write memory barrier between lcores + * + * Guarantees that the STORE operations that precede the + * rte_smp_wmb() call are globally visible across the lcores + * before the STORE operations that follows it. 
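+ *
+ * Illustrative producer-side sketch (the shared variables 'data' and
+ * 'ready' are hypothetical):
+ *
+ * @code
+ * data = 42;       // payload store
+ * rte_smp_wmb();   // order the payload before the flag
+ * ready = 1;       // publication store seen by other lcores
+ * @endcode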
+ */ +static inline void rte_smp_wmb(void); + +/** + * Read memory barrier between lcores + * + * Guarantees that the LOAD operations that precede the + * rte_smp_rmb() call are globally visible across the lcores + * before the LOAD operations that follows it. + */ +static inline void rte_smp_rmb(void); +///@} + +/** @name I/O Memory Barrier + */ +///@{ +/** + * General memory barrier for I/O device + * + * Guarantees that the LOAD and STORE operations that precede the + * rte_io_mb() call are visible to I/O device or CPU before the + * LOAD and STORE operations that follow it. + */ +static inline void rte_io_mb(void); + +/** + * Write memory barrier for I/O device + * + * Guarantees that the STORE operations that precede the + * rte_io_wmb() call are visible to I/O device before the STORE + * operations that follow it. + */ +static inline void rte_io_wmb(void); + +/** + * Read memory barrier for IO device + * + * Guarantees that the LOAD operations on I/O device that precede the + * rte_io_rmb() call are visible to CPU before the LOAD + * operations that follow it. + */ +static inline void rte_io_rmb(void); +///@} + +/** @name Coherent I/O Memory Barrier + * + * Coherent I/O memory barrier is a lightweight version of I/O memory + * barriers which are system-wide data synchronization barriers. This + * is for only coherent memory domain between lcore and I/O device but + * it is same as the I/O memory barriers in most of architectures. + * However, some architecture provides even lighter barriers which are + * somewhere in between I/O memory barriers and SMP memory barriers. + * For example, in case of ARMv8, DMB(data memory barrier) instruction + * can have different shareability domains - inner-shareable and + * outer-shareable. And inner-shareable DMB fits for SMP memory + * barriers and outer-shareable DMB for coherent I/O memory barriers, + * which acts on coherent memory. + * + * In most cases, I/O memory barriers are safer but if operations are + * on coherent memory instead of incoherent MMIO region of a device, + * then coherent I/O memory barriers can be used and this could bring + * performance gain depending on architectures. + */ +///@{ +/** + * Write memory barrier for coherent memory between lcore and I/O device + * + * Guarantees that the STORE operations on coherent memory that + * precede the rte_cio_wmb() call are visible to I/O device before the + * STORE operations that follow it. + */ +static inline void rte_cio_wmb(void); + +/** + * Read memory barrier for coherent memory between lcore and I/O device + * + * Guarantees that the LOAD operations on coherent memory updated by + * I/O device that precede the rte_cio_rmb() call are visible to CPU + * before the LOAD operations that follow it. + */ +static inline void rte_cio_rmb(void); +///@} + +#endif /* __DOXYGEN__ */ + +/** + * Compiler barrier. + * + * Guarantees that operation reordering does not occur at compile time + * for operations directly before and after the barrier. + */ +#define rte_compiler_barrier() do { \ + asm volatile ("" : : : "memory"); \ +} while(0) + +/*------------------------- 16 bit atomic operations -------------------------*/ + +/** + * Atomic compare and set. + * + * (atomic) equivalent to: + * if (*dst == exp) + * *dst = src (all 16-bit words) + * + * @param dst + * The destination location into which the value will be written. + * @param exp + * The expected value. + * @param src + * The new value. + * @return + * Non-zero on success; 0 on failure. 
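+ *
+ * Illustrative compare-and-set retry loop (a sketch; 'counter' is a
+ * hypothetical shared uint16_t):
+ *
+ * @code
+ * uint16_t old, newval;
+ *
+ * do {
+ *         old = counter;
+ *         newval = old + 1;
+ * } while (rte_atomic16_cmpset(&counter, old, newval) == 0);
+ * @endcode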
+ */ +static inline int +rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src); + +#ifdef RTE_FORCE_INTRINSICS +static inline int +rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) +{ + return __sync_bool_compare_and_swap(dst, exp, src); +} +#endif + +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. + * @return + * The original value at that location + */ +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ +#if defined(RTE_ARCH_ARM64) && defined(RTE_TOOLCHAIN_CLANG) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_2(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + +/** + * The atomic counter structure. + */ +typedef struct { + volatile int16_t cnt; /**< An internal counter value. */ +} rte_atomic16_t; + +/** + * Static initializer for an atomic counter. + */ +#define RTE_ATOMIC16_INIT(val) { (val) } + +/** + * Initialize an atomic counter. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic16_init(rte_atomic16_t *v) +{ + v->cnt = 0; +} + +/** + * Atomically read a 16-bit value from a counter. + * + * @param v + * A pointer to the atomic counter. + * @return + * The value of the counter. + */ +static inline int16_t +rte_atomic16_read(const rte_atomic16_t *v) +{ + return v->cnt; +} + +/** + * Atomically set a counter to a 16-bit value. + * + * @param v + * A pointer to the atomic counter. + * @param new_value + * The new value for the counter. + */ +static inline void +rte_atomic16_set(rte_atomic16_t *v, int16_t new_value) +{ + v->cnt = new_value; +} + +/** + * Atomically add a 16-bit value to an atomic counter. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + */ +static inline void +rte_atomic16_add(rte_atomic16_t *v, int16_t inc) +{ + __sync_fetch_and_add(&v->cnt, inc); +} + +/** + * Atomically subtract a 16-bit value from an atomic counter. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + */ +static inline void +rte_atomic16_sub(rte_atomic16_t *v, int16_t dec) +{ + __sync_fetch_and_sub(&v->cnt, dec); +} + +/** + * Atomically increment a counter by one. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic16_inc(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic16_inc(rte_atomic16_t *v) +{ + rte_atomic16_add(v, 1); +} +#endif + +/** + * Atomically decrement a counter by one. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic16_dec(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic16_dec(rte_atomic16_t *v) +{ + rte_atomic16_sub(v, 1); +} +#endif + +/** + * Atomically add a 16-bit value to a counter and return the result. + * + * Atomically adds the 16-bits value (inc) to the atomic counter (v) and + * returns the value of v after addition. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + * @return + * The value of v after the addition. 
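+ *
+ * Illustrative use as a reference count (a sketch; the counter variable
+ * and its initial value are examples only):
+ *
+ * @code
+ * rte_atomic16_t refcnt = RTE_ATOMIC16_INIT(1);
+ *
+ * int16_t refs = rte_atomic16_add_return(&refcnt, 1);  // take a reference
+ * @endcode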
+ */ +static inline int16_t +rte_atomic16_add_return(rte_atomic16_t *v, int16_t inc) +{ + return __sync_add_and_fetch(&v->cnt, inc); +} + +/** + * Atomically subtract a 16-bit value from a counter and return + * the result. + * + * Atomically subtracts the 16-bit value (inc) from the atomic counter + * (v) and returns the value of v after the subtraction. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + * @return + * The value of v after the subtraction. + */ +static inline int16_t +rte_atomic16_sub_return(rte_atomic16_t *v, int16_t dec) +{ + return __sync_sub_and_fetch(&v->cnt, dec); +} + +/** + * Atomically increment a 16-bit counter by one and test. + * + * Atomically increments the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the increment operation is 0; false otherwise. + */ +static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic16_inc_and_test(rte_atomic16_t *v) +{ + return __sync_add_and_fetch(&v->cnt, 1) == 0; +} +#endif + +/** + * Atomically decrement a 16-bit counter by one and test. + * + * Atomically decrements the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the decrement operation is 0; false otherwise. + */ +static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v) +{ + return __sync_sub_and_fetch(&v->cnt, 1) == 0; +} +#endif + +/** + * Atomically test and set a 16-bit atomic counter. + * + * If the counter value is already set, return 0 (failed). Otherwise, set + * the counter value to 1 and return 1 (success). + * + * @param v + * A pointer to the atomic counter. + * @return + * 0 if failed; else 1, success. + */ +static inline int rte_atomic16_test_and_set(rte_atomic16_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic16_test_and_set(rte_atomic16_t *v) +{ + return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1); +} +#endif + +/** + * Atomically set a 16-bit counter to 0. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void rte_atomic16_clear(rte_atomic16_t *v) +{ + v->cnt = 0; +} + +/*------------------------- 32 bit atomic operations -------------------------*/ + +/** + * Atomic compare and set. + * + * (atomic) equivalent to: + * if (*dst == exp) + * *dst = src (all 32-bit words) + * + * @param dst + * The destination location into which the value will be written. + * @param exp + * The expected value. + * @param src + * The new value. + * @return + * Non-zero on success; 0 on failure. + */ +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src); + +#ifdef RTE_FORCE_INTRINSICS +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) +{ + return __sync_bool_compare_and_swap(dst, exp, src); +} +#endif + +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. 
+ * @return + * The original value at that location + */ +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ +#if defined(RTE_ARCH_ARM64) && defined(RTE_TOOLCHAIN_CLANG) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_4(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + +/** + * The atomic counter structure. + */ +typedef struct { + volatile int32_t cnt; /**< An internal counter value. */ +} rte_atomic32_t; + +/** + * Static initializer for an atomic counter. + */ +#define RTE_ATOMIC32_INIT(val) { (val) } + +/** + * Initialize an atomic counter. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic32_init(rte_atomic32_t *v) +{ + v->cnt = 0; +} + +/** + * Atomically read a 32-bit value from a counter. + * + * @param v + * A pointer to the atomic counter. + * @return + * The value of the counter. + */ +static inline int32_t +rte_atomic32_read(const rte_atomic32_t *v) +{ + return v->cnt; +} + +/** + * Atomically set a counter to a 32-bit value. + * + * @param v + * A pointer to the atomic counter. + * @param new_value + * The new value for the counter. + */ +static inline void +rte_atomic32_set(rte_atomic32_t *v, int32_t new_value) +{ + v->cnt = new_value; +} + +/** + * Atomically add a 32-bit value to an atomic counter. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + */ +static inline void +rte_atomic32_add(rte_atomic32_t *v, int32_t inc) +{ + __sync_fetch_and_add(&v->cnt, inc); +} + +/** + * Atomically subtract a 32-bit value from an atomic counter. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + */ +static inline void +rte_atomic32_sub(rte_atomic32_t *v, int32_t dec) +{ + __sync_fetch_and_sub(&v->cnt, dec); +} + +/** + * Atomically increment a counter by one. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic32_inc(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic32_inc(rte_atomic32_t *v) +{ + rte_atomic32_add(v, 1); +} +#endif + +/** + * Atomically decrement a counter by one. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic32_dec(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic32_dec(rte_atomic32_t *v) +{ + rte_atomic32_sub(v,1); +} +#endif + +/** + * Atomically add a 32-bit value to a counter and return the result. + * + * Atomically adds the 32-bits value (inc) to the atomic counter (v) and + * returns the value of v after addition. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + * @return + * The value of v after the addition. + */ +static inline int32_t +rte_atomic32_add_return(rte_atomic32_t *v, int32_t inc) +{ + return __sync_add_and_fetch(&v->cnt, inc); +} + +/** + * Atomically subtract a 32-bit value from a counter and return + * the result. + * + * Atomically subtracts the 32-bit value (inc) from the atomic counter + * (v) and returns the value of v after the subtraction. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + * @return + * The value of v after the subtraction. 
+ */ +static inline int32_t +rte_atomic32_sub_return(rte_atomic32_t *v, int32_t dec) +{ + return __sync_sub_and_fetch(&v->cnt, dec); +} + +/** + * Atomically increment a 32-bit counter by one and test. + * + * Atomically increments the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the increment operation is 0; false otherwise. + */ +static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic32_inc_and_test(rte_atomic32_t *v) +{ + return __sync_add_and_fetch(&v->cnt, 1) == 0; +} +#endif + +/** + * Atomically decrement a 32-bit counter by one and test. + * + * Atomically decrements the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the decrement operation is 0; false otherwise. + */ +static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v) +{ + return __sync_sub_and_fetch(&v->cnt, 1) == 0; +} +#endif + +/** + * Atomically test and set a 32-bit atomic counter. + * + * If the counter value is already set, return 0 (failed). Otherwise, set + * the counter value to 1 and return 1 (success). + * + * @param v + * A pointer to the atomic counter. + * @return + * 0 if failed; else 1, success. + */ +static inline int rte_atomic32_test_and_set(rte_atomic32_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic32_test_and_set(rte_atomic32_t *v) +{ + return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1); +} +#endif + +/** + * Atomically set a 32-bit counter to 0. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void rte_atomic32_clear(rte_atomic32_t *v) +{ + v->cnt = 0; +} + +/*------------------------- 64 bit atomic operations -------------------------*/ + +/** + * An atomic compare and set function used by the mutex functions. + * (atomic) equivalent to: + * if (*dst == exp) + * *dst = src (all 64-bit words) + * + * @param dst + * The destination into which the value will be written. + * @param exp + * The expected value. + * @param src + * The new value. + * @return + * Non-zero on success; 0 on failure. + */ +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src); + +#ifdef RTE_FORCE_INTRINSICS +static inline int +rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) +{ + return __sync_bool_compare_and_swap(dst, exp, src); +} +#endif + +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. + * @return + * The original value at that location + */ +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val) +{ +#if defined(RTE_ARCH_ARM64) && defined(RTE_TOOLCHAIN_CLANG) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_8(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + +/** + * The atomic counter structure. + */ +typedef struct { + volatile int64_t cnt; /**< Internal counter value. 
*/ +} rte_atomic64_t; + +/** + * Static initializer for an atomic counter. + */ +#define RTE_ATOMIC64_INIT(val) { (val) } + +/** + * Initialize the atomic counter. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic64_init(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_init(rte_atomic64_t *v) +{ +#ifdef __LP64__ + v->cnt = 0; +#else + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, 0); + } +#endif +} +#endif + +/** + * Atomically read a 64-bit counter. + * + * @param v + * A pointer to the atomic counter. + * @return + * The value of the counter. + */ +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int64_t +rte_atomic64_read(rte_atomic64_t *v) +{ +#ifdef __LP64__ + return v->cnt; +#else + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + /* replace the value by itself */ + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, tmp); + } + return tmp; +#endif +} +#endif + +/** + * Atomically set a 64-bit counter. + * + * @param v + * A pointer to the atomic counter. + * @param new_value + * The new value of the counter. + */ +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_set(rte_atomic64_t *v, int64_t new_value) +{ +#ifdef __LP64__ + v->cnt = new_value; +#else + int success = 0; + uint64_t tmp; + + while (success == 0) { + tmp = v->cnt; + success = rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, + tmp, new_value); + } +#endif +} +#endif + +/** + * Atomically add a 64-bit value to a counter. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + */ +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_add(rte_atomic64_t *v, int64_t inc) +{ + __sync_fetch_and_add(&v->cnt, inc); +} +#endif + +/** + * Atomically subtract a 64-bit value from a counter. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + */ +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_sub(rte_atomic64_t *v, int64_t dec) +{ + __sync_fetch_and_sub(&v->cnt, dec); +} +#endif + +/** + * Atomically increment a 64-bit counter by one and test. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic64_inc(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_inc(rte_atomic64_t *v) +{ + rte_atomic64_add(v, 1); +} +#endif + +/** + * Atomically decrement a 64-bit counter by one and test. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void +rte_atomic64_dec(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_atomic64_dec(rte_atomic64_t *v) +{ + rte_atomic64_sub(v, 1); +} +#endif + +/** + * Add a 64-bit value to an atomic counter and return the result. + * + * Atomically adds the 64-bit value (inc) to the atomic counter (v) and + * returns the value of v after the addition. + * + * @param v + * A pointer to the atomic counter. + * @param inc + * The value to be added to the counter. + * @return + * The value of v after the addition. 
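+ *
+ * Illustrative 64-bit statistics update (a sketch; 'rx_bytes' is a
+ * hypothetical counter and 'len' a hypothetical packet length):
+ *
+ * @code
+ * static rte_atomic64_t rx_bytes = RTE_ATOMIC64_INIT(0);
+ *
+ * int64_t total = rte_atomic64_add_return(&rx_bytes, len);
+ * @endcode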
+ */ +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc); + +#ifdef RTE_FORCE_INTRINSICS +static inline int64_t +rte_atomic64_add_return(rte_atomic64_t *v, int64_t inc) +{ + return __sync_add_and_fetch(&v->cnt, inc); +} +#endif + +/** + * Subtract a 64-bit value from an atomic counter and return the result. + * + * Atomically subtracts the 64-bit value (dec) from the atomic counter (v) + * and returns the value of v after the subtraction. + * + * @param v + * A pointer to the atomic counter. + * @param dec + * The value to be subtracted from the counter. + * @return + * The value of v after the subtraction. + */ +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec); + +#ifdef RTE_FORCE_INTRINSICS +static inline int64_t +rte_atomic64_sub_return(rte_atomic64_t *v, int64_t dec) +{ + return __sync_sub_and_fetch(&v->cnt, dec); +} +#endif + +/** + * Atomically increment a 64-bit counter by one and test. + * + * Atomically increments the atomic counter (v) by one and returns + * true if the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after the addition is 0; false otherwise. + */ +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic64_inc_and_test(rte_atomic64_t *v) +{ + return rte_atomic64_add_return(v, 1) == 0; +} +#endif + +/** + * Atomically decrement a 64-bit counter by one and test. + * + * Atomically decrements the atomic counter (v) by one and returns true if + * the result is 0, or false in all other cases. + * + * @param v + * A pointer to the atomic counter. + * @return + * True if the result after subtraction is 0; false otherwise. + */ +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic64_dec_and_test(rte_atomic64_t *v) +{ + return rte_atomic64_sub_return(v, 1) == 0; +} +#endif + +/** + * Atomically test and set a 64-bit atomic counter. + * + * If the counter value is already set, return 0 (failed). Otherwise, set + * the counter value to 1 and return 1 (success). + * + * @param v + * A pointer to the atomic counter. + * @return + * 0 if failed; else 1, success. + */ +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) +{ + return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); +} +#endif + +/** + * Atomically set a 64-bit counter to 0. + * + * @param v + * A pointer to the atomic counter. + */ +static inline void rte_atomic64_clear(rte_atomic64_t *v); + +#ifdef RTE_FORCE_INTRINSICS +static inline void rte_atomic64_clear(rte_atomic64_t *v) +{ + rte_atomic64_set(v, 0); +} +#endif + +#endif /* _RTE_ATOMIC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_byteorder.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_byteorder.h new file mode 100644 index 00000000..7d9a1463 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_byteorder.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_BYTEORDER_H_ +#define _RTE_BYTEORDER_H_ + +/** + * @file + * + * Byte Swap Operations + * + * This file defines a generic API for byte swap operations. Part of + * the implementation is architecture-specific. 
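+ *
+ * Illustrative conversion sketch (the variable names are hypothetical;
+ * the rte_cpu_to_be_16()/rte_be_to_cpu_16() helpers are declared below):
+ *
+ * @code
+ * rte_be16_t wire_port = rte_cpu_to_be_16(8080);     // host to network order
+ * uint16_t host_port = rte_be_to_cpu_16(wire_port);  // back to host order
+ * @endcode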
+ */ + +#include +#ifdef RTE_EXEC_ENV_BSDAPP +#include +#else +#include +#endif + +#include +#include + +/* + * Compile-time endianness detection + */ +#define RTE_BIG_ENDIAN 1 +#define RTE_LITTLE_ENDIAN 2 +#if defined __BYTE_ORDER__ +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define RTE_BYTE_ORDER RTE_BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define RTE_BYTE_ORDER RTE_LITTLE_ENDIAN +#endif /* __BYTE_ORDER__ */ +#elif defined __BYTE_ORDER +#if __BYTE_ORDER == __BIG_ENDIAN +#define RTE_BYTE_ORDER RTE_BIG_ENDIAN +#elif __BYTE_ORDER == __LITTLE_ENDIAN +#define RTE_BYTE_ORDER RTE_LITTLE_ENDIAN +#endif /* __BYTE_ORDER */ +#elif defined __BIG_ENDIAN__ +#define RTE_BYTE_ORDER RTE_BIG_ENDIAN +#elif defined __LITTLE_ENDIAN__ +#define RTE_BYTE_ORDER RTE_LITTLE_ENDIAN +#endif +#if !defined(RTE_BYTE_ORDER) +#error Unknown endianness. +#endif + +#define RTE_STATIC_BSWAP16(v) \ + ((((uint16_t)(v) & UINT16_C(0x00ff)) << 8) | \ + (((uint16_t)(v) & UINT16_C(0xff00)) >> 8)) + +#define RTE_STATIC_BSWAP32(v) \ + ((((uint32_t)(v) & UINT32_C(0x000000ff)) << 24) | \ + (((uint32_t)(v) & UINT32_C(0x0000ff00)) << 8) | \ + (((uint32_t)(v) & UINT32_C(0x00ff0000)) >> 8) | \ + (((uint32_t)(v) & UINT32_C(0xff000000)) >> 24)) + +#define RTE_STATIC_BSWAP64(v) \ + ((((uint64_t)(v) & UINT64_C(0x00000000000000ff)) << 56) | \ + (((uint64_t)(v) & UINT64_C(0x000000000000ff00)) << 40) | \ + (((uint64_t)(v) & UINT64_C(0x0000000000ff0000)) << 24) | \ + (((uint64_t)(v) & UINT64_C(0x00000000ff000000)) << 8) | \ + (((uint64_t)(v) & UINT64_C(0x000000ff00000000)) >> 8) | \ + (((uint64_t)(v) & UINT64_C(0x0000ff0000000000)) >> 24) | \ + (((uint64_t)(v) & UINT64_C(0x00ff000000000000)) >> 40) | \ + (((uint64_t)(v) & UINT64_C(0xff00000000000000)) >> 56)) + +/* + * These macros are functionally similar to rte_cpu_to_(be|le)(16|32|64)(), + * they take values in host CPU order and return them converted to the + * intended endianness. + * + * They resolve at compilation time to integer constants which can safely be + * used with static initializers, since those cannot involve function calls. + * + * On the other hand, they are not as optimized as their rte_cpu_to_*() + * counterparts, therefore applications should refrain from using them on + * variable values, particularly inside performance-sensitive code. + */ +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN +#define RTE_BE16(v) (rte_be16_t)(v) +#define RTE_BE32(v) (rte_be32_t)(v) +#define RTE_BE64(v) (rte_be64_t)(v) +#define RTE_LE16(v) (rte_le16_t)(RTE_STATIC_BSWAP16(v)) +#define RTE_LE32(v) (rte_le32_t)(RTE_STATIC_BSWAP32(v)) +#define RTE_LE64(v) (rte_le64_t)(RTE_STATIC_BSWAP64(v)) +#elif RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +#define RTE_BE16(v) (rte_be16_t)(RTE_STATIC_BSWAP16(v)) +#define RTE_BE32(v) (rte_be32_t)(RTE_STATIC_BSWAP32(v)) +#define RTE_BE64(v) (rte_be64_t)(RTE_STATIC_BSWAP64(v)) +#define RTE_LE16(v) (rte_be16_t)(v) +#define RTE_LE32(v) (rte_be32_t)(v) +#define RTE_LE64(v) (rte_be64_t)(v) +#else +#error Unsupported endianness. +#endif + +/* + * The following types should be used when handling values according to a + * specific byte ordering, which may differ from that of the host CPU. + * + * Libraries, public APIs and applications are encouraged to use them for + * documentation purposes. + */ +typedef uint16_t rte_be16_t; /**< 16-bit big-endian value. */ +typedef uint32_t rte_be32_t; /**< 32-bit big-endian value. */ +typedef uint64_t rte_be64_t; /**< 64-bit big-endian value. */ +typedef uint16_t rte_le16_t; /**< 16-bit little-endian value. 
*/ +typedef uint32_t rte_le32_t; /**< 32-bit little-endian value. */ +typedef uint64_t rte_le64_t; /**< 64-bit little-endian value. */ + +/* + * An internal function to swap bytes in a 16-bit value. + * + * It is used by rte_bswap16() when the value is constant. Do not use + * this function directly; rte_bswap16() is preferred. + */ +static inline uint16_t +rte_constant_bswap16(uint16_t x) +{ + return (uint16_t)RTE_STATIC_BSWAP16(x); +} + +/* + * An internal function to swap bytes in a 32-bit value. + * + * It is used by rte_bswap32() when the value is constant. Do not use + * this function directly; rte_bswap32() is preferred. + */ +static inline uint32_t +rte_constant_bswap32(uint32_t x) +{ + return (uint32_t)RTE_STATIC_BSWAP32(x); +} + +/* + * An internal function to swap bytes of a 64-bit value. + * + * It is used by rte_bswap64() when the value is constant. Do not use + * this function directly; rte_bswap64() is preferred. + */ +static inline uint64_t +rte_constant_bswap64(uint64_t x) +{ + return (uint64_t)RTE_STATIC_BSWAP64(x); +} + + +#ifdef __DOXYGEN__ + +/** + * Swap bytes in a 16-bit value. + */ +static uint16_t rte_bswap16(uint16_t _x); + +/** + * Swap bytes in a 32-bit value. + */ +static uint32_t rte_bswap32(uint32_t x); + +/** + * Swap bytes in a 64-bit value. + */ +static uint64_t rte_bswap64(uint64_t x); + +/** + * Convert a 16-bit value from CPU order to little endian. + */ +static rte_le16_t rte_cpu_to_le_16(uint16_t x); + +/** + * Convert a 32-bit value from CPU order to little endian. + */ +static rte_le32_t rte_cpu_to_le_32(uint32_t x); + +/** + * Convert a 64-bit value from CPU order to little endian. + */ +static rte_le64_t rte_cpu_to_le_64(uint64_t x); + + +/** + * Convert a 16-bit value from CPU order to big endian. + */ +static rte_be16_t rte_cpu_to_be_16(uint16_t x); + +/** + * Convert a 32-bit value from CPU order to big endian. + */ +static rte_be32_t rte_cpu_to_be_32(uint32_t x); + +/** + * Convert a 64-bit value from CPU order to big endian. + */ +static rte_be64_t rte_cpu_to_be_64(uint64_t x); + + +/** + * Convert a 16-bit value from little endian to CPU order. + */ +static uint16_t rte_le_to_cpu_16(rte_le16_t x); + +/** + * Convert a 32-bit value from little endian to CPU order. + */ +static uint32_t rte_le_to_cpu_32(rte_le32_t x); + +/** + * Convert a 64-bit value from little endian to CPU order. + */ +static uint64_t rte_le_to_cpu_64(rte_le64_t x); + + +/** + * Convert a 16-bit value from big endian to CPU order. + */ +static uint16_t rte_be_to_cpu_16(rte_be16_t x); + +/** + * Convert a 32-bit value from big endian to CPU order. + */ +static uint32_t rte_be_to_cpu_32(rte_be32_t x); + +/** + * Convert a 64-bit value from big endian to CPU order. 
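+ *
+ * Illustrative sketch (the timestamp variables are hypothetical):
+ *
+ * @code
+ * rte_be64_t wire_ts = RTE_BE64(0x0102030405060708ULL);  // static initializer
+ * uint64_t host_ts = rte_be_to_cpu_64(wire_ts);          // convert at run time
+ * @endcode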
+ */ +static uint64_t rte_be_to_cpu_64(rte_be64_t x); + +#endif /* __DOXYGEN__ */ + +#ifdef RTE_FORCE_INTRINSICS +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +#define rte_bswap16(x) __builtin_bswap16(x) +#endif + +#define rte_bswap32(x) __builtin_bswap32(x) + +#define rte_bswap64(x) __builtin_bswap64(x) + +#endif + +#endif /* _RTE_BYTEORDER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_cpuflags.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_cpuflags.h new file mode 100644 index 00000000..156ea002 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_cpuflags.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_CPUFLAGS_H_ +#define _RTE_CPUFLAGS_H_ + +/** + * @file + * Architecture specific API to determine available CPU features at runtime. + */ + +#include "rte_common.h" +#include + +/** + * Enumeration of all CPU features supported + */ +__extension__ +enum rte_cpu_flag_t; + +/** + * Get name of CPU flag + * + * @param feature + * CPU flag ID + * @return + * flag name + * NULL if flag ID is invalid + */ +__extension__ +const char * +rte_cpu_get_flag_name(enum rte_cpu_flag_t feature); + +/** + * Function for checking a CPU flag availability + * + * @param feature + * CPU flag to query CPU for + * @return + * 1 if flag is available + * 0 if flag is not available + * -ENOENT if flag is invalid + */ +__extension__ +int +rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature); + +/** + * This function checks that the currently used CPU supports the CPU features + * that were specified at compile time. It is called automatically within the + * EAL, so does not need to be used by applications. + */ +__rte_deprecated +void +rte_cpu_check_supported(void); + +/** + * This function checks that the currently used CPU supports the CPU features + * that were specified at compile time. It is called automatically within the + * EAL, so does not need to be used by applications. This version returns a + * result so that decisions may be made (for instance, graceful shutdowns). + */ +int +rte_cpu_is_supported(void); + +/** + * This function attempts to retrieve a value from the auxiliary vector. + * If it is unsuccessful, the result will be 0, and errno will be set. + * + * @return A value from the auxiliary vector. When the value is 0, check + * errno to determine if an error occurred. + */ +unsigned long +rte_cpu_getauxval(unsigned long type); + +/** + * This function retrieves a value from the auxiliary vector, and compares it + * as a string against the value retrieved. + * + * @return The result of calling strcmp() against the value retrieved from + * the auxiliary vector. When the value is 0 (meaning a match is found), + * check errno to determine if an error occurred. + */ +int +rte_cpu_strcmp_auxval(unsigned long type, const char *str); + +#endif /* _RTE_CPUFLAGS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_cycles.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_cycles.h new file mode 100644 index 00000000..0ff1af50 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_cycles.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2013 6WIND S.A. + */ + +#ifndef _RTE_CYCLES_H_ +#define _RTE_CYCLES_H_ + +/** + * @file + * + * Simple Time Reference Functions (Cycles and HPET). 
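+ *
+ * Illustrative busy-wait timing sketch built on the API declared below
+ * (a one-second delay; variable names are examples, and rte_pause() is
+ * declared in rte_pause.h):
+ *
+ * @code
+ * uint64_t hz = rte_get_timer_hz();
+ * uint64_t deadline = rte_get_timer_cycles() + hz;   // roughly 1 second from now
+ *
+ * while (rte_get_timer_cycles() < deadline)
+ *         rte_pause();
+ * @endcode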
+ */ + +#include +#include +#include + +#define MS_PER_S 1000 +#define US_PER_S 1000000 +#define NS_PER_S 1000000000 + +enum timer_source { + EAL_TIMER_TSC = 0, + EAL_TIMER_HPET +}; +extern enum timer_source eal_timer_source; + +/** + * Get the measured frequency of the RDTSC counter + * + * @return + * The TSC frequency for this lcore + */ +uint64_t +rte_get_tsc_hz(void); + +/** + * Return the number of TSC cycles since boot + * + * @return + * the number of cycles + */ +static inline uint64_t +rte_get_tsc_cycles(void); + +#ifdef RTE_LIBEAL_USE_HPET +/** + * Return the number of HPET cycles since boot + * + * This counter is global for all execution units. The number of + * cycles in one second can be retrieved using rte_get_hpet_hz(). + * + * @return + * the number of cycles + */ +uint64_t +rte_get_hpet_cycles(void); + +/** + * Get the number of HPET cycles in one second. + * + * @return + * The number of cycles in one second. + */ +uint64_t +rte_get_hpet_hz(void); + +/** + * Initialise the HPET for use. This must be called before the rte_get_hpet_hz + * and rte_get_hpet_cycles APIs are called. If this function does not succeed, + * then the HPET functions are unavailable and should not be called. + * + * @param make_default + * If set, the hpet timer becomes the default timer whose values are + * returned by the rte_get_timer_hz/cycles API calls + * + * @return + * 0 on success, + * -1 on error, and the make_default parameter is ignored. + */ +int rte_eal_hpet_init(int make_default); + +#endif + +/** + * Get the number of cycles since boot from the default timer. + * + * @return + * The number of cycles + */ +static inline uint64_t +rte_get_timer_cycles(void) +{ +#ifdef RTE_LIBEAL_USE_HPET + switch(eal_timer_source) { + case EAL_TIMER_TSC: +#endif + return rte_get_tsc_cycles(); +#ifdef RTE_LIBEAL_USE_HPET + case EAL_TIMER_HPET: + return rte_get_hpet_cycles(); + default: rte_panic("Invalid timer source specified\n"); + } +#endif +} + +/** + * Get the number of cycles in one second for the default timer. + * + * @return + * The number of cycles in one second. + */ +static inline uint64_t +rte_get_timer_hz(void) +{ +#ifdef RTE_LIBEAL_USE_HPET + switch(eal_timer_source) { + case EAL_TIMER_TSC: +#endif + return rte_get_tsc_hz(); +#ifdef RTE_LIBEAL_USE_HPET + case EAL_TIMER_HPET: + return rte_get_hpet_hz(); + default: rte_panic("Invalid timer source specified\n"); + } +#endif +} +/** + * Wait at least us microseconds. + * This function can be replaced with user-defined function. + * @see rte_delay_us_callback_register + * + * @param us + * The number of microseconds to wait. + */ +extern void +(*rte_delay_us)(unsigned int us); + +/** + * Wait at least ms milliseconds. + * + * @param ms + * The number of milliseconds to wait. + */ +static inline void +rte_delay_ms(unsigned ms) +{ + rte_delay_us(ms * 1000); +} + +/** + * Blocking delay function. + * + * @param us + * Number of microseconds to wait. + */ +void rte_delay_us_block(unsigned int us); + +/** + * Replace rte_delay_us with user defined function. + * + * @param userfunc + * User function which replaces rte_delay_us. rte_delay_us_block restores + * buildin block delay function. 
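+ *
+ * Illustrative registration sketch (my_delay_us() is a hypothetical
+ * user-supplied function):
+ *
+ * @code
+ * static void my_delay_us(unsigned int us)
+ * {
+ *         usleep(us);   // e.g. sleep instead of busy-waiting
+ * }
+ *
+ * rte_delay_us_callback_register(my_delay_us);
+ * @endcode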
+ */ +void rte_delay_us_callback_register(void(*userfunc)(unsigned int)); + +#endif /* _RTE_CYCLES_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_io.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_io.h new file mode 100644 index 00000000..da457f7f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_io.h @@ -0,0 +1,350 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_IO_H_ +#define _RTE_IO_H_ + +/** + * @file + * I/O device memory operations + * + * This file defines the generic API for I/O device memory read/write operations + */ + +#include +#include +#include + +#ifdef __DOXYGEN__ + +/** + * Read a 8-bit value from I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint8_t +rte_read8_relaxed(const volatile void *addr); + +/** + * Read a 16-bit value from I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint16_t +rte_read16_relaxed(const volatile void *addr); + +/** + * Read a 32-bit value from I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint32_t +rte_read32_relaxed(const volatile void *addr); + +/** + * Read a 64-bit value from I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint64_t +rte_read64_relaxed(const volatile void *addr); + +/** + * Write a 8-bit value to I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ + +static inline void +rte_write8_relaxed(uint8_t value, volatile void *addr); + +/** + * Write a 16-bit value to I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write16_relaxed(uint16_t value, volatile void *addr); + +/** + * Write a 32-bit value to I/O device memory address *addr*. 
+ * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write32_relaxed(uint32_t value, volatile void *addr); + +/** + * Write a 64-bit value to I/O device memory address *addr*. + * + * The relaxed version does not have additional I/O memory barrier, useful in + * accessing the device registers of integrated controllers which implicitly + * strongly ordered with respect to memory access. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write64_relaxed(uint64_t value, volatile void *addr); + +/** + * Read a 8-bit value from I/O device memory address *addr*. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint8_t +rte_read8(const volatile void *addr); + +/** + * Read a 16-bit value from I/O device memory address *addr*. + * + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint16_t +rte_read16(const volatile void *addr); + +/** + * Read a 32-bit value from I/O device memory address *addr*. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint32_t +rte_read32(const volatile void *addr); + +/** + * Read a 64-bit value from I/O device memory address *addr*. + * + * @param addr + * I/O memory address to read the value from + * @return + * read value + */ +static inline uint64_t +rte_read64(const volatile void *addr); + +/** + * Write a 8-bit value to I/O device memory address *addr*. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ + +static inline void +rte_write8(uint8_t value, volatile void *addr); + +/** + * Write a 16-bit value to I/O device memory address *addr*. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write16(uint16_t value, volatile void *addr); + +/** + * Write a 32-bit value to I/O device memory address *addr*. + * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write32(uint32_t value, volatile void *addr); + +/** + * Write a 64-bit value to I/O device memory address *addr*. 
+ * + * @param value + * Value to write + * @param addr + * I/O memory address to write the value to + */ +static inline void +rte_write64(uint64_t value, volatile void *addr); + +#endif /* __DOXYGEN__ */ + +#ifndef RTE_OVERRIDE_IO_H + +static __rte_always_inline uint8_t +rte_read8_relaxed(const volatile void *addr) +{ + return *(const volatile uint8_t *)addr; +} + +static __rte_always_inline uint16_t +rte_read16_relaxed(const volatile void *addr) +{ + return *(const volatile uint16_t *)addr; +} + +static __rte_always_inline uint32_t +rte_read32_relaxed(const volatile void *addr) +{ + return *(const volatile uint32_t *)addr; +} + +static __rte_always_inline uint64_t +rte_read64_relaxed(const volatile void *addr) +{ + return *(const volatile uint64_t *)addr; +} + +static __rte_always_inline void +rte_write8_relaxed(uint8_t value, volatile void *addr) +{ + *(volatile uint8_t *)addr = value; +} + +static __rte_always_inline void +rte_write16_relaxed(uint16_t value, volatile void *addr) +{ + *(volatile uint16_t *)addr = value; +} + +static __rte_always_inline void +rte_write32_relaxed(uint32_t value, volatile void *addr) +{ + *(volatile uint32_t *)addr = value; +} + +static __rte_always_inline void +rte_write64_relaxed(uint64_t value, volatile void *addr) +{ + *(volatile uint64_t *)addr = value; +} + +static __rte_always_inline uint8_t +rte_read8(const volatile void *addr) +{ + uint8_t val; + val = rte_read8_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint16_t +rte_read16(const volatile void *addr) +{ + uint16_t val; + val = rte_read16_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint32_t +rte_read32(const volatile void *addr) +{ + uint32_t val; + val = rte_read32_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline uint64_t +rte_read64(const volatile void *addr) +{ + uint64_t val; + val = rte_read64_relaxed(addr); + rte_io_rmb(); + return val; +} + +static __rte_always_inline void +rte_write8(uint8_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write8_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write16(uint16_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write16_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write32(uint32_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write32_relaxed(value, addr); +} + +static __rte_always_inline void +rte_write64(uint64_t value, volatile void *addr) +{ + rte_io_wmb(); + rte_write64_relaxed(value, addr); +} + +#endif /* RTE_OVERRIDE_IO_H */ + +#endif /* _RTE_IO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_memcpy.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_memcpy.h new file mode 100644 index 00000000..701e550c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_memcpy.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MEMCPY_H_ +#define _RTE_MEMCPY_H_ + +/** + * @file + * + * Functions for vectorised implementation of memcpy(). + */ + +/** + * Copy 16 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src); + +/** + * Copy 32 bytes from one location to another using optimised + * instructions. The locations should not overlap. 
+ * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src); + +#ifdef __DOXYGEN__ + +/** + * Copy 48 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src); + +#endif /* __DOXYGEN__ */ + +/** + * Copy 64 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src); + +/** + * Copy 128 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src); + +/** + * Copy 256 bytes from one location to another using optimised + * instructions. The locations should not overlap. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + */ +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src); + +#ifdef __DOXYGEN__ + +/** + * Copy bytes from one location to another. The locations must not overlap. + * + * @note This is implemented as a macro, so it's address should not be taken + * and care is needed as parameter expressions may be evaluated multiple times. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + * @param n + * Number of bytes to copy. + * @return + * Pointer to the destination data. + */ +static void * +rte_memcpy(void *dst, const void *src, size_t n); + +#endif /* __DOXYGEN__ */ + +#endif /* _RTE_MEMCPY_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_pause.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_pause.h new file mode 100644 index 00000000..52bd4db5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_pause.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _RTE_PAUSE_H_ +#define _RTE_PAUSE_H_ + +/** + * @file + * + * CPU pause operation. + * + */ + +/** + * Pause CPU execution for a short while + * + * This call is intended for tight loops which poll a shared resource or wait + * for an event. A short pause within the loop may reduce the power consumption. + */ +static inline void rte_pause(void); + +#endif /* _RTE_PAUSE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_prefetch.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_prefetch.h new file mode 100644 index 00000000..6e47bdfb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_prefetch.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef _RTE_PREFETCH_H_ +#define _RTE_PREFETCH_H_ + +/** + * @file + * + * Prefetch operations. + * + * This file defines an API for prefetch macros / inline-functions, + * which are architecture-dependent. 
Prefetching occurs when a + * processor requests an instruction or data from memory to cache + * before it is actually needed, potentially speeding up the execution of the + * program. + */ + +/** + * Prefetch a cache line into all cache levels. + * @param p + * Address to prefetch + */ +static inline void rte_prefetch0(const volatile void *p); + +/** + * Prefetch a cache line into all cache levels except the 0th cache level. + * @param p + * Address to prefetch + */ +static inline void rte_prefetch1(const volatile void *p); + +/** + * Prefetch a cache line into all cache levels except the 0th and 1th cache + * levels. + * @param p + * Address to prefetch + */ +static inline void rte_prefetch2(const volatile void *p); + +/** + * Prefetch a cache line into all cache levels (non-temporal/transient version) + * + * The non-temporal prefetch is intended as a prefetch hint that processor will + * use the prefetched data only once or short period, unlike the + * rte_prefetch0() function which imply that prefetched data to use repeatedly. + * + * @param p + * Address to prefetch + */ +static inline void rte_prefetch_non_temporal(const volatile void *p); + +#endif /* _RTE_PREFETCH_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_rwlock.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_rwlock.h new file mode 100644 index 00000000..5751a0e6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_rwlock.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_RWLOCK_H_ +#define _RTE_RWLOCK_H_ + +/** + * @file + * + * RTE Read-Write Locks + * + * This file defines an API for read-write locks. The lock is used to + * protect data that allows multiple readers in parallel, but only + * one writer. All readers are blocked until the writer is finished + * writing. + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/** + * The rte_rwlock_t type. + * + * cnt is -1 when write lock is held, and > 0 when read locks are held. + */ +typedef struct { + volatile int32_t cnt; /**< -1 when W lock held, > 0 when R locks held. */ +} rte_rwlock_t; + +/** + * A static rwlock initializer. + */ +#define RTE_RWLOCK_INITIALIZER { 0 } + +/** + * Initialize the rwlock to an unlocked state. + * + * @param rwl + * A pointer to the rwlock structure. + */ +static inline void +rte_rwlock_init(rte_rwlock_t *rwl) +{ + rwl->cnt = 0; +} + +/** + * Take a read lock. Loop until the lock is held. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_read_lock(rte_rwlock_t *rwl) +{ + int32_t x; + int success = 0; + + while (success == 0) { + x = rwl->cnt; + /* write lock is held */ + if (x < 0) { + rte_pause(); + continue; + } + success = rte_atomic32_cmpset((volatile uint32_t *)&rwl->cnt, + (uint32_t)x, (uint32_t)(x + 1)); + } +} + +/** + * Release a read lock. + * + * @param rwl + * A pointer to the rwlock structure. + */ +static inline void +rte_rwlock_read_unlock(rte_rwlock_t *rwl) +{ + rte_atomic32_dec((rte_atomic32_t *)(intptr_t)&rwl->cnt); +} + +/** + * Take a write lock. Loop until the lock is held. + * + * @param rwl + * A pointer to a rwlock structure. 
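+ *
+ * Illustrative writer-side sketch (the lock, the protected table and the
+ * update_table() helper are hypothetical):
+ *
+ * @code
+ * static rte_rwlock_t tbl_lock = RTE_RWLOCK_INITIALIZER;
+ *
+ * rte_rwlock_write_lock(&tbl_lock);
+ * update_table();                    // hypothetical critical section
+ * rte_rwlock_write_unlock(&tbl_lock);
+ * @endcode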
+ */ +static inline void +rte_rwlock_write_lock(rte_rwlock_t *rwl) +{ + int32_t x; + int success = 0; + + while (success == 0) { + x = rwl->cnt; + /* a lock is held */ + if (x != 0) { + rte_pause(); + continue; + } + success = rte_atomic32_cmpset((volatile uint32_t *)&rwl->cnt, + 0, (uint32_t)-1); + } +} + +/** + * Release a write lock. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_write_unlock(rte_rwlock_t *rwl) +{ + rte_atomic32_inc((rte_atomic32_t *)(intptr_t)&rwl->cnt); +} + +/** + * Try to execute critical section in a hardware memory transaction, if it + * fails or not available take a read lock + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_read_lock_tm(rte_rwlock_t *rwl); + +/** + * Commit hardware memory transaction or release the read lock if the lock is used as a fall-back + * + * @param rwl + * A pointer to the rwlock structure. + */ +static inline void +rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl); + +/** + * Try to execute critical section in a hardware memory transaction, if it + * fails or not available take a write lock + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_write_lock_tm(rte_rwlock_t *rwl); + +/** + * Commit hardware memory transaction or release the write lock if the lock is used as a fall-back + * + * @param rwl + * A pointer to a rwlock structure. + */ +static inline void +rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RWLOCK_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_spinlock.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_spinlock.h new file mode 100644 index 00000000..c4c3fc31 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_spinlock.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_SPINLOCK_H_ +#define _RTE_SPINLOCK_H_ + +/** + * @file + * + * RTE Spinlocks + * + * This file defines an API for spinlocks, which are implemented + * in an architecture-specific way. This kind of lock simply waits in + * a loop repeatedly checking until the lock becomes available. + * + * All locks must be initialised before use, and only initialised once. + * + */ + +#include +#ifdef RTE_FORCE_INTRINSICS +#include +#endif +#include + +/** + * The rte_spinlock_t type. + */ +typedef struct { + volatile int locked; /**< lock status 0 = unlocked, 1 = locked */ +} rte_spinlock_t; + +/** + * A static spinlock initializer. + */ +#define RTE_SPINLOCK_INITIALIZER { 0 } + +/** + * Initialize the spinlock to an unlocked state. + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_init(rte_spinlock_t *sl) +{ + sl->locked = 0; +} + +/** + * Take the spinlock.
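+ *
+ * A minimal usage sketch (illustrative only, assuming a statically
+ * initialized lock):
+ *
+ *   static rte_spinlock_t lock = RTE_SPINLOCK_INITIALIZER;
+ *
+ *   rte_spinlock_lock(&lock);
+ *   ... touch the state protected by the lock ...
+ *   rte_spinlock_unlock(&lock);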
+ * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_lock(rte_spinlock_t *sl); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_spinlock_lock(rte_spinlock_t *sl) +{ + while (__sync_lock_test_and_set(&sl->locked, 1)) + while(sl->locked) + rte_pause(); +} +#endif + +/** + * Release the spinlock. + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_unlock (rte_spinlock_t *sl); + +#ifdef RTE_FORCE_INTRINSICS +static inline void +rte_spinlock_unlock (rte_spinlock_t *sl) +{ + __sync_lock_release(&sl->locked); +} +#endif + +/** + * Try to take the lock. + * + * @param sl + * A pointer to the spinlock. + * @return + * 1 if the lock is successfully taken; 0 otherwise. + */ +static inline int +rte_spinlock_trylock (rte_spinlock_t *sl); + +#ifdef RTE_FORCE_INTRINSICS +static inline int +rte_spinlock_trylock (rte_spinlock_t *sl) +{ + return __sync_lock_test_and_set(&sl->locked,1) == 0; +} +#endif + +/** + * Test if the lock is taken. + * + * @param sl + * A pointer to the spinlock. + * @return + * 1 if the lock is currently taken; 0 otherwise. + */ +static inline int rte_spinlock_is_locked (rte_spinlock_t *sl) +{ + return sl->locked; +} + +/** + * Test if hardware transactional memory (lock elision) is supported + * + * @return + * 1 if the hardware transactional memory is supported; 0 otherwise. + */ +static inline int rte_tm_supported(void); + +/** + * Try to execute critical section in a hardware memory transaction, + * if it fails or not available take the spinlock. + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_lock_tm(rte_spinlock_t *sl); + +/** + * Commit hardware memory transaction or release the spinlock if + * the spinlock is used as a fall-back + * + * @param sl + * A pointer to the spinlock. + */ +static inline void +rte_spinlock_unlock_tm(rte_spinlock_t *sl); + +/** + * Try to execute critical section in a hardware memory transaction, + * if it fails or not available try to take the lock. + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param sl + * A pointer to the spinlock. + * @return + * 1 if the hardware memory transaction is successfully started + * or lock is successfully taken; 0 otherwise. + */ +static inline int +rte_spinlock_trylock_tm(rte_spinlock_t *sl); + +/** + * The rte_spinlock_recursive_t type. + */ +typedef struct { + rte_spinlock_t sl; /**< the actual spinlock */ + volatile int user; /**< core id using lock, -1 for unused */ + volatile int count; /**< count of time this lock has been called */ +} rte_spinlock_recursive_t; + +/** + * A static recursive spinlock initializer. + */ +#define RTE_SPINLOCK_RECURSIVE_INITIALIZER {RTE_SPINLOCK_INITIALIZER, -1, 0} + +/** + * Initialize the recursive spinlock to an unlocked state. + * + * @param slr + * A pointer to the recursive spinlock. 
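+ *
+ * Unlike a plain spinlock, the same lcore may take this lock several times
+ * and must release it the same number of times, as in this illustrative
+ * sketch:
+ *
+ *   static rte_spinlock_recursive_t rlock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
+ *
+ *   rte_spinlock_recursive_lock(&rlock);
+ *   rte_spinlock_recursive_lock(&rlock);
+ *   rte_spinlock_recursive_unlock(&rlock);
+ *   rte_spinlock_recursive_unlock(&rlock);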
+ */ +static inline void rte_spinlock_recursive_init(rte_spinlock_recursive_t *slr) +{ + rte_spinlock_init(&slr->sl); + slr->user = -1; + slr->count = 0; +} + +/** + * Take the recursive spinlock. + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_lock(rte_spinlock_recursive_t *slr) +{ + int id = rte_gettid(); + + if (slr->user != id) { + rte_spinlock_lock(&slr->sl); + slr->user = id; + } + slr->count++; +} +/** + * Release the recursive spinlock. + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_unlock(rte_spinlock_recursive_t *slr) +{ + if (--(slr->count) == 0) { + slr->user = -1; + rte_spinlock_unlock(&slr->sl); + } + +} + +/** + * Try to take the recursive lock. + * + * @param slr + * A pointer to the recursive spinlock. + * @return + * 1 if the lock is successfully taken; 0 otherwise. + */ +static inline int rte_spinlock_recursive_trylock(rte_spinlock_recursive_t *slr) +{ + int id = rte_gettid(); + + if (slr->user != id) { + if (rte_spinlock_trylock(&slr->sl) == 0) + return 0; + slr->user = id; + } + slr->count++; + return 1; +} + + +/** + * Try to execute critical section in a hardware memory transaction, + * if it fails or not available take the recursive spinlocks + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_lock_tm( + rte_spinlock_recursive_t *slr); + +/** + * Commit hardware memory transaction or release the recursive spinlock + * if the recursive spinlock is used as a fall-back + * + * @param slr + * A pointer to the recursive spinlock. + */ +static inline void rte_spinlock_recursive_unlock_tm( + rte_spinlock_recursive_t *slr); + +/** + * Try to execute critical section in a hardware memory transaction, + * if it fails or not available try to take the recursive lock + * + * NOTE: An attempt to perform a HW I/O operation inside a hardware memory + * transaction always aborts the transaction since the CPU is not able to + * roll-back should the transaction fail. Therefore, hardware transactional + * locks are not advised to be used around rte_eth_rx_burst() and + * rte_eth_tx_burst() calls. + * + * @param slr + * A pointer to the recursive spinlock. + * @return + * 1 if the hardware memory transaction is successfully started + * or lock is successfully taken; 0 otherwise. + */ +static inline int rte_spinlock_recursive_trylock_tm( + rte_spinlock_recursive_t *slr); + +#endif /* _RTE_SPINLOCK_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_vect.h b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_vect.h new file mode 100644 index 00000000..11c6475b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/generic/rte_vect.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + */ + +#ifndef _RTE_VECT_H_ +#define _RTE_VECT_H_ + +/** + * @file + * SIMD vector types + * + * This file defines types to use vector instructions with generic C code. + */ + +#include + +/* Unsigned vector types */ + +/** + * 64 bits vector size to use with unsigned 8 bits elements. 
+ * + * a = (rte_v64u8_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef uint8_t rte_v64u8_t __attribute__((vector_size(8), aligned(8))); + +/** + * 64 bits vector size to use with unsigned 16 bits elements. + * + * a = (rte_v64u16_t){ a0, a1, a2, a3 } + */ +typedef uint16_t rte_v64u16_t __attribute__((vector_size(8), aligned(8))); + +/** + * 64 bits vector size to use with unsigned 32 bits elements. + * + * a = (rte_v64u32_t){ a0, a1 } + */ +typedef uint32_t rte_v64u32_t __attribute__((vector_size(8), aligned(8))); + +/** + * 128 bits vector size to use with unsigned 8 bits elements. + * + * a = (rte_v128u8_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15 } + */ +typedef uint8_t rte_v128u8_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with unsigned 16 bits elements. + * + * a = (rte_v128u16_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef uint16_t rte_v128u16_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with unsigned 32 bits elements. + * + * a = (rte_v128u32_t){ a0, a1, a2, a3 } + */ +typedef uint32_t rte_v128u32_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with unsigned 64 bits elements. + * + * a = (rte_v128u64_t){ a0, a1 } + */ +typedef uint64_t rte_v128u64_t __attribute__((vector_size(16), aligned(16))); + +/** + * 256 bits vector size to use with unsigned 8 bits elements. + * + * a = (rte_v256u8_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15, + * a16, a17, a18, a19, a20, a21, a22, a23, + * a24, a25, a26, a27, a28, a29, a30, a31 } + */ +typedef uint8_t rte_v256u8_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with unsigned 16 bits elements. + * + * a = (rte_v256u16_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15 } + */ +typedef uint16_t rte_v256u16_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with unsigned 32 bits elements. + * + * a = (rte_v256u32_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef uint32_t rte_v256u32_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with unsigned 64 bits elements. + * + * a = (rte_v256u64_t){ a0, a1, a2, a3 } + */ +typedef uint64_t rte_v256u64_t __attribute__((vector_size(32), aligned(32))); + + +/* Signed vector types */ + +/** + * 64 bits vector size to use with 8 bits elements. + * + * a = (rte_v64s8_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef int8_t rte_v64s8_t __attribute__((vector_size(8), aligned(8))); + +/** + * 64 bits vector size to use with 16 bits elements. + * + * a = (rte_v64s16_t){ a0, a1, a2, a3 } + */ +typedef int16_t rte_v64s16_t __attribute__((vector_size(8), aligned(8))); + +/** + * 64 bits vector size to use with 32 bits elements. + * + * a = (rte_v64s32_t){ a0, a1 } + */ +typedef int32_t rte_v64s32_t __attribute__((vector_size(8), aligned(8))); + +/** + * 128 bits vector size to use with 8 bits elements. + * + * a = (rte_v128s8_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15 } + */ +typedef int8_t rte_v128s8_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with 16 bits elements.
+ * + * a = (rte_v128s16_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef int16_t rte_v128s16_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with 32 bits elements. + * + * a = (rte_v128s32_t){ a0, a1, a2, a3 } + */ +typedef int32_t rte_v128s32_t __attribute__((vector_size(16), aligned(16))); + +/** + * 128 bits vector size to use with 64 bits elements. + * + * a = (rte_v128s64_t){ a0, a1 } + */ +typedef int64_t rte_v128s64_t __attribute__((vector_size(16), aligned(16))); + +/** + * 256 bits vector size to use with 8 bits elements. + * + * a = (rte_v256s8_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15, + * a16, a17, a18, a19, a20, a21, a22, a23, + * a24, a25, a26, a27, a28, a29, a30, a31 } + */ +typedef int8_t rte_v256s8_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with 16 bits elements. + * + * a = (rte_v256s16_t){ a00, a01, a02, a03, a04, a05, a06, a07, + * a08, a09, a10, a11, a12, a13, a14, a15 } + */ +typedef int16_t rte_v256s16_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with 32 bits elements. + * + * a = (rte_v256s32_t){ a0, a1, a2, a3, a4, a5, a6, a7 } + */ +typedef int32_t rte_v256s32_t __attribute__((vector_size(32), aligned(32))); + +/** + * 256 bits vector size to use with 64 bits elements. + * + * a = (rte_v256s64_t){ a0, a1, a2, a3 } + */ +typedef int64_t rte_v256s64_t __attribute__((vector_size(32), aligned(32))); + +#endif /* _RTE_VECT_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_alarm.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_alarm.h new file mode 100644 index 00000000..7e4d0b24 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_alarm.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ALARM_H_ +#define _RTE_ALARM_H_ + +/** + * @file + * + * Alarm functions + * + * Simple alarm-clock functionality supplied by eal. + * Does not require hpet support. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * Signature of callback function called when an alarm goes off. + */ +typedef void (*rte_eal_alarm_callback)(void *arg); + +/** + * Function to set a callback to be triggered when us microseconds + * have expired. Accuracy of timing to the microsecond is not guaranteed. The + * alarm function will not be called *before* the requested time, but may + * be called a short period of time afterwards. + * The alarm handler will be called only once. There is no need to call + * "rte_eal_alarm_cancel" from within the callback function. + * + * @param us + * The time in microseconds before the callback is called + * @param cb + * The function to be called when the alarm expires + * @param cb_arg + * Pointer parameter to be passed to the callback function + * + * @return + * On success, zero. + * On failure, a negative error number + */ +int rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb, void *cb_arg); + +/** + * Function to cancel an alarm callback which has been registered before. If + * used outside an alarm callback, it waits for all callbacks to finish execution. + * + * @param cb_fn + * alarm callback + * @param cb_arg + * Pointer parameter to be passed to the callback function. To remove all + * copies of a given callback function, irrespective of parameter, (void *)-1 + * can be used here.
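+ *
+ * A short sketch (my_alarm_cb is a placeholder of type rte_eal_alarm_callback);
+ * the second call drops every pending instance of that callback:
+ *
+ *   rte_eal_alarm_set(1000, my_alarm_cb, NULL);
+ *   rte_eal_alarm_cancel(my_alarm_cb, (void *)-1);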
+ * + * @return + * - value greater than 0 and rte_errno not changed - returned value is + * the number of canceled alarm callback functions + * - value greater or equal 0 and rte_errno set to EINPROGRESS, at least one + * alarm could not be canceled because cancellation was requested from alarm + * callback context. Returned value is the number of successfully canceled + * alarm callbacks + * - 0 and rte_errno set to ENOENT - no alarm found + * - -1 and rte_errno set to EINVAL - invalid parameter (NULL callback) + */ +int rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg); + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_ALARM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_bitmap.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_bitmap.h new file mode 100644 index 00000000..d9facc64 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_bitmap.h @@ -0,0 +1,533 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_BITMAP_H__ +#define __INCLUDE_RTE_BITMAP_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Bitmap + * + * The bitmap component provides a mechanism to manage large arrays of bits + * through bit get/set/clear and bit array scan operations. + * + * The bitmap scan operation is optimized for 64-bit CPUs using 64/128 byte cache + * lines. The bitmap is hierarchically organized using two arrays (array1 and + * array2), with each bit in array1 being associated with a full cache line + * (512/1024 bits) of bitmap bits, which are stored in array2: the bit in array1 + * is set only when there is at least one bit set within its associated array2 + * bits, otherwise the bit in array1 is cleared. The read and write operations + * for array1 and array2 are always done in slabs of 64 bits. + * + * This bitmap is not thread safe. For lock free operation on a specific bitmap + * instance, a single writer thread performing bit set/clear operations is + * allowed, only the writer thread can do bitmap scan operations, while there + * can be several reader threads performing bit get operations in parallel with + * the writer thread. When the use of locking primitives is acceptable, the + * serialization of the bit set/clear and bitmap scan operations needs to be + * enforced by the caller, while the bit get operation does not require locking + * the bitmap. 
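+ *
+ * A minimal usage sketch (illustrative only; mem is assumed to be a cache
+ * line aligned buffer of at least the reported size, allocated by the
+ * caller):
+ *
+ *   uint32_t size = rte_bitmap_get_memory_footprint(4096);
+ *   struct rte_bitmap *bmp = rte_bitmap_init(4096, mem, size);
+ *   uint32_t pos;
+ *   uint64_t slab;
+ *
+ *   rte_bitmap_set(bmp, 73);
+ *   if (rte_bitmap_scan(bmp, &pos, &slab))
+ *           ... process the returned 64-bit slab ...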
+ * + ***/ + +#include +#include +#include +#include +#include +#include +#include + +#ifndef RTE_BITMAP_OPTIMIZATIONS +#define RTE_BITMAP_OPTIMIZATIONS 1 +#endif + +/* Slab */ +#define RTE_BITMAP_SLAB_BIT_SIZE 64 +#define RTE_BITMAP_SLAB_BIT_SIZE_LOG2 6 +#define RTE_BITMAP_SLAB_BIT_MASK (RTE_BITMAP_SLAB_BIT_SIZE - 1) + +/* Cache line (CL) */ +#define RTE_BITMAP_CL_BIT_SIZE (RTE_CACHE_LINE_SIZE * 8) +#define RTE_BITMAP_CL_BIT_SIZE_LOG2 (RTE_CACHE_LINE_SIZE_LOG2 + 3) +#define RTE_BITMAP_CL_BIT_MASK (RTE_BITMAP_CL_BIT_SIZE - 1) + +#define RTE_BITMAP_CL_SLAB_SIZE (RTE_BITMAP_CL_BIT_SIZE / RTE_BITMAP_SLAB_BIT_SIZE) +#define RTE_BITMAP_CL_SLAB_SIZE_LOG2 (RTE_BITMAP_CL_BIT_SIZE_LOG2 - RTE_BITMAP_SLAB_BIT_SIZE_LOG2) +#define RTE_BITMAP_CL_SLAB_MASK (RTE_BITMAP_CL_SLAB_SIZE - 1) + +/** Bitmap data structure */ +struct rte_bitmap { + /* Context for array1 and array2 */ + uint64_t *array1; /**< Bitmap array1 */ + uint64_t *array2; /**< Bitmap array2 */ + uint32_t array1_size; /**< Number of 64-bit slabs in array1 that are actually used */ + uint32_t array2_size; /**< Number of 64-bit slabs in array2 */ + + /* Context for the "scan next" operation */ + uint32_t index1; /**< Bitmap scan: Index of current array1 slab */ + uint32_t offset1; /**< Bitmap scan: Offset of current bit within current array1 slab */ + uint32_t index2; /**< Bitmap scan: Index of current array2 slab */ + uint32_t go2; /**< Bitmap scan: Go/stop condition for current array2 cache line */ + + /* Storage space for array1 and array2 */ + uint8_t memory[]; +}; + +static inline void +__rte_bitmap_index1_inc(struct rte_bitmap *bmp) +{ + bmp->index1 = (bmp->index1 + 1) & (bmp->array1_size - 1); +} + +static inline uint64_t +__rte_bitmap_mask1_get(struct rte_bitmap *bmp) +{ + return (~1lu) << bmp->offset1; +} + +static inline void +__rte_bitmap_index2_set(struct rte_bitmap *bmp) +{ + bmp->index2 = (((bmp->index1 << RTE_BITMAP_SLAB_BIT_SIZE_LOG2) + bmp->offset1) << RTE_BITMAP_CL_SLAB_SIZE_LOG2); +} + +#if RTE_BITMAP_OPTIMIZATIONS + +static inline int +rte_bsf64(uint64_t slab, uint32_t *pos) +{ + if (likely(slab == 0)) { + return 0; + } + + *pos = __builtin_ctzll(slab); + return 1; +} + +#else + +static inline int +rte_bsf64(uint64_t slab, uint32_t *pos) +{ + uint64_t mask; + uint32_t i; + + if (likely(slab == 0)) { + return 0; + } + + for (i = 0, mask = 1; i < RTE_BITMAP_SLAB_BIT_SIZE; i ++, mask <<= 1) { + if (unlikely(slab & mask)) { + *pos = i; + return 1; + } + } + + return 0; +} + +#endif + +static inline uint32_t +__rte_bitmap_get_memory_footprint(uint32_t n_bits, + uint32_t *array1_byte_offset, uint32_t *array1_slabs, + uint32_t *array2_byte_offset, uint32_t *array2_slabs) +{ + uint32_t n_slabs_context, n_slabs_array1, n_cache_lines_context_and_array1; + uint32_t n_cache_lines_array2; + uint32_t n_bytes_total; + + n_cache_lines_array2 = (n_bits + RTE_BITMAP_CL_BIT_SIZE - 1) / RTE_BITMAP_CL_BIT_SIZE; + n_slabs_array1 = (n_cache_lines_array2 + RTE_BITMAP_SLAB_BIT_SIZE - 1) / RTE_BITMAP_SLAB_BIT_SIZE; + n_slabs_array1 = rte_align32pow2(n_slabs_array1); + n_slabs_context = (sizeof(struct rte_bitmap) + (RTE_BITMAP_SLAB_BIT_SIZE / 8) - 1) / (RTE_BITMAP_SLAB_BIT_SIZE / 8); + n_cache_lines_context_and_array1 = (n_slabs_context + n_slabs_array1 + RTE_BITMAP_CL_SLAB_SIZE - 1) / RTE_BITMAP_CL_SLAB_SIZE; + n_bytes_total = (n_cache_lines_context_and_array1 + n_cache_lines_array2) * RTE_CACHE_LINE_SIZE; + + if (array1_byte_offset) { + *array1_byte_offset = n_slabs_context * (RTE_BITMAP_SLAB_BIT_SIZE / 8); + } + if (array1_slabs) { + *array1_slabs 
= n_slabs_array1; + } + if (array2_byte_offset) { + *array2_byte_offset = n_cache_lines_context_and_array1 * RTE_CACHE_LINE_SIZE; + } + if (array2_slabs) { + *array2_slabs = n_cache_lines_array2 * RTE_BITMAP_CL_SLAB_SIZE; + } + + return n_bytes_total; +} + +static inline void +__rte_bitmap_scan_init(struct rte_bitmap *bmp) +{ + bmp->index1 = bmp->array1_size - 1; + bmp->offset1 = RTE_BITMAP_SLAB_BIT_SIZE - 1; + __rte_bitmap_index2_set(bmp); + bmp->index2 += RTE_BITMAP_CL_SLAB_SIZE; + + bmp->go2 = 0; +} + +/** + * Bitmap memory footprint calculation + * + * @param n_bits + * Number of bits in the bitmap + * @return + * Bitmap memory footprint measured in bytes on success, 0 on error + */ +static inline uint32_t +rte_bitmap_get_memory_footprint(uint32_t n_bits) { + /* Check input arguments */ + if (n_bits == 0) { + return 0; + } + + return __rte_bitmap_get_memory_footprint(n_bits, NULL, NULL, NULL, NULL); +} + +/** + * Bitmap initialization + * + * @param n_bits + * Number of pre-allocated bits in array2. + * @param mem + * Base address of array1 and array2. + * @param mem_size + * Minimum expected size of bitmap. + * @return + * Handle to bitmap instance. + */ +static inline struct rte_bitmap * +rte_bitmap_init(uint32_t n_bits, uint8_t *mem, uint32_t mem_size) +{ + struct rte_bitmap *bmp; + uint32_t array1_byte_offset, array1_slabs, array2_byte_offset, array2_slabs; + uint32_t size; + + /* Check input arguments */ + if (n_bits == 0) { + return NULL; + } + + if ((mem == NULL) || (((uintptr_t) mem) & RTE_CACHE_LINE_MASK)) { + return NULL; + } + + size = __rte_bitmap_get_memory_footprint(n_bits, + &array1_byte_offset, &array1_slabs, + &array2_byte_offset, &array2_slabs); + if (size < mem_size) { + return NULL; + } + + /* Setup bitmap */ + memset(mem, 0, size); + bmp = (struct rte_bitmap *) mem; + + bmp->array1 = (uint64_t *) &mem[array1_byte_offset]; + bmp->array1_size = array1_slabs; + bmp->array2 = (uint64_t *) &mem[array2_byte_offset]; + bmp->array2_size = array2_slabs; + + __rte_bitmap_scan_init(bmp); + + return bmp; +} + +/** + * Bitmap free + * + * @param bmp + * Handle to bitmap instance + * @return + * 0 upon success, error code otherwise + */ +static inline int +rte_bitmap_free(struct rte_bitmap *bmp) +{ + /* Check input arguments */ + if (bmp == NULL) { + return -1; + } + + return 0; +} + +/** + * Bitmap reset + * + * @param bmp + * Handle to bitmap instance + */ +static inline void +rte_bitmap_reset(struct rte_bitmap *bmp) +{ + memset(bmp->array1, 0, bmp->array1_size * sizeof(uint64_t)); + memset(bmp->array2, 0, bmp->array2_size * sizeof(uint64_t)); + __rte_bitmap_scan_init(bmp); +} + +/** + * Bitmap location prefetch into CPU L1 cache + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position + * @return + * 0 upon success, error code otherwise + */ +static inline void +rte_bitmap_prefetch0(struct rte_bitmap *bmp, uint32_t pos) +{ + uint64_t *slab2; + uint32_t index2; + + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + slab2 = bmp->array2 + index2; + rte_prefetch0((void *) slab2); +} + +/** + * Bitmap bit get + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position + * @return + * 0 when bit is cleared, non-zero when bit is set + */ +static inline uint64_t +rte_bitmap_get(struct rte_bitmap *bmp, uint32_t pos) +{ + uint64_t *slab2; + uint32_t index2, offset2; + + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; + slab2 = bmp->array2 + index2; + return (*slab2) & (1lu << offset2); +} + +/** + * 
Bitmap bit set + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position + */ +static inline void +rte_bitmap_set(struct rte_bitmap *bmp, uint32_t pos) +{ + uint64_t *slab1, *slab2; + uint32_t index1, index2, offset1, offset2; + + /* Set bit in array2 slab and set bit in array1 slab */ + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; + index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); + offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; + slab2 = bmp->array2 + index2; + slab1 = bmp->array1 + index1; + + *slab2 |= 1lu << offset2; + *slab1 |= 1lu << offset1; +} + +/** + * Bitmap slab set + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position identifying the array2 slab + * @param slab + * Value to be assigned to the 64-bit slab in array2 + */ +static inline void +rte_bitmap_set_slab(struct rte_bitmap *bmp, uint32_t pos, uint64_t slab) +{ + uint64_t *slab1, *slab2; + uint32_t index1, index2, offset1; + + /* Set bits in array2 slab and set bit in array1 slab */ + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); + offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; + slab2 = bmp->array2 + index2; + slab1 = bmp->array1 + index1; + + *slab2 |= slab; + *slab1 |= 1lu << offset1; +} + +static inline uint64_t +__rte_bitmap_line_not_empty(uint64_t *slab2) +{ + uint64_t v1, v2, v3, v4; + + v1 = slab2[0] | slab2[1]; + v2 = slab2[2] | slab2[3]; + v3 = slab2[4] | slab2[5]; + v4 = slab2[6] | slab2[7]; + v1 |= v2; + v3 |= v4; + + return v1 | v3; +} + +/** + * Bitmap bit clear + * + * @param bmp + * Handle to bitmap instance + * @param pos + * Bit position + */ +static inline void +rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos) +{ + uint64_t *slab1, *slab2; + uint32_t index1, index2, offset1, offset2; + + /* Clear bit in array2 slab */ + index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; + slab2 = bmp->array2 + index2; + + /* Return if array2 slab is not all-zeros */ + *slab2 &= ~(1lu << offset2); + if (*slab2){ + return; + } + + /* Check the entire cache line of array2 for all-zeros */ + index2 &= ~ RTE_BITMAP_CL_SLAB_MASK; + slab2 = bmp->array2 + index2; + if (__rte_bitmap_line_not_empty(slab2)) { + return; + } + + /* The array2 cache line is all-zeros, so clear bit in array1 slab */ + index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); + offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; + slab1 = bmp->array1 + index1; + *slab1 &= ~(1lu << offset1); + + return; +} + +static inline int +__rte_bitmap_scan_search(struct rte_bitmap *bmp) +{ + uint64_t value1; + uint32_t i; + + /* Check current array1 slab */ + value1 = bmp->array1[bmp->index1]; + value1 &= __rte_bitmap_mask1_get(bmp); + + if (rte_bsf64(value1, &bmp->offset1)) { + return 1; + } + + __rte_bitmap_index1_inc(bmp); + bmp->offset1 = 0; + + /* Look for another array1 slab */ + for (i = 0; i < bmp->array1_size; i ++, __rte_bitmap_index1_inc(bmp)) { + value1 = bmp->array1[bmp->index1]; + + if (rte_bsf64(value1, &bmp->offset1)) { + return 1; + } + } + + return 0; +} + +static inline void +__rte_bitmap_scan_read_init(struct rte_bitmap *bmp) +{ + __rte_bitmap_index2_set(bmp); + bmp->go2 = 1; + rte_prefetch1((void *)(bmp->array2 + bmp->index2 + 8)); +} + +static inline int +__rte_bitmap_scan_read(struct rte_bitmap 
*bmp, uint32_t *pos, uint64_t *slab) +{ + uint64_t *slab2; + + slab2 = bmp->array2 + bmp->index2; + for ( ; bmp->go2 ; bmp->index2 ++, slab2 ++, bmp->go2 = bmp->index2 & RTE_BITMAP_CL_SLAB_MASK) { + if (*slab2) { + *pos = bmp->index2 << RTE_BITMAP_SLAB_BIT_SIZE_LOG2; + *slab = *slab2; + + bmp->index2 ++; + slab2 ++; + bmp->go2 = bmp->index2 & RTE_BITMAP_CL_SLAB_MASK; + return 1; + } + } + + return 0; +} + +/** + * Bitmap scan (with automatic wrap-around) + * + * @param bmp + * Handle to bitmap instance + * @param pos + * When function call returns 1, pos contains the position of the next set + * bit, otherwise not modified + * @param slab + * When function call returns 1, slab contains the value of the entire 64-bit + * slab where the bit indicated by pos is located. Slabs are always 64-bit + * aligned, so the position of the first bit of the slab (this bit is not + * necessarily set) is pos / 64. Once a slab has been returned by the bitmap + * scan operation, the internal pointers of the bitmap are updated to point + * after this slab, so the same slab will not be returned again if it + * contains more than one bit which is set. When function call returns 0, + * slab is not modified. + * @return + * 0 if there is no bit set in the bitmap, 1 otherwise + */ +static inline int +rte_bitmap_scan(struct rte_bitmap *bmp, uint32_t *pos, uint64_t *slab) +{ + /* Return data from current array2 line if available */ + if (__rte_bitmap_scan_read(bmp, pos, slab)) { + return 1; + } + + /* Look for non-empty array2 line */ + if (__rte_bitmap_scan_search(bmp)) { + __rte_bitmap_scan_read_init(bmp); + __rte_bitmap_scan_read(bmp, pos, slab); + return 1; + } + + /* Empty bitmap */ + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_BITMAP_H__ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_branch_prediction.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_branch_prediction.h new file mode 100644 index 00000000..854ef9e5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_branch_prediction.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Branch Prediction Helpers in RTE + */ + +#ifndef _RTE_BRANCH_PREDICTION_H_ +#define _RTE_BRANCH_PREDICTION_H_ + +/** + * Check if a branch is likely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * likely to be taken. Example: + * + * if (likely(x > 1)) + * do_stuff(); + * + */ +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif /* likely */ + +/** + * Check if a branch is unlikely to be taken. + * + * This compiler builtin allows the developer to indicate if a branch is + * unlikely to be taken. 
Example: + * + * if (unlikely(x < 1)) + * do_stuff(); + * + */ +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* unlikely */ + +#endif /* _RTE_BRANCH_PREDICTION_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_bus.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_bus.h new file mode 100644 index 00000000..b7b5b084 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_bus.h @@ -0,0 +1,339 @@ +/*- + * BSD LICENSE + * + * Copyright 2016 NXP + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of NXP nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_BUS_H_ +#define _RTE_BUS_H_ + +/** + * @file + * + * DPDK device bus interface + * + * This file exposes API and interfaces for bus abstraction + * over the devices and drivers in EAL. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include +#include + +/** Double linked list of buses */ +TAILQ_HEAD(rte_bus_list, rte_bus); + + +/** + * IOVA mapping mode. + * + * IOVA mapping mode is iommu programming mode of a device. + * That device (for example: IOMMU backed DMA device) based + * on rte_iova_mode will generate physical or virtual address. + * + */ +enum rte_iova_mode { + RTE_IOVA_DC = 0, /* Don't care mode */ + RTE_IOVA_PA = (1 << 0), /* DMA using physical address */ + RTE_IOVA_VA = (1 << 1) /* DMA using virtual address */ +}; + +/** + * Bus specific scan for devices attached on the bus. + * For each bus object, the scan would be responsible for finding devices and + * adding them to its private device list. + * + * A bus should mandatorily implement this method. + * + * @return + * 0 for successful scan + * <0 for unsuccessful scan with error value + */ +typedef int (*rte_bus_scan_t)(void); + +/** + * Implementation specific probe function which is responsible for linking + * devices on that bus with applicable drivers. + * + * This is called while iterating over each registered bus. + * + * @return + * 0 for successful probe + * !0 for any error while probing + */ +typedef int (*rte_bus_probe_t)(void); + +/** + * Device iterator to find a device on a bus. 
+ * + * This function returns an rte_device if one of those held by the bus + * matches the data passed as parameter. + * + * If the comparison function returns zero this function should stop iterating + * over any more devices. To continue a search the device of a previous search + * can be passed via the start parameter. + * + * @param cmp + * Comparison function. + * + * @param data + * Data to compare each device against. + * + * @param start + * starting point for the iteration + * + * @return + * The first device matching the data, NULL if none exists. + */ +typedef struct rte_device * +(*rte_bus_find_device_t)(const struct rte_device *start, rte_dev_cmp_t cmp, + const void *data); + +/** + * Implementation specific probe function which is responsible for linking + * devices on that bus with applicable drivers. + * + * @param dev + * Device pointer that was returned by a previous call to find_device. + * + * @return + * 0 on success. + * !0 on error. + */ +typedef int (*rte_bus_plug_t)(struct rte_device *dev); + +/** + * Implementation specific remove function which is responsible for unlinking + * devices on that bus from assigned driver. + * + * @param dev + * Device pointer that was returned by a previous call to find_device. + * + * @return + * 0 on success. + * !0 on error. + */ +typedef int (*rte_bus_unplug_t)(struct rte_device *dev); + +/** + * Bus specific parsing function. + * Validates the syntax used in the textual representation of a device, + * If the syntax is valid and ``addr`` is not NULL, writes the bus-specific + * device representation to ``addr``. + * + * @param[in] name + * device textual description + * + * @param[out] addr + * device information location address, into which parsed info + * should be written. If NULL, nothing should be written, which + * is not an error. + * + * @return + * 0 if parsing was successful. + * !0 for any error. + */ +typedef int (*rte_bus_parse_t)(const char *name, void *addr); + +/** + * Bus scan policies + */ +enum rte_bus_scan_mode { + RTE_BUS_SCAN_UNDEFINED, + RTE_BUS_SCAN_WHITELIST, + RTE_BUS_SCAN_BLACKLIST, +}; + +/** + * A structure used to configure bus operations. + */ +struct rte_bus_conf { + enum rte_bus_scan_mode scan_mode; /**< Scan policy. */ +}; + + +/** + * Get common iommu class of the all the devices on the bus. The bus may + * check that those devices are attached to iommu driver. + * If no devices are attached to the bus. The bus may return with don't care + * (_DC) value. + * Otherwise, The bus will return appropriate _pa or _va iova mode. + * + * @return + * enum rte_iova_mode value. + */ +typedef enum rte_iova_mode (*rte_bus_get_iommu_class_t)(void); + + +/** + * A structure describing a generic bus. + */ +struct rte_bus { + TAILQ_ENTRY(rte_bus) next; /**< Next bus object in linked list */ + const char *name; /**< Name of the bus */ + rte_bus_scan_t scan; /**< Scan for devices attached to bus */ + rte_bus_probe_t probe; /**< Probe devices on bus */ + rte_bus_find_device_t find_device; /**< Find a device on the bus */ + rte_bus_plug_t plug; /**< Probe single device for drivers */ + rte_bus_unplug_t unplug; /**< Remove single device from driver */ + rte_bus_parse_t parse; /**< Parse a device name */ + struct rte_bus_conf conf; /**< Bus configuration */ + rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */ + rte_dev_iterate_t dev_iterate; /**< Device iterator. */ +}; + +/** + * Register a Bus handler. + * + * @param bus + * A pointer to a rte_bus structure describing the bus + * to be registered. 
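+ *
+ * A registration sketch (my_scan and my_probe are placeholder callbacks;
+ * bus drivers normally go through the RTE_REGISTER_BUS() helper defined at
+ * the end of this file, which fills in the name and calls this function
+ * from a constructor):
+ *
+ *   static struct rte_bus my_bus = {
+ *           .scan = my_scan,
+ *           .probe = my_probe,
+ *   };
+ *   RTE_REGISTER_BUS(my_bus, my_bus);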
+ */ +void rte_bus_register(struct rte_bus *bus); + +/** + * Unregister a Bus handler. + * + * @param bus + * A pointer to a rte_bus structure describing the bus + * to be unregistered. + */ +void rte_bus_unregister(struct rte_bus *bus); + +/** + * Scan all the buses. + * + * @return + * 0 in case of success in scanning all buses + * !0 in case of failure to scan + */ +int rte_bus_scan(void); + +/** + * For each device on the buses, perform a driver 'match' and call the + * driver-specific probe for device initialization. + * + * @return + * 0 for successful match/probe + * !0 otherwise + */ +int rte_bus_probe(void); + +/** + * Dump information of all the buses registered with EAL. + * + * @param f + * A valid and open output stream handle + */ +void rte_bus_dump(FILE *f); + +/** + * Bus comparison function. + * + * @param bus + * Bus under test. + * + * @param data + * Data to compare against. + * + * @return + * 0 if the bus matches the data. + * !0 if the bus does not match. + * <0 if ordering is possible and the bus is lower than the data. + * >0 if ordering is possible and the bus is greater than the data. + */ +typedef int (*rte_bus_cmp_t)(const struct rte_bus *bus, const void *data); + +/** + * Bus iterator to find a particular bus. + * + * This function compares each registered bus to find one that matches + * the data passed as parameter. + * + * If the comparison function returns zero this function will stop iterating + * over any more buses. To continue a search the bus of a previous search can + * be passed via the start parameter. + * + * @param start + * Starting point for the iteration. + * + * @param cmp + * Comparison function. + * + * @param data + * Data to pass to comparison function. + * + * @return + * A pointer to a rte_bus structure or NULL in case no bus matches + */ +struct rte_bus *rte_bus_find(const struct rte_bus *start, rte_bus_cmp_t cmp, + const void *data); + +/** + * Find the registered bus for a particular device. + */ +struct rte_bus *rte_bus_find_by_device(const struct rte_device *dev); + +/** + * Find the registered bus for a given name. + */ +struct rte_bus *rte_bus_find_by_name(const char *busname); + + +/** + * Get the common iommu class of devices bound on to buses available in the + * system. The default mode is PA. + * + * @return + * enum rte_iova_mode value. + */ +enum rte_iova_mode rte_bus_get_iommu_class(void); + +/** + * Helper for Bus registration. + * The constructor has higher priority than PMD constructors. + */ +#define RTE_REGISTER_BUS(nm, bus) \ +RTE_INIT_PRIO(businitfn_ ##nm, BUS) \ +{\ + (bus).name = RTE_STR(nm);\ + rte_bus_register(&bus); \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_BUS_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_class.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_class.h new file mode 100644 index 00000000..276c91e9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_class.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Gaëtan Rivet + */ + +#ifndef _RTE_CLASS_H_ +#define _RTE_CLASS_H_ + +/** + * @file + * + * DPDK device class interface. + * + * This file describes the interface of the device class + * abstraction layer. + * + * A device class defines the type of function a device + * will be used for e.g.: Ethernet adapter (eth), + * cryptographic coprocessor (crypto), etc. 
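+ *
+ * A registration sketch (my_dev_iterate is a placeholder rte_dev_iterate_t
+ * implementation; the RTE_REGISTER_CLASS() helper defined below fills in
+ * the name and registers the class from a constructor):
+ *
+ *   static struct rte_class my_class = {
+ *           .dev_iterate = my_dev_iterate,
+ *   };
+ *   RTE_REGISTER_CLASS(my_class, my_class);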
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include + +/** Double linked list of classes */ +TAILQ_HEAD(rte_class_list, rte_class); + +/** + * A structure describing a generic device class. + */ +struct rte_class { + TAILQ_ENTRY(rte_class) next; /**< Next device class in linked list */ + const char *name; /**< Name of the class */ + rte_dev_iterate_t dev_iterate; /**< Device iterator. */ +}; + +/** + * Class comparison function. + * + * @param cls + * Class under test. + * + * @param data + * Data to compare against. + * + * @return + * 0 if the class matches the data. + * !0 if the class does not match. + * <0 if ordering is possible and the class is lower than the data. + * >0 if ordering is possible and the class is greater than the data. + */ +typedef int (*rte_class_cmp_t)(const struct rte_class *cls, const void *data); + +/** + * Class iterator to find a particular class. + * + * This function compares each registered class to find one that matches + * the data passed as parameter. + * + * If the comparison function returns zero this function will stop iterating + * over any more classes. To continue a search the class of a previous search + * can be passed via the start parameter. + * + * @param start + * Starting point for the iteration. + * + * @param cmp + * Comparison function. + * + * @param data + * Data to pass to comparison function. + * + * @return + * A pointer to a rte_class structure or NULL in case no class matches + */ +__rte_experimental +struct rte_class * +rte_class_find(const struct rte_class *start, rte_class_cmp_t cmp, + const void *data); + +/** + * Find the registered class for a given name. + */ +__rte_experimental +struct rte_class * +rte_class_find_by_name(const char *name); + +/** + * Register a Class handle. + * + * @param cls + * A pointer to a rte_class structure describing the class + * to be registered. + */ +__rte_experimental +void rte_class_register(struct rte_class *cls); + +/** + * Unregister a Class handle. + * + * @param cls + * A pointer to a rte_class structure describing the class + * to be unregistered. + */ +__rte_experimental +void rte_class_unregister(struct rte_class *cls); + +/** + * Helper for Class registration. + * The constructor has lower priority than Bus constructors. + * The constructor has higher priority than PMD constructors. + */ +#define RTE_REGISTER_CLASS(nm, cls) \ +RTE_INIT_PRIO(classinitfn_ ##nm, CLASS) \ +{\ + (cls).name = RTE_STR(nm); \ + rte_class_register(&cls); \ +} + +#define RTE_UNREGISTER_CLASS(nm, cls) \ +RTE_FINI_PRIO(classfinifn_ ##nm, CLASS) \ +{ \ + rte_class_unregister(&cls); \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CLASS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_common.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_common.h new file mode 100644 index 00000000..069c13ec --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_common.h @@ -0,0 +1,578 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_COMMON_H_ +#define _RTE_COMMON_H_ + +/** + * @file + * + * Generic, commonly-used macro and inline function definitions + * for DPDK. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include + +#include + +#ifndef typeof +#define typeof __typeof__ +#endif + +#ifndef asm +#define asm __asm__ +#endif + +/** C extension macro for environments lacking C11 features. 
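+ *
+ * For instance, an unnamed union member in a public header can be prefixed
+ * with RTE_STD_C11 so that pre-C11 toolchains still accept it as a GNU
+ * extension (hypothetical structure):
+ *
+ *   struct my_desc {
+ *           RTE_STD_C11
+ *           union {
+ *                   uint64_t raw;
+ *                   uint32_t half[2];
+ *           };
+ *   };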
*/ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L +#define RTE_STD_C11 __extension__ +#else +#define RTE_STD_C11 +#endif + +/** Define GCC_VERSION **/ +#ifdef RTE_TOOLCHAIN_GCC +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + \ + __GNUC_PATCHLEVEL__) +#endif + +#ifdef RTE_ARCH_STRICT_ALIGN +typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1))); +typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1))); +typedef uint16_t unaligned_uint16_t __attribute__ ((aligned(1))); +#else +typedef uint64_t unaligned_uint64_t; +typedef uint32_t unaligned_uint32_t; +typedef uint16_t unaligned_uint16_t; +#endif + +/** + * Force alignment + */ +#define __rte_aligned(a) __attribute__((__aligned__(a))) + +/** + * Force a structure to be packed + */ +#define __rte_packed __attribute__((__packed__)) + +/******* Macro to mark functions and fields scheduled for removal *****/ +#define __rte_deprecated __attribute__((__deprecated__)) + +/*********** Macros to eliminate unused variable warnings ********/ + +/** + * short definition to mark a function parameter unused + */ +#define __rte_unused __attribute__((__unused__)) + +/** + * definition to mark a variable or function parameter as used so + * as to avoid a compiler warning + */ +#define RTE_SET_USED(x) (void)(x) + +#define RTE_PRIORITY_LOG 101 +#define RTE_PRIORITY_BUS 110 +#define RTE_PRIORITY_CLASS 120 +#define RTE_PRIORITY_LAST 65535 + +#define RTE_PRIO(prio) \ + RTE_PRIORITY_ ## prio + +/** + * Run function before main() with high priority. + * + * @param func + * Constructor function. + * @param prio + * Priority number must be above 100. + * Lowest number is the first to run. + */ +#define RTE_INIT_PRIO(func, prio) \ +static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void) + +/** + * Run function before main() with low priority. + * + * The constructor will be run after prioritized constructors. + * + * @param func + * Constructor function. + */ +#define RTE_INIT(func) \ + RTE_INIT_PRIO(func, LAST) + +/** + * Run after main() with low priority. + * + * @param func + * Destructor function name. + * @param prio + * Priority number must be above 100. + * Lowest number is the last to run. + */ +#define RTE_FINI_PRIO(func, prio) \ +static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) + +/** + * Run after main() with high priority. + * + * The destructor will be run *before* prioritized destructors. + * + * @param func + * Destructor function name. + */ +#define RTE_FINI(func) \ + RTE_FINI_PRIO(func, LAST) + +/** + * Force a function to be inlined + */ +#define __rte_always_inline inline __attribute__((always_inline)) + +/** + * Force a function to be noinlined + */ +#define __rte_noinline __attribute__((noinline)) + +/*********** Macros for pointer arithmetic ********/ + +/** + * add a byte-value offset to a pointer + */ +#define RTE_PTR_ADD(ptr, x) ((void*)((uintptr_t)(ptr) + (x))) + +/** + * subtract a byte-value offset from a pointer + */ +#define RTE_PTR_SUB(ptr, x) ((void*)((uintptr_t)ptr - (x))) + +/** + * get the difference between two pointer values, i.e. how far apart + * in bytes are the locations they point two. It is assumed that + * ptr1 is greater than ptr2. + */ +#define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2)) + +/*********** Macros/static functions for doing alignment ********/ + + +/** + * Macro to align a pointer to a given power-of-two. 
The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no higher than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_FLOOR(ptr, align) \ + ((typeof(ptr))RTE_ALIGN_FLOOR((uintptr_t)ptr, align)) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no + * bigger than the first parameter. Second parameter must be a + * power-of-two value. + */ +#define RTE_ALIGN_FLOOR(val, align) \ + (typeof(val))((val) & (~((typeof(val))((align) - 1)))) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + */ +#define RTE_PTR_ALIGN_CEIL(ptr, align) \ + RTE_PTR_ALIGN_FLOOR((typeof(ptr))RTE_PTR_ADD(ptr, (align) - 1), align) + +/** + * Macro to align a value to a given power-of-two. The resultant value + * will be of the same type as the first parameter, and will be no lower + * than the first parameter. Second parameter must be a power-of-two + * value. + */ +#define RTE_ALIGN_CEIL(val, align) \ + RTE_ALIGN_FLOOR(((val) + ((typeof(val)) (align) - 1)), align) + +/** + * Macro to align a pointer to a given power-of-two. The resultant + * pointer will be a pointer of the same type as the first parameter, and + * point to an address no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_PTR_ALIGN_CEIL + */ +#define RTE_PTR_ALIGN(ptr, align) RTE_PTR_ALIGN_CEIL(ptr, align) + +/** + * Macro to align a value to a given power-of-two. The resultant + * value will be of the same type as the first parameter, and + * will be no lower than the first parameter. Second parameter + * must be a power-of-two value. + * This function is the same as RTE_ALIGN_CEIL + */ +#define RTE_ALIGN(val, align) RTE_ALIGN_CEIL(val, align) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no lower + * than the first parameter. + */ +#define RTE_ALIGN_MUL_CEIL(v, mul) \ + (((v + (typeof(v))(mul) - 1) / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no higher + * than the first parameter. + */ +#define RTE_ALIGN_MUL_FLOOR(v, mul) \ + ((v / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Checks if a pointer is aligned to a given power-of-two value + * + * @param ptr + * The pointer whose alignment is to be checked + * @param align + * The power-of-two value to which the ptr should be aligned + * + * @return + * True(1) where the pointer is correctly aligned, false(0) otherwise + */ +static inline int +rte_is_aligned(void *ptr, unsigned align) +{ + return RTE_PTR_ALIGN(ptr, align) == ptr; +} + +/*********** Macros for compile type checks ********/ + +/** + * Triggers an error at compilation time if the condition is true. 
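+ *
+ * Typically used to pin layout or configuration assumptions at build time,
+ * e.g. (hypothetical structure):
+ *
+ *   RTE_BUILD_BUG_ON(sizeof(struct my_hdr) != 16);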
+ */ +#ifndef __OPTIMIZE__ +#define RTE_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#else +extern int RTE_BUILD_BUG_ON_detected_error; +#define RTE_BUILD_BUG_ON(condition) do { \ + ((void)sizeof(char[1 - 2*!!(condition)])); \ + if (condition) \ + RTE_BUILD_BUG_ON_detected_error = 1; \ +} while(0) +#endif + +/** + * Combines 32b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param x + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. + */ +static inline uint32_t +rte_combine32ms1b(register uint32_t x) +{ + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + + return x; +} + +/** + * Combines 64b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param v + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. + */ +static inline uint64_t +rte_combine64ms1b(register uint64_t v) +{ + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + + return v; +} + +/*********** Macros to work with powers of 2 ********/ + +/** + * Macro to return 1 if n is a power of 2, 0 otherwise + */ +#define RTE_IS_POWER_OF_2(n) ((n) && !(((n) - 1) & (n))) + +/** + * Returns true if n is a power of 2 + * @param n + * Number to check + * @return 1 if true, 0 otherwise + */ +static inline int +rte_is_power_of_2(uint32_t n) +{ + return n && !(n & (n - 1)); +} + +/** + * Aligns input parameter to the next power of 2 + * + * @param x + * The integer value to algin + * + * @return + * Input parameter aligned to the next power of 2 + */ +static inline uint32_t +rte_align32pow2(uint32_t x) +{ + x--; + x = rte_combine32ms1b(x); + + return x + 1; +} + +/** + * Aligns input parameter to the previous power of 2 + * + * @param x + * The integer value to algin + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint32_t +rte_align32prevpow2(uint32_t x) +{ + x = rte_combine32ms1b(x); + + return x - (x >> 1); +} + +/** + * Aligns 64b input parameter to the next power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the next power of 2 + */ +static inline uint64_t +rte_align64pow2(uint64_t v) +{ + v--; + v = rte_combine64ms1b(v); + + return v + 1; +} + +/** + * Aligns 64b input parameter to the previous power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint64_t +rte_align64prevpow2(uint64_t v) +{ + v = rte_combine64ms1b(v); + + return v - (v >> 1); +} + +/*********** Macros for calculating min and max **********/ + +/** + * Macro to return the minimum of two numbers + */ +#define RTE_MIN(a, b) \ + __extension__ ({ \ + typeof (a) _a = (a); \ + typeof (b) _b = (b); \ + _a < _b ? _a : _b; \ + }) + +/** + * Macro to return the maximum of two numbers + */ +#define RTE_MAX(a, b) \ + __extension__ ({ \ + typeof (a) _a = (a); \ + typeof (b) _b = (b); \ + _a > _b ? _a : _b; \ + }) + +/*********** Other general functions / macros ********/ + +/** + * Searches the input parameter for the least significant set bit + * (starting from zero). + * If a least significant 1 bit is found, its bit index is returned. 
+ * If the content of the input parameter is zero, then the content of the return + * value is undefined. + * @param v + * input parameter, should not be zero. + * @return + * least significant set bit in the input parameter. + */ +static inline uint32_t +rte_bsf32(uint32_t v) +{ + return (uint32_t)__builtin_ctz(v); +} + +/** + * Return the rounded-up log2 of a integer. + * + * @param v + * The input parameter. + * @return + * The rounded-up log2 of the input, or 0 if the input is 0. + */ +static inline uint32_t +rte_log2_u32(uint32_t v) +{ + if (v == 0) + return 0; + v = rte_align32pow2(v); + return rte_bsf32(v); +} + +#ifndef offsetof +/** Return the offset of a field in a structure. */ +#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER) +#endif + +/** + * Return pointer to the wrapping struct instance. + * + * Example: + * + * struct wrapper { + * ... + * struct child c; + * ... + * }; + * + * struct child *x = obtain(...); + * struct wrapper *w = container_of(x, struct wrapper, c); + */ +#ifndef container_of +#define container_of(ptr, type, member) __extension__ ({ \ + const typeof(((type *)0)->member) *_ptr = (ptr); \ + __attribute__((unused)) type *_target_ptr = \ + (type *)(ptr); \ + (type *)(((uintptr_t)_ptr) - offsetof(type, member)); \ + }) +#endif + +#define _RTE_STR(x) #x +/** Take a macro value and get a string version of it */ +#define RTE_STR(x) _RTE_STR(x) + +/** + * ISO C helpers to modify format strings using variadic macros. + * This is a replacement for the ", ## __VA_ARGS__" GNU extension. + * An empty %s argument is appended to avoid a dangling comma. + */ +#define RTE_FMT(fmt, ...) fmt "%.0s", __VA_ARGS__ "" +#define RTE_FMT_HEAD(fmt, ...) fmt +#define RTE_FMT_TAIL(fmt, ...) __VA_ARGS__ + +/** Mask value of type "tp" for the first "ln" bit set. */ +#define RTE_LEN2MASK(ln, tp) \ + ((tp)((uint64_t)-1 >> (sizeof(uint64_t) * CHAR_BIT - (ln)))) + +/** Number of elements in the array. */ +#define RTE_DIM(a) (sizeof (a) / sizeof ((a)[0])) + +/** + * Converts a numeric string to the equivalent uint64_t value. + * As well as straight number conversion, also recognises the suffixes + * k, m and g for kilobytes, megabytes and gigabytes respectively. + * + * If a negative number is passed in i.e. a string with the first non-black + * character being "-", zero is returned. Zero is also returned in the case of + * an error with the strtoull call in the function. + * + * @param str + * String containing number to convert. + * @return + * Number. + */ +static inline uint64_t +rte_str_to_size(const char *str) +{ + char *endptr; + unsigned long long size; + + while (isspace((int)*str)) + str++; + if (*str == '-') + return 0; + + errno = 0; + size = strtoull(str, &endptr, 0); + if (errno) + return 0; + + if (*endptr == ' ') + endptr++; /* allow 1 space gap */ + + switch (*endptr){ + case 'G': case 'g': size *= 1024; /* fall-through */ + case 'M': case 'm': size *= 1024; /* fall-through */ + case 'K': case 'k': size *= 1024; /* fall-through */ + default: + break; + } + return size; +} + +/** + * Function to terminate the application immediately, printing an error + * message and returning the exit_code back to the shell. + * + * This function never returns + * + * @param exit_code + * The exit code to be returned by the application + * @param format + * The format string to be used for printing the message. This can include + * printf format characters which will be expanded using any further parameters + * to the function. 
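The helpers above lend themselves to a small self-contained example; this sketch assumes rte_common.h declares rte_str_to_size(), container_of() and RTE_DIM() as shown, and mirrors the wrapper/child layout from the container_of() comment:

#include <stdio.h>
#include <inttypes.h>
#include <rte_common.h>

struct wrapper {
	int id;
	struct child { int x; } c;
};

int main(void)
{
	/* suffix-aware parsing: "2M" -> 2 * 1024 * 1024 */
	printf("%" PRIu64 "\n", rte_str_to_size("2M")); /* 2097152 */

	/* recover the enclosing structure from a pointer to its member */
	struct wrapper w = { .id = 1, .c = { .x = 42 } };
	struct child *cp = &w.c;
	struct wrapper *back = container_of(cp, struct wrapper, c);
	printf("%d\n", back == &w); /* 1 */

	int tbl[8];
	printf("%zu\n", RTE_DIM(tbl)); /* 8 */
	return 0;
}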
+ */ +void +rte_exit(int exit_code, const char *format, ...) + __attribute__((noreturn)) + __attribute__((format(printf, 2, 3))); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_debug.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_debug.h new file mode 100644 index 00000000..272df494 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_debug.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_DEBUG_H_ +#define _RTE_DEBUG_H_ + +/** + * @file + * + * Debug Functions in RTE + * + * This file defines a generic API for debug operations. Part of + * the implementation is architecture-specific. + */ + +#include "rte_log.h" +#include "rte_branch_prediction.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Dump the stack of the calling core to the console. + */ +void rte_dump_stack(void); + +/** + * Dump the registers of the calling core to the console. + * + * Note: Not implemented in a userapp environment; use gdb instead. + */ +void rte_dump_registers(void); + +/** + * Provide notification of a critical non-recoverable error and terminate + * execution abnormally. + * + * Display the format string and its expanded arguments (printf-like). + * + * In a linuxapp environment, this function dumps the stack and calls + * abort() resulting in a core dump if enabled. + * + * The function never returns. + * + * @param ... + * The format string, followed by the variable list of arguments. + */ +#define rte_panic(...) rte_panic_(__func__, __VA_ARGS__, "dummy") +#define rte_panic_(func, format, ...) __rte_panic(func, format "%.0s", __VA_ARGS__) + +#ifdef RTE_ENABLE_ASSERT +#define RTE_ASSERT(exp) RTE_VERIFY(exp) +#else +#define RTE_ASSERT(exp) do {} while (0) +#endif +#define RTE_VERIFY(exp) do { \ + if (unlikely(!(exp))) \ + rte_panic("line %d\tassert \"%s\" failed\n", __LINE__, #exp); \ +} while (0) + +/* + * Provide notification of a critical non-recoverable error and stop. + * + * This function should not be called directly. Refer to rte_panic() macro + * documentation. + */ +void __rte_panic(const char *funcname , const char *format, ...) +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __attribute__((cold)) +#endif +#endif + __attribute__((noreturn)) + __attribute__((format(printf, 2, 3))); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_DEBUG_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_dev.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_dev.h new file mode 100644 index 00000000..b80a8059 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_dev.h @@ -0,0 +1,463 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2014 6WIND S.A. + */ + +#ifndef _RTE_DEV_H_ +#define _RTE_DEV_H_ + +/** + * @file + * + * RTE PMD Driver Registration Interface + * + * This file manages the list of device drivers. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include +#include +#include + +/** + * The device event type. 
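A sketch of how the fatal-error helpers are meant to be combined: RTE_VERIFY() from rte_debug.h above, with rte_exit() and rte_is_power_of_2() assumed to come from the common header. check_ring_size() is purely illustrative, not a DPDK API:

#include <rte_common.h>
#include <rte_debug.h>

/* illustrative helper, not part of any DPDK API */
static void check_ring_size(unsigned int n)
{
	/* panic (stack dump + abort) if the invariant is broken */
	RTE_VERIFY(n != 0);

	/* orderly termination with an error message and exit code */
	if (!rte_is_power_of_2(n))
		rte_exit(1, "ring size %u is not a power of two\n", n);
}

int main(void)
{
	check_ring_size(1024); /* passes both checks */
	return 0;
}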
+ */ +enum rte_dev_event_type { + RTE_DEV_EVENT_ADD, /**< device being added */ + RTE_DEV_EVENT_REMOVE, /**< device being removed */ + RTE_DEV_EVENT_MAX /**< max value of this enum */ +}; + +struct rte_dev_event { + enum rte_dev_event_type type; /**< device event type */ + int subsystem; /**< subsystem id */ + char *devname; /**< device name */ +}; + +typedef void (*rte_dev_event_cb_fn)(char *device_name, + enum rte_dev_event_type event, + void *cb_arg); + +__attribute__((format(printf, 2, 0))) +static inline void +rte_pmd_debug_trace(const char *func_name, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + { + char buffer[vsnprintf(NULL, 0, fmt, ap) + 1]; + + va_end(ap); + + va_start(ap, fmt); + vsnprintf(buffer, sizeof(buffer), fmt, ap); + va_end(ap); + + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_PMD, "%s: %s", + func_name, buffer); + } +} + +/* + * Enable RTE_PMD_DEBUG_TRACE() when at least one component relying on the + * RTE_*_RET() macros defined below is compiled in debug mode. + */ +#if defined(RTE_LIBRTE_EVENTDEV_DEBUG) +#define RTE_PMD_DEBUG_TRACE(...) \ + rte_pmd_debug_trace(__func__, __VA_ARGS__) +#else +#define RTE_PMD_DEBUG_TRACE(...) (void)0 +#endif + +/* Macros for checking for restricting functions to primary instance only */ +#define RTE_PROC_PRIMARY_OR_ERR_RET(retval) do { \ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { \ + RTE_PMD_DEBUG_TRACE("Cannot run in secondary processes\n"); \ + return retval; \ + } \ +} while (0) + +#define RTE_PROC_PRIMARY_OR_RET() do { \ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { \ + RTE_PMD_DEBUG_TRACE("Cannot run in secondary processes\n"); \ + return; \ + } \ +} while (0) + +/* Macros to check for invalid function pointers */ +#define RTE_FUNC_PTR_OR_ERR_RET(func, retval) do { \ + if ((func) == NULL) { \ + RTE_PMD_DEBUG_TRACE("Function not supported\n"); \ + return retval; \ + } \ +} while (0) + +#define RTE_FUNC_PTR_OR_RET(func) do { \ + if ((func) == NULL) { \ + RTE_PMD_DEBUG_TRACE("Function not supported\n"); \ + return; \ + } \ +} while (0) + +/** + * Device driver. + */ +enum rte_kernel_driver { + RTE_KDRV_UNKNOWN = 0, + RTE_KDRV_IGB_UIO, + RTE_KDRV_VFIO, + RTE_KDRV_UIO_GENERIC, + RTE_KDRV_NIC_UIO, + RTE_KDRV_NONE, +}; + +/** + * Device policies. + */ +enum rte_dev_policy { + RTE_DEV_WHITELISTED, + RTE_DEV_BLACKLISTED, +}; + +/** + * A generic memory resource representation. + */ +struct rte_mem_resource { + uint64_t phys_addr; /**< Physical address, 0 if not resource. */ + uint64_t len; /**< Length of the resource. */ + void *addr; /**< Virtual address, NULL when not mapped. */ +}; + +/** + * A structure describing a device driver. + */ +struct rte_driver { + TAILQ_ENTRY(rte_driver) next; /**< Next in list. */ + const char *name; /**< Driver name. */ + const char *alias; /**< Driver alias. */ +}; + +/* + * Internal identifier length + * Sufficiently large to allow for UUID or PCI address + */ +#define RTE_DEV_NAME_MAX_LEN 64 + +/** + * A structure describing a generic device. + */ +struct rte_device { + TAILQ_ENTRY(rte_device) next; /**< Next device */ + const char *name; /**< Device name */ + const struct rte_driver *driver;/**< Associated driver */ + int numa_node; /**< NUMA node connection */ + struct rte_devargs *devargs; /**< Device user arguments */ +}; + +/** + * Attach a device to a registered driver. + * + * @param name + * The device name, that refers to a pci device (or some private + * way of designating a vdev device). 
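A brief sketch of how the checking macros above are typically used inside a driver-facing wrapper; struct my_ops and my_start() are illustrative only, and rte_eal.h is assumed to supply rte_eal_process_type():

#include <rte_dev.h>
#include <rte_eal.h>

/* illustrative callback table, not a DPDK structure */
struct my_ops {
	int (*start)(void *ctx);
};

static int my_start(struct my_ops *ops, void *ctx)
{
	/* refuse to run in a secondary process */
	RTE_PROC_PRIMARY_OR_ERR_RET(-1);
	/* refuse to run if the driver did not provide the callback */
	RTE_FUNC_PTR_OR_ERR_RET(ops->start, -1);

	return ops->start(ctx);
}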
Based on this device name, eal + * will identify a driver capable of handling it and pass it to the + * driver probing function. + * @param devargs + * Device arguments to be passed to the driver. + * @return + * 0 on success, negative on error. + */ +__rte_deprecated +int rte_eal_dev_attach(const char *name, const char *devargs); + +/** + * Detach a device from its driver. + * + * @param dev + * A pointer to a rte_device structure. + * @return + * 0 on success, negative on error. + */ +__rte_deprecated +int rte_eal_dev_detach(struct rte_device *dev); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Hotplug add a given device to a specific bus. + * + * @param busname + * The bus name the device is added to. + * @param devname + * The device name. Based on this device name, eal will identify a driver + * capable of handling it and pass it to the driver probing function. + * @param devargs + * Device arguments to be passed to the driver. + * @return + * 0 on success, negative on error. + */ +int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, + const char *devargs); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Hotplug remove a given device from a specific bus. + * + * @param busname + * The bus name the device is removed from. + * @param devname + * The device name being removed. + * @return + * 0 on success, negative on error. + */ +int __rte_experimental rte_eal_hotplug_remove(const char *busname, + const char *devname); + +/** + * Device comparison function. + * + * This type of function is used to compare an rte_device with arbitrary + * data. + * + * @param dev + * Device handle. + * + * @param data + * Data to compare against. The type of this parameter is determined by + * the kind of comparison performed by the function. + * + * @return + * 0 if the device matches the data. + * !0 if the device does not match. + * <0 if ordering is possible and the device is lower than the data. + * >0 if ordering is possible and the device is greater than the data. + */ +typedef int (*rte_dev_cmp_t)(const struct rte_device *dev, const void *data); + +#define RTE_PMD_EXPORT_NAME_ARRAY(n, idx) n##idx[] + +#define RTE_PMD_EXPORT_NAME(name, idx) \ +static const char RTE_PMD_EXPORT_NAME_ARRAY(this_pmd_name, idx) \ +__attribute__((used)) = RTE_STR(name) + +#define DRV_EXP_TAG(name, tag) __##name##_##tag + +#define RTE_PMD_REGISTER_PCI_TABLE(name, table) \ +static const char DRV_EXP_TAG(name, pci_tbl_export)[] __attribute__((used)) = \ +RTE_STR(table) + +#define RTE_PMD_REGISTER_PARAM_STRING(name, str) \ +static const char DRV_EXP_TAG(name, param_string_export)[] \ +__attribute__((used)) = str + +/** + * Advertise the list of kernel modules required to run this driver + * + * This string lists the kernel modules required for the devices + * associated to a PMD. The format of each line of the string is: + * " ". + * + * The possible formats for the device pattern are: + * "*" all devices supported by this driver + * "pci:*" all PCI devices supported by this driver + * "pci:v8086:d*:sv*:sd*" all PCI devices supported by this driver + * whose vendor id is 0x8086. + * + * The format of the kernel modules list is a parenthesed expression + * containing logical-and (&) and logical-or (|). + * + * The device pattern and the kmod expression are separated by a space. 
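A minimal sketch of the experimental hotplug calls declared above, assuming the EAL has already been initialized; "vdev" and "net_ring0" are just example bus and device names:

#include <rte_dev.h>

static int hotplug_ring_port(void)
{
	/* probe a ring vdev with no extra arguments */
	if (rte_eal_hotplug_add("vdev", "net_ring0", "") < 0)
		return -1;

	/* ... use the device ... */

	/* detach and remove it again */
	return rte_eal_hotplug_remove("vdev", "net_ring0");
}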
+ * + * Example: + * - "* igb_uio | uio_pci_generic | vfio" + */ +#define RTE_PMD_REGISTER_KMOD_DEP(name, str) \ +static const char DRV_EXP_TAG(name, kmod_dep_export)[] \ +__attribute__((used)) = str + +/** + * Iteration context. + * + * This context carries over the current iteration state. + */ +struct rte_dev_iterator { + const char *dev_str; /**< device string. */ + const char *bus_str; /**< bus-related part of device string. */ + const char *cls_str; /**< class-related part of device string. */ + struct rte_bus *bus; /**< bus handle. */ + struct rte_class *cls; /**< class handle. */ + struct rte_device *device; /**< current position. */ + void *class_device; /**< additional specialized context. */ +}; + +/** + * Device iteration function. + * + * Find the next device matching properties passed in parameters. + * The function takes an additional ``start`` parameter, that is + * used as starting context when relevant. + * + * The function returns the current element in the iteration. + * This return value will potentially be used as a start parameter + * in subsequent calls to the function. + * + * The additional iterator parameter is only there if a specific + * implementation needs additional context. It must not be modified by + * the iteration function itself. + * + * @param start + * Starting iteration context. + * + * @param devstr + * Device description string. + * + * @param it + * Device iterator. + * + * @return + * The address of the current element matching the device description + * string. + */ +typedef void *(*rte_dev_iterate_t)(const void *start, + const char *devstr, + const struct rte_dev_iterator *it); + +/** + * Initializes a device iterator. + * + * This iterator allows accessing a list of devices matching a criteria. + * The device matching is made among all buses and classes currently registered, + * filtered by the device description given as parameter. + * + * This function will not allocate any memory. It is safe to stop the + * iteration at any moment and let the iterator go out of context. + * + * @param it + * Device iterator handle. + * + * @param str + * Device description string. + * + * @return + * 0 on successful initialization. + * <0 on error. + */ +__rte_experimental +int +rte_dev_iterator_init(struct rte_dev_iterator *it, const char *str); + +/** + * Iterates on a device iterator. + * + * Generates a new rte_device handle corresponding to the next element + * in the list described in comprehension by the iterator. + * + * The next object is returned, and the iterator is updated. + * + * @param it + * Device iterator handle. + * + * @return + * An rte_device handle if found. + * NULL if an error occurred (rte_errno is set). + * NULL if no device could be found (rte_errno is not set). + */ +__rte_experimental +struct rte_device * +rte_dev_iterator_next(struct rte_dev_iterator *it); + +#define RTE_DEV_FOREACH(dev, devstr, it) \ + for (rte_dev_iterator_init(it, devstr), \ + dev = rte_dev_iterator_next(it); \ + dev != NULL; \ + dev = rte_dev_iterator_next(it)) + +#ifdef __cplusplus +} +#endif + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * It registers the callback for the specific device. + * Multiple callbacks cal be registered at the same time. + * + * @param device_name + * The device name, that is the param name of the struct rte_device, + * null value means for all devices. + * @param cb_fn + * callback address. + * @param cb_arg + * address of parameter for callback. + * + * @return + * - On success, zero. 
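A sketch of the iteration helpers declared above (experimental API), assuming the EAL is initialized so that buses and classes are registered; the device description string is whatever the installed buses accept:

#include <stdio.h>
#include <rte_dev.h>

/* list every device matching a device description string */
static void dump_matching_devices(const char *devstr)
{
	struct rte_dev_iterator it;
	struct rte_device *dev;

	RTE_DEV_FOREACH(dev, devstr, &it)
		printf("matched device: %s\n", dev->name);
}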
+ * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_event_callback_register(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * It unregisters the callback according to the specified device. + * + * @param device_name + * The device name, that is the param name of the struct rte_device, + * null value means for all devices and their callbacks. + * @param cb_fn + * callback address. + * @param cb_arg + * address of parameter for callback, (void *)-1 means to remove all + * registered which has the same callback address. + * + * @return + * - On success, return the number of callback entities removed. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_event_callback_unregister(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Start the device event monitoring. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_event_monitor_start(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Stop the device event monitoring. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_event_monitor_stop(void); + +#endif /* _RTE_DEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_devargs.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_devargs.h new file mode 100644 index 00000000..097a4ce7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_devargs.h @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2014 6WIND S.A. + */ + +#ifndef _RTE_DEVARGS_H_ +#define _RTE_DEVARGS_H_ + +/** + * @file + * + * RTE devargs: list of devices and their user arguments + * + * This file stores a list of devices and their arguments given by + * the user when a DPDK application is started. These devices can be PCI + * devices or virtual devices. These devices are stored at startup in a + * list of rte_devargs structures. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +/** + * Type of generic device + */ +enum rte_devtype { + RTE_DEVTYPE_WHITELISTED_PCI, + RTE_DEVTYPE_BLACKLISTED_PCI, + RTE_DEVTYPE_VIRTUAL, +}; + +/** + * Structure that stores a device given by the user with its arguments + * + * A user device is a physical or a virtual device given by the user to + * the DPDK application at startup through command line arguments. + * + * The structure stores the configuration of the device, its PCI + * identifier if it's a PCI device or the driver name if it's a virtual + * device. + */ +struct rte_devargs { + /** Next in list. */ + TAILQ_ENTRY(rte_devargs) next; + /** Type of device. */ + enum rte_devtype type; + /** Device policy. */ + enum rte_dev_policy policy; + /** Name of the device. */ + char name[RTE_DEV_NAME_MAX_LEN]; + RTE_STD_C11 + union { + /** Arguments string as given by user or "" for no argument. */ + char *args; + const char *drv_str; + }; + struct rte_bus *bus; /**< bus handle. */ + struct rte_class *cls; /**< class handle. */ + const char *bus_str; /**< bus-related part of device string. */ + const char *cls_str; /**< class-related part of device string. */ + const char *data; /**< Device string storage. */ +}; + +/** + * @deprecated + * Parse a devargs string. 
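A sketch of the device-event API declared above (experimental), registering one callback for all devices and then starting the monitor; error handling is kept minimal:

#include <stdio.h>
#include <rte_dev.h>

/* invoked by the EAL when a device is added or removed */
static void on_dev_event(char *device_name, enum rte_dev_event_type event,
		void *cb_arg)
{
	(void)cb_arg;
	printf("device %s %s\n", device_name,
	       event == RTE_DEV_EVENT_ADD ? "added" : "removed");
}

static int setup_hotplug_monitoring(void)
{
	/* a NULL device name registers the callback for all devices */
	if (rte_dev_event_callback_register(NULL, on_dev_event, NULL) < 0)
		return -1;

	return rte_dev_event_monitor_start();
}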
+ * + * For PCI devices, the format of arguments string is "PCI_ADDR" or + * "PCI_ADDR,key=val,key2=val2,...". Examples: "08:00.1", "0000:5:00.0", + * "04:00.0,arg=val". + * + * For virtual devices, the format of arguments string is "DRIVER_NAME*" + * or "DRIVER_NAME*,key=val,key2=val2,...". Examples: "net_ring", + * "net_ring0", "net_pmdAnything,arg=0:arg2=1". + * + * The function parses the arguments string to get driver name and driver + * arguments. + * + * @param devargs_str + * The arguments as given by the user. + * @param drvname + * The pointer to the string to store parsed driver name. + * @param drvargs + * The pointer to the string to store parsed driver arguments. + * + * @return + * - 0 on success + * - A negative value on error + */ +__rte_deprecated +int rte_eal_parse_devargs_str(const char *devargs_str, + char **drvname, char **drvargs); + +/** + * Parse a device string. + * + * Verify that a bus is capable of handling the device passed + * in argument. Store which bus will handle the device, its name + * and the eventual device parameters. + * + * The syntax is: + * + * bus:device_identifier,arg1=val1,arg2=val2 + * + * where "bus:" is the bus name followed by any character separator. + * The bus name is optional. If no bus name is specified, each bus + * will attempt to recognize the device identifier. The first one + * to succeed will be used. + * + * Examples: + * + * pci:0000:05.00.0,arg=val + * 05.00.0,arg=val + * vdev:net_ring0 + * + * @param da + * The devargs structure holding the device information. + * + * @param dev + * String describing a device. + * + * @return + * - 0 on success. + * - Negative errno on error. + */ +__rte_experimental +int +rte_devargs_parse(struct rte_devargs *da, const char *dev); + +/** + * Parse a device string. + * + * Verify that a bus is capable of handling the device passed + * in argument. Store which bus will handle the device, its name + * and the eventual device parameters. + * + * The device string is built with a printf-like syntax. + * + * The syntax is: + * + * bus:device_identifier,arg1=val1,arg2=val2 + * + * where "bus:" is the bus name followed by any character separator. + * The bus name is optional. If no bus name is specified, each bus + * will attempt to recognize the device identifier. The first one + * to succeed will be used. + * + * Examples: + * + * pci:0000:05.00.0,arg=val + * 05.00.0,arg=val + * vdev:net_ring0 + * + * @param da + * The devargs structure holding the device information. + * @param format + * Format string describing a device. + * + * @return + * - 0 on success. + * - Negative errno on error. + */ +__rte_experimental +int +rte_devargs_parsef(struct rte_devargs *da, + const char *format, ...) +__attribute__((format(printf, 2, 0))); + +/** + * Insert an rte_devargs in the global list. + * + * @param da + * The devargs structure to insert. + * + * @return + * - 0 on success + * - Negative on error. + */ +__rte_experimental +int +rte_devargs_insert(struct rte_devargs *da); + +/** + * Add a device to the user device list + * See rte_devargs_parse() for details. + * + * @param devtype + * The type of the device. + * @param devargs_str + * The arguments as given by the user. + * + * @return + * - 0 on success + * - A negative value on error + */ +__rte_experimental +int rte_devargs_add(enum rte_devtype devtype, const char *devargs_str); + +/** + * @deprecated + * Add a device to the user device list + * See rte_devargs_parse() for details. + * + * @param devtype + * The type of the device. 
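A sketch of rte_devargs_parse() (experimental) as declared above, using one of the documented string forms such as "vdev:net_ring0"; the caller owns the rte_devargs storage and only the fields documented above are touched here:

#include <stdio.h>
#include <string.h>
#include <rte_devargs.h>

static int parse_one_device(const char *devstr)
{
	struct rte_devargs da;

	memset(&da, 0, sizeof(da));
	if (rte_devargs_parse(&da, devstr) < 0)
		return -1;

	/* da.bus, da.name and the argument string are now filled in */
	printf("parsed device %s (args: %s)\n", da.name,
	       da.args != NULL ? da.args : "");
	return 0;
}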
+ * @param devargs_str + * The arguments as given by the user. + * + * @return + * - 0 on success + * - A negative value on error + */ +__rte_deprecated +int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); + +/** + * Remove a device from the user device list. + * Its resources are freed. + * If the devargs cannot be found, nothing happens. + * + * @param busname + * bus name of the devargs to remove. + * + * @param devname + * device name of the devargs to remove. + * + * @return + * 0 on success. + * <0 on error. + * >0 if the devargs was not within the user device list. + */ +__rte_experimental +int rte_devargs_remove(const char *busname, + const char *devname); + +/** + * Count the number of user devices of a specified type. + * + * @param devtype + * The type of the devices to be counted. + * + * @return + * The number of devices. + */ +__rte_experimental +unsigned int +rte_devargs_type_count(enum rte_devtype devtype); + +/** + * @deprecated + * Count the number of user devices of a specified type. + * + * @param devtype + * The type of the devices to be counted. + * + * @return + * The number of devices. + */ +__rte_deprecated +unsigned int +rte_eal_devargs_type_count(enum rte_devtype devtype); + +/** + * This function dumps the list of user devices and their arguments. + * + * @param f + * A pointer to a file for output + */ +__rte_experimental +void rte_devargs_dump(FILE *f); + +/** + * @deprecated + * This function dumps the list of user devices and their arguments. + * + * @param f + * A pointer to a file for output + */ +__rte_deprecated +void rte_eal_devargs_dump(FILE *f); + +/** + * Find the next rte_devargs matching the provided bus name. + * + * @param busname + * Limit the iteration to devargs related to buses + * matching this name. + * Will return any next rte_devargs if NULL. + * + * @param start + * Starting iteration point. The iteration will start at + * the first rte_devargs if NULL. + * + * @return + * Next rte_devargs entry matching the requested bus, + * NULL if there is none. + */ +__rte_experimental +struct rte_devargs * +rte_devargs_next(const char *busname, const struct rte_devargs *start); + +/** + * Iterate over all rte_devargs for a specific bus. + */ +#define RTE_EAL_DEVARGS_FOREACH(busname, da) \ + for (da = rte_devargs_next(busname, NULL); \ + da != NULL; \ + da = rte_devargs_next(busname, da)) \ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_DEVARGS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal.h new file mode 100644 index 00000000..e114dcbd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal.h @@ -0,0 +1,505 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _RTE_EAL_H_ +#define _RTE_EAL_H_ + +/** + * @file + * + * EAL Configuration API + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_MAGIC 19820526 /**< Magic number written by the main partition when ready. */ + +/* Maximum thread_name length. */ +#define RTE_MAX_THREAD_NAME_LEN 16 + +/** + * The lcore role (used in RTE or not).
+ */ +enum rte_lcore_role_t { + ROLE_RTE, + ROLE_OFF, + ROLE_SERVICE, +}; + +/** + * The type of process in a linuxapp, multi-process setup + */ +enum rte_proc_type_t { + RTE_PROC_AUTO = -1, /* allow auto-detection of primary/secondary */ + RTE_PROC_PRIMARY = 0, /* set to zero, so primary is the default */ + RTE_PROC_SECONDARY, + + RTE_PROC_INVALID +}; + +/** + * The global RTE configuration structure. + */ +struct rte_config { + uint32_t master_lcore; /**< Id of the master lcore */ + uint32_t lcore_count; /**< Number of available logical cores. */ + uint32_t numa_node_count; /**< Number of detected NUMA nodes. */ + uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; /**< List of detected NUMA nodes. */ + uint32_t service_lcore_count;/**< Number of available service cores. */ + enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */ + + /** Primary or secondary configuration */ + enum rte_proc_type_t process_type; + + /** PA or VA mapping mode */ + enum rte_iova_mode iova_mode; + + /** + * Pointer to memory configuration, which may be shared across multiple + * DPDK instances + */ + struct rte_mem_config *mem_config; +} __attribute__((__packed__)); + +/** + * Get the global configuration structure. + * + * @return + * A pointer to the global configuration structure. + */ +struct rte_config *rte_eal_get_configuration(void); + +/** + * Get a lcore's role. + * + * @param lcore_id + * The identifier of the lcore. + * @return + * The role of the lcore. + */ +enum rte_lcore_role_t rte_eal_lcore_role(unsigned lcore_id); + + +/** + * Get the process type in a multi-process setup + * + * @return + * The process type + */ +enum rte_proc_type_t rte_eal_process_type(void); + +/** + * Request iopl privilege for all RPL. + * + * This function should be called by pmds which need access to ioports. + + * @return + * - On success, returns 0. + * - On failure, returns -1. + */ +int rte_eal_iopl_init(void); + +/** + * Initialize the Environment Abstraction Layer (EAL). + * + * This function is to be executed on the MASTER lcore only, as soon + * as possible in the application's main() function. + * + * The function finishes the initialization process before main() is called. + * It puts the SLAVE lcores in the WAIT state. + * + * When the multi-partition feature is supported, depending on the + * configuration (if CONFIG_RTE_EAL_MAIN_PARTITION is disabled), this + * function waits to ensure that the magic number is set before + * returning. See also the rte_eal_get_configuration() function. Note: + * This behavior may change in the future. + * + * @param argc + * A non-negative value. If it is greater than 0, the array members + * for argv[0] through argv[argc] (non-inclusive) shall contain pointers + * to strings. + * @param argv + * An array of strings. The contents of the array, as well as the strings + * which are pointed to by the array, may be modified by this function. + * @return + * - On success, the number of parsed arguments, which is greater or + * equal to zero. After the call to rte_eal_init(), + * all arguments argv[x] with x < ret may have been modified by this + * function call and should not be further interpreted by the + * application. The EAL does not take any ownership of the memory used + * for either the argv array, or its members. + * - On failure, -1 and rte_errno is set to a value indicating the cause + * for failure. In some instances, the application will need to be + * restarted as part of clearing the issue. 
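A minimal EAL start-up sketch tying the pieces above together; rte_strerror() and rte_errno come from rte_errno.h later in this patch, and rte_eal_cleanup() is experimental in this release:

#include <stdio.h>
#include <rte_eal.h>
#include <rte_errno.h>

int main(int argc, char **argv)
{
	/* EAL consumes its own arguments and may modify argv[] */
	int ret = rte_eal_init(argc, argv);
	if (ret < 0) {
		fprintf(stderr, "EAL init failed: %s\n", rte_strerror(rte_errno));
		return 1;
	}
	argc -= ret;
	argv += ret;

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		printf("running as a secondary process\n");

	/* ... application work on the remaining argc/argv ... */

	rte_eal_cleanup();
	return 0;
}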
+ * + * Error codes returned via rte_errno: + * EACCES indicates a permissions issue. + * + * EAGAIN indicates either a bus or system resource was not available, + * setup may be attempted again. + * + * EALREADY indicates that the rte_eal_init function has already been + * called, and cannot be called again. + * + * EFAULT indicates the tailq configuration name was not found in + * memory configuration. + * + * EINVAL indicates invalid parameters were passed as argv/argc. + * + * ENOMEM indicates failure likely caused by an out-of-memory condition. + * + * ENODEV indicates memory setup issues. + * + * ENOTSUP indicates that the EAL cannot initialize on this system. + * + * EPROTO indicates that the PCI bus is either not present, or is not + * readable by the eal. + * + * ENOEXEC indicates that a service core failed to launch successfully. + */ +int rte_eal_init(int argc, char **argv); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Clean up the Environment Abstraction Layer (EAL) + * + * This function must be called to release any internal resources that EAL has + * allocated during rte_eal_init(). After this call, no DPDK function calls may + * be made. It is expected that common usage of this function is to call it + * just before terminating the process. + * + * @return 0 Successfully released all internal EAL resources + * @return -EFAULT There was an error in releasing all resources. + */ +int __rte_experimental rte_eal_cleanup(void); + +/** + * Check if a primary process is currently alive + * + * This function returns true when a primary process is currently + * active. + * + * @param config_file_path + * The config_file_path argument provided should point at the location + * that the primary process will create its config file. If NULL, the default + * config file path is used. + * + * @return + * - If alive, returns 1. + * - If dead, returns 0. + */ +int rte_eal_primary_proc_alive(const char *config_file_path); + +#define RTE_MP_MAX_FD_NUM 8 /* The max amount of fds */ +#define RTE_MP_MAX_NAME_LEN 64 /* The max length of action name */ +#define RTE_MP_MAX_PARAM_LEN 256 /* The max length of param */ +struct rte_mp_msg { + char name[RTE_MP_MAX_NAME_LEN]; + int len_param; + int num_fds; + uint8_t param[RTE_MP_MAX_PARAM_LEN]; + int fds[RTE_MP_MAX_FD_NUM]; +}; + +struct rte_mp_reply { + int nb_sent; + int nb_received; + struct rte_mp_msg *msgs; /* caller to free */ +}; + +/** + * Action function typedef used by other components. + * + * As we create socket channel for primary/secondary communication, use + * this function typedef to register action for coming messages. + */ +typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer); + +/** + * Asynchronous reply function typedef used by other components. + * + * As we create socket channel for primary/secondary communication, use + * this function typedef to register action for coming responses to asynchronous + * requests. + */ +typedef int (*rte_mp_async_reply_t)(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register an action function for primary/secondary communication. + * + * Call this function to register an action, if the calling component wants + * to response the messages from the corresponding component in its primary + * process or secondary processes. + * + * @param name + * The name argument plays as the nonredundant key to find the action. 
+ * + * @param action + * The action argument is the function pointer to the action function. + * + * @return + * - 0 on success. + * - (<0) on failure. + */ +int __rte_experimental +rte_mp_action_register(const char *name, rte_mp_t action); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Unregister an action function for primary/secondary communication. + * + * Call this function to unregister an action if the calling component does + * not want to response the messages from the corresponding component in its + * primary process or secondary processes. + * + * @param name + * The name argument plays as the nonredundant key to find the action. + * + */ +void __rte_experimental +rte_mp_action_unregister(const char *name); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a message to the peer process. + * + * This function will send a message which will be responsed by the action + * identified by name in the peer process. + * + * @param msg + * The msg argument contains the customized message. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. + */ +int __rte_experimental +rte_mp_sendmsg(struct rte_mp_msg *msg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a request to the peer process and expect a reply. + * + * This function sends a request message to the peer process, and will + * block until receiving reply message from the peer process. + * + * @note The caller is responsible to free reply->replies. + * + * @param req + * The req argument contains the customized request message. + * + * @param reply + * The reply argument will be for storing all the replied messages; + * the caller is responsible for free reply->replies. + * + * @param ts + * The ts argument specifies how long we can wait for the peer(s) to reply. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. + */ +int __rte_experimental +rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, + const struct timespec *ts); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a request to the peer process and expect a reply in a separate callback. + * + * This function sends a request message to the peer process, and will not + * block. Instead, reply will be received in a separate callback. + * + * @param req + * The req argument contains the customized request message. + * + * @param ts + * The ts argument specifies how long we can wait for the peer(s) to reply. + * + * @param clb + * The callback to trigger when all responses for this request have arrived. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. + */ +int __rte_experimental +rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, + rte_mp_async_reply_t clb); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a reply to the peer process. + * + * This function will send a reply message in response to a request message + * received previously. + * + * @param msg + * The msg argument contains the customized message. + * + * @param peer + * The peer argument is the pointer to the peer socket path. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. 
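A sketch of a synchronous request over the primary/secondary channel using the declarations above (experimental API); "example_action" is a hypothetical action name that the peer is assumed to have registered via rte_mp_action_register():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <rte_eal.h>

static int query_peer(void)
{
	struct rte_mp_msg req;
	struct rte_mp_reply reply;
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
	int i;

	memset(&req, 0, sizeof(req));
	memset(&reply, 0, sizeof(reply));
	strcpy(req.name, "example_action"); /* fits within RTE_MP_MAX_NAME_LEN */

	if (rte_mp_request_sync(&req, &reply, &ts) < 0)
		return -1;

	for (i = 0; i < reply.nb_received; i++)
		printf("received reply %d of %d\n", i + 1, reply.nb_received);

	free(reply.msgs); /* the caller must free the reply array */
	return 0;
}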
+ */ +int __rte_experimental +rte_mp_reply(struct rte_mp_msg *msg, const char *peer); + +/** + * Usage function typedef used by the application usage function. + * + * Use this function typedef to define and call rte_set_application_usage_hook() + * routine. + */ +typedef void (*rte_usage_hook_t)(const char * prgname); + +/** + * Add application usage routine callout from the eal_usage() routine. + * + * This function allows the application to include its usage message + * in the EAL system usage message. The routine rte_set_application_usage_hook() + * needs to be called before the rte_eal_init() routine in the application. + * + * This routine is optional for the application and will behave as if the set + * routine was never called as the default behavior. + * + * @param usage_func + * The func argument is a function pointer to the application usage routine. + * Called function is defined using rte_usage_hook_t typedef, which is of + * the form void rte_usage_func(const char * prgname). + * + * Calling this routine with a NULL value will reset the usage hook routine and + * return the current value, which could be NULL. + * @return + * - Returns the current value of the rte_application_usage pointer to allow + * the caller to daisy chain the usage routines if needing more then one. + */ +rte_usage_hook_t +rte_set_application_usage_hook(rte_usage_hook_t usage_func); + +/** + * macro to get the lock of tailq in mem_config + */ +#define RTE_EAL_TAILQ_RWLOCK (&rte_eal_get_configuration()->mem_config->qlock) + +/** + * macro to get the multiple lock of mempool shared by mutiple-instance + */ +#define RTE_EAL_MEMPOOL_RWLOCK (&rte_eal_get_configuration()->mem_config->mplock) + +/** + * Whether EAL is using huge pages (disabled by --no-huge option). + * The no-huge mode cannot be used with UIO poll-mode drivers like igb/ixgbe. + * It is useful for NIC drivers (e.g. librte_pmd_mlx4, librte_pmd_vmxnet3) or + * crypto drivers (e.g. librte_crypto_nitrox) provided by third-parties such + * as 6WIND. + * + * @return + * Nonzero if hugepages are enabled. + */ +int rte_eal_has_hugepages(void); + +/** + * Whether EAL is using PCI bus. + * Disabled by --no-pci option. + * + * @return + * Nonzero if the PCI bus is enabled. + */ +int rte_eal_has_pci(void); + +/** + * Whether the EAL was asked to create UIO device. + * + * @return + * Nonzero if true. + */ +int rte_eal_create_uio_dev(void); + +/** + * The user-configured vfio interrupt mode. + * + * @return + * Interrupt mode configured with the command line, + * RTE_INTR_MODE_NONE by default. + */ +enum rte_intr_mode rte_eal_vfio_intr_mode(void); + +/** + * A wrap API for syscall gettid. + * + * @return + * On success, returns the thread ID of calling process. + * It is always successful. + */ +int rte_sys_gettid(void); + +/** + * Get system unique thread id. + * + * @return + * On success, returns the thread ID of calling process. + * It is always successful. + */ +static inline int rte_gettid(void) +{ + static RTE_DEFINE_PER_LCORE(int, _thread_id) = -1; + if (RTE_PER_LCORE(_thread_id) == -1) + RTE_PER_LCORE(_thread_id) = rte_sys_gettid(); + return RTE_PER_LCORE(_thread_id); +} + +/** + * Get the iova mode + * + * @return + * enum rte_iova_mode value. + */ +enum rte_iova_mode rte_eal_iova_mode(void); + +/** + * Get user provided pool ops name for mbuf + * + * @return + * returns user provided pool ops name. 
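A short sketch of the usage hook declared above; the hook has to be installed before rte_eal_init() so that the EAL usage output can include the application's own text:

#include <stdio.h>
#include <rte_eal.h>

static void app_usage(const char *prgname)
{
	printf("Usage: %s [EAL options] -- [application options]\n", prgname);
}

int main(int argc, char **argv)
{
	/* must be set up before rte_eal_init() parses the command line */
	rte_set_application_usage_hook(app_usage);

	if (rte_eal_init(argc, argv) < 0)
		return 1;
	return 0;
}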
+ */ +const char * +rte_eal_mbuf_user_pool_ops(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EAL_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal_interrupts.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal_interrupts.h new file mode 100644 index 00000000..6eb49327 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal_interrupts.h @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_INTERRUPTS_H_ +#error "don't include this file directly, please include generic " +#endif + +/** + * @file rte_eal_interrupts.h + * @internal + * + * Contains function prototypes exposed by the EAL for interrupt handling by + * drivers and other DPDK internal consumers. + */ + +#ifndef _RTE_EAL_INTERRUPTS_H_ +#define _RTE_EAL_INTERRUPTS_H_ + +#define RTE_MAX_RXTX_INTR_VEC_ID 32 +#define RTE_INTR_VEC_ZERO_OFFSET 0 +#define RTE_INTR_VEC_RXTX_OFFSET 1 + +/** + * The interrupt source type, e.g. UIO, VFIO, ALARM etc. + */ +enum rte_intr_handle_type { + RTE_INTR_HANDLE_UNKNOWN = 0, /**< generic unknown handle */ + RTE_INTR_HANDLE_UIO, /**< uio device handle */ + RTE_INTR_HANDLE_UIO_INTX, /**< uio generic handle */ + RTE_INTR_HANDLE_VFIO_LEGACY, /**< vfio device handle (legacy) */ + RTE_INTR_HANDLE_VFIO_MSI, /**< vfio device handle (MSI) */ + RTE_INTR_HANDLE_VFIO_MSIX, /**< vfio device handle (MSIX) */ + RTE_INTR_HANDLE_ALARM, /**< alarm handle */ + RTE_INTR_HANDLE_EXT, /**< external handler */ + RTE_INTR_HANDLE_VDEV, /**< virtual device */ + RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */ + RTE_INTR_HANDLE_MAX /**< count of elements */ +}; + +#define RTE_INTR_EVENT_ADD 1UL +#define RTE_INTR_EVENT_DEL 2UL + +typedef void (*rte_intr_event_cb_t)(int fd, void *arg); + +struct rte_epoll_data { + uint32_t event; /**< event type */ + void *data; /**< User data */ + rte_intr_event_cb_t cb_fun; /**< IN: callback fun */ + void *cb_arg; /**< IN: callback arg */ +}; + +enum { + RTE_EPOLL_INVALID = 0, + RTE_EPOLL_VALID, + RTE_EPOLL_EXEC, +}; + +/** interrupt epoll event obj, taken by epoll_event.ptr */ +struct rte_epoll_event { + volatile uint32_t status; /**< OUT: event status */ + int fd; /**< OUT: event fd */ + int epfd; /**< OUT: epoll instance the ev associated with */ + struct rte_epoll_data epdata; +}; + +/** Handle for interrupts. */ +struct rte_intr_handle { + RTE_STD_C11 + union { + int vfio_dev_fd; /**< VFIO device file descriptor */ + int uio_cfg_fd; /**< UIO cfg file desc for uio_pci_generic */ + }; + int fd; /**< interrupt event file descriptor */ + enum rte_intr_handle_type type; /**< handle type */ + uint32_t max_intr; /**< max interrupt requested */ + uint32_t nb_efd; /**< number of available efd(event fd) */ + uint8_t efd_counter_size; /**< size of efd counter, used for vdev */ + int efds[RTE_MAX_RXTX_INTR_VEC_ID]; /**< intr vectors/efds mapping */ + struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID]; + /**< intr vector epoll event */ + int *intr_vec; /**< intr vector number array */ +}; + +#define RTE_EPOLL_PER_THREAD -1 /**< to hint using per thread epfd */ + +/** + * It waits for events on the epoll instance. + * + * @param epfd + * Epoll instance fd on which the caller wait for events. + * @param events + * Memory area contains the events that will be available for the caller. + * @param maxevents + * Up to maxevents are returned, must greater than zero. + * @param timeout + * Specifying a timeout of -1 causes a block indefinitely. 
+ * Specifying a timeout equal to zero cause to return immediately. + * @return + * - On success, returns the number of available event. + * - On failure, a negative value. + */ +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout); + +/** + * It performs control operations on epoll instance referred by the epfd. + * It requests that the operation op be performed for the target fd. + * + * @param epfd + * Epoll instance fd on which the caller perform control operations. + * @param op + * The operation be performed for the target fd. + * @param fd + * The target fd on which the control ops perform. + * @param event + * Describes the object linked to the fd. + * Note: The caller must take care the object deletion after CTL_DEL. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event); + +/** + * The function returns the per thread epoll instance. + * + * @return + * epfd the epoll instance referred to. + */ +int +rte_intr_tls_epfd(void); + +/** + * @param intr_handle + * Pointer to the interrupt handle. + * @param epfd + * Epoll instance fd which the intr vector associated to. + * @param op + * The operation be performed for the vector. + * Operation type of {ADD, DEL}. + * @param vec + * RX intr vector number added to the epoll instance wait list. + * @param data + * User raw data. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, + int epfd, int op, unsigned int vec, void *data); + +/** + * It deletes registered eventfds. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle); + +/** + * It enables the packet I/O interrupt event if it's necessary. + * It creates event fd for each interrupt vector when MSIX is used, + * otherwise it multiplexes a single event fd. + * + * @param intr_handle + * Pointer to the interrupt handle. + * @param nb_efd + * Number of interrupt vector trying to enable. + * The value 0 is not allowed. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd); + +/** + * It disables the packet I/O interrupt event. + * It deletes registered eventfds and closes the open fds. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle); + +/** + * The packet I/O interrupt on datapath is enabled or not. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle); + +/** + * The interrupt handle instance allows other causes or not. + * Other causes stand for any none packet I/O interrupts. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle); + +/** + * The multiple interrupt vector capability of interrupt handle instance. + * It returns zero if no multiple interrupt vector support. + * + * @param intr_handle + * Pointer to the interrupt handle. 
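A sketch of arming one Rx interrupt vector and waiting on it with the epoll helpers above; it assumes the generic public wrapper header (rte_interrupts.h, which the #error at the top of this file points to) pulls in these declarations, and that intr_handle is obtained from the device's driver:

#include <stdint.h>
#include <rte_interrupts.h> /* assumed public wrapper for this internal header */

static int wait_rx_interrupt(struct rte_intr_handle *intr_handle,
		uint16_t queue_id)
{
	struct rte_epoll_event event;
	int ret;

	/* add the queue's interrupt vector to this thread's epoll instance */
	ret = rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
			RTE_INTR_EVENT_ADD,
			RTE_INTR_VEC_RXTX_OFFSET + queue_id, NULL);
	if (ret < 0)
		return ret;

	/* block for up to 100 ms waiting for one Rx interrupt event */
	return rte_epoll_wait(rte_intr_tls_epfd(), &event, 1, 100);
}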
+ */ +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle); + +#endif /* _RTE_EAL_INTERRUPTS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal_memconfig.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal_memconfig.h new file mode 100644 index 00000000..aff0688d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_eal_memconfig.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_EAL_MEMCONFIG_H_ +#define _RTE_EAL_MEMCONFIG_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * memseg list is a special case as we need to store a bunch of other data + * together with the array itself. + */ +struct rte_memseg_list { + RTE_STD_C11 + union { + void *base_va; + /**< Base virtual address for this memseg list. */ + uint64_t addr_64; + /**< Makes sure addr is always 64-bits */ + }; + int socket_id; /**< Socket ID for all memsegs in this list. */ + uint64_t page_sz; /**< Page size for all memsegs in this list. */ + volatile uint32_t version; /**< version number for multiprocess sync. */ + struct rte_fbarray memseg_arr; +}; + +/** + * the structure for the memory configuration for the RTE. + * Used by the rte_config structure. It is separated out, as for multi-process + * support, the memory details should be shared across instances + */ +struct rte_mem_config { + volatile uint32_t magic; /**< Magic number - Sanity check. */ + + /* memory topology */ + uint32_t nchannel; /**< Number of channels (0 if unknown). */ + uint32_t nrank; /**< Number of ranks (0 if unknown). */ + + /** + * current lock nest order + * - qlock->mlock (ring/hash/lpm) + * - mplock->qlock->mlock (mempool) + * Notice: + * *ALWAYS* obtain qlock first if having to obtain both qlock and mlock + */ + rte_rwlock_t mlock; /**< only used by memzone LIB for thread-safe. */ + rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */ + rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */ + + rte_rwlock_t memory_hotplug_lock; + /**< indicates whether memory hotplug request is in progress. */ + + /* memory segments and zones */ + struct rte_fbarray memzones; /**< Memzone descriptors. */ + + struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS]; + /**< list of dynamic arrays holding memsegs */ + + struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */ + + /* Heaps of Malloc per socket */ + struct malloc_heap malloc_heaps[RTE_MAX_NUMA_NODES]; + + /* address of mem_config in primary process. used to map shared config into + * exact same address the primary process maps it. 
+ */ + uint64_t mem_cfg_addr; +} __attribute__((__packed__)); + + +inline static void +rte_eal_mcfg_wait_complete(struct rte_mem_config* mcfg) +{ + /* wait until shared mem_config finish initialising */ + while(mcfg->magic != RTE_MAGIC) + rte_pause(); +} + +#ifdef __cplusplus +} +#endif + +#endif /*__RTE_EAL_MEMCONFIG_H_*/ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_errno.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_errno.h new file mode 100644 index 00000000..ba45591d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_errno.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * + * API for error cause tracking + */ + +#ifndef _RTE_ERRNO_H_ +#define _RTE_ERRNO_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +RTE_DECLARE_PER_LCORE(int, _rte_errno); /**< Per core error number. */ + +/** + * Error number value, stored per-thread, which can be queried after + * calls to certain functions to determine why those functions failed. + * + * Uses standard values from errno.h wherever possible, with a small number + * of additional possible values for RTE-specific conditions. + */ +#define rte_errno RTE_PER_LCORE(_rte_errno) + +/** + * Function which returns a printable string describing a particular + * error code. For non-RTE-specific error codes, this function returns + * the value from the libc strerror function. + * + * @param errnum + * The error number to be looked up - generally the value of rte_errno + * @return + * A pointer to a thread-local string containing the text describing + * the error. + */ +const char *rte_strerror(int errnum); + +#ifndef __ELASTERROR +/** + * Check if we have a defined value for the max system-defined errno values. + * if no max defined, start from 1000 to prevent overlap with standard values + */ +#define __ELASTERROR 1000 +#endif + +/** Error types */ +enum { + RTE_MIN_ERRNO = __ELASTERROR, /**< Start numbering above std errno vals */ + + E_RTE_SECONDARY, /**< Operation not allowed in secondary processes */ + E_RTE_NO_CONFIG, /**< Missing rte_config */ + + RTE_MAX_ERRNO /**< Max RTE error number */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ERRNO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_fbarray.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_fbarray.h new file mode 100644 index 00000000..5d880551 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_fbarray.h @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef RTE_FBARRAY_H +#define RTE_FBARRAY_H + +/** + * @file + * + * File-backed shared indexed array for DPDK. + * + * Basic workflow is expected to be the following: + * 1) Allocate array either using ``rte_fbarray_init()`` or + * ``rte_fbarray_attach()`` (depending on whether it's shared between + * multiple DPDK processes) + * 2) find free spots using ``rte_fbarray_find_next_free()`` + * 3) get pointer to data in the free spot using ``rte_fbarray_get()``, and + * copy data into the pointer (element size is fixed) + * 4) mark entry as used using ``rte_fbarray_set_used()`` + * + * Calls to ``rte_fbarray_init()`` and ``rte_fbarray_destroy()`` will have + * consequences for all processes, while calls to ``rte_fbarray_attach()`` and + * ``rte_fbarray_detach()`` will only have consequences within a single process. 
+ * Therefore, it is safe to call ``rte_fbarray_attach()`` or + * ``rte_fbarray_detach()`` while another process is using ``rte_fbarray``, + * provided no other thread within the same process will try to use + * ``rte_fbarray`` before attaching or after detaching. It is not safe to call + * ``rte_fbarray_init()`` or ``rte_fbarray_destroy()`` while another thread or + * another process is using ``rte_fbarray``. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include +#include + +#define RTE_FBARRAY_NAME_LEN 64 + +struct rte_fbarray { + char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */ + unsigned int count; /**< number of entries stored */ + unsigned int len; /**< current length of the array */ + unsigned int elt_sz; /**< size of each element */ + void *data; /**< data pointer */ + rte_rwlock_t rwlock; /**< multiprocess lock */ +}; + +/** + * Set up ``rte_fbarray`` structure and allocate underlying resources. + * + * Call this function to correctly set up ``rte_fbarray`` and allocate + * underlying files that will be backing the data in the current process. Note + * that in order to use and share ``rte_fbarray`` between multiple processes, + * data pointed to by ``arr`` pointer must itself be allocated in shared memory. + * + * @param arr + * Valid pointer to allocated ``rte_fbarray`` structure. + * + * @param name + * Unique name to be assigned to this array. + * + * @param len + * Number of elements initially available in the array. + * + * @param elt_sz + * Size of each element. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len, + unsigned int elt_sz); + + +/** + * Attach to a file backing an already allocated and correctly set up + * ``rte_fbarray`` structure. + * + * Call this function to attach to file that will be backing the data in the + * current process. The structure must have been previously correctly set up + * with a call to ``rte_fbarray_init()``. Calls to ``rte_fbarray_attach()`` are + * usually meant to be performed in a multiprocessing scenario, with data + * pointed to by ``arr`` pointer allocated in shared memory. + * + * @param arr + * Valid pointer to allocated and correctly set up rte_fbarray structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_attach(struct rte_fbarray *arr); + + +/** + * Deallocate resources for an already allocated and correctly set up + * ``rte_fbarray`` structure, and remove the underlying file. + * + * Call this function to deallocate all resources associated with an + * ``rte_fbarray`` structure within the current process. This will also + * zero-fill data pointed to by ``arr`` pointer and remove the underlying file + * backing the data, so it is expected that by the time this function is called, + * all other processes have detached from this ``rte_fbarray``. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_destroy(struct rte_fbarray *arr); + + +/** + * Deallocate resources for an already allocated and correctly set up + * ``rte_fbarray`` structure. 
+ * + * Call this function to deallocate all resources associated with an + * ``rte_fbarray`` structure within current process. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_detach(struct rte_fbarray *arr); + + +/** + * Get pointer to element residing at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Index of an element to get a pointer to. + * + * @return + * - non-NULL pointer on success. + * - NULL on failure, with ``rte_errno`` indicating reason for failure. + */ +void * __rte_experimental +rte_fbarray_get(const struct rte_fbarray *arr, unsigned int idx); + + +/** + * Find index of a specified element within the array. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param elt + * Pointer to element to find index to. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt); + + +/** + * Mark specified element as used. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to mark as used. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_set_used(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Mark specified element as free. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to mark as free. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_set_free(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Check whether element at specified index is marked as used. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to check as used. + * + * @return + * - 1 if element is used. + * - 0 if element is unused. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_is_used(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Find index of next free element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of next used element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. 
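The init/find/get/set_used workflow outlined at the top of this header might look like the sketch below for a process-private array; the element type, array name and length are illustrative only, and the array would normally be set up once at startup.

/* Sketch of the basic rte_fbarray workflow; names and sizes are made up. */
#include <stdint.h>
#include <string.h>
#include <rte_fbarray.h>

struct my_elt {
	uint64_t value;
};

/* 1) allocate the array once: 1024 fixed-size slots */
static int
table_setup(struct rte_fbarray *arr)
{
	return rte_fbarray_init(arr, "example_fbarray", 1024,
			sizeof(struct my_elt));
}

/* 2)-4) find a free slot, copy data in, mark it used */
static int
table_store(struct rte_fbarray *arr, const struct my_elt *src)
{
	int idx = rte_fbarray_find_next_free(arr, 0);
	struct my_elt *slot;

	if (idx < 0)
		return -1;

	slot = rte_fbarray_get(arr, (unsigned int)idx);
	if (slot == NULL)
		return -1;

	memcpy(slot, src, sizeof(*slot));
	return rte_fbarray_set_used(arr, (unsigned int)idx);
}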
+ */ +int __rte_experimental +rte_fbarray_find_next_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of next chunk of ``n`` free elements, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of free elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_next_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find index of next chunk of ``n`` used elements, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of used elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_next_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find how many more free entries there are, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_contig_free(struct rte_fbarray *arr, + unsigned int start); + + +/** + * Find how many more used entries there are, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start); + +/** + * Find index of previous free element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_prev_free(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of previous used element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_prev_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find lowest start index of chunk of ``n`` free elements, down from specified + * index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of free elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. 
+ */ +int __rte_experimental +rte_fbarray_find_prev_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find lowest start index of chunk of ``n`` used elements, down from specified + * index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of used elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_prev_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find how many more free entries there are before specified index (like + * ``rte_fbarray_find_contig_free`` but going in reverse). + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_rev_contig_free(struct rte_fbarray *arr, + unsigned int start); + + +/** + * Find how many more used entries there are before specified index (like + * ``rte_fbarray_find_contig_used`` but going in reverse). + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_rev_contig_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Dump ``rte_fbarray`` metadata. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param f + * File object to dump information into. + */ +void __rte_experimental +rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_FBARRAY_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_hexdump.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_hexdump.h new file mode 100644 index 00000000..2d03c089 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_hexdump.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_HEXDUMP_H_ +#define _RTE_HEXDUMP_H_ + +/** + * @file + * Simple API to dump out memory in a special hex format. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** +* Dump out memory in a special hex dump format. +* +* @param f +* A pointer to a file for output +* @param title +* If not NULL this string is printed as a header to the output. +* @param buf +* This is the buffer address to print out. +* @param len +* The number of bytes to dump out +* @return +* None. +*/ + +extern void +rte_hexdump(FILE *f, const char * title, const void * buf, unsigned int len); + +/** +* Dump out memory in a hex format with colons between bytes. +* +* @param f +* A pointer to a file for output +* @param title +* If not NULL this string is printed as a header to the output. +* @param buf +* This is the buffer address to print out. +* @param len +* The number of bytes to dump out +* @return +* None. 
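A short sketch of the two dump helpers described in rte_hexdump.h, writing an arbitrary buffer to stdout in both formats (rte_memdump() is declared just below); the buffer contents are illustrative.

/* Sketch: dump the same buffer in the plain and colon-separated formats. */
#include <stdint.h>
#include <stdio.h>
#include <rte_hexdump.h>

static void
dump_example(void)
{
	uint8_t buf[32];
	unsigned int i;

	for (i = 0; i < sizeof(buf); i++)
		buf[i] = (uint8_t)i;

	rte_hexdump(stdout, "offset/ASCII view", buf, sizeof(buf));
	rte_memdump(stdout, "colon-separated view", buf, sizeof(buf));
}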
+*/ + +void +rte_memdump(FILE *f, const char * title, const void * buf, unsigned int len); + + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_HEXDUMP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_hypervisor.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_hypervisor.h new file mode 100644 index 00000000..5fe719c1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_hypervisor.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#ifndef RTE_HYPERVISOR_H +#define RTE_HYPERVISOR_H + +/** + * @file + * Hypervisor awareness. + */ + +enum rte_hypervisor { + RTE_HYPERVISOR_NONE, + RTE_HYPERVISOR_KVM, + RTE_HYPERVISOR_HYPERV, + RTE_HYPERVISOR_VMWARE, + RTE_HYPERVISOR_UNKNOWN +}; + +/** + * Get the id of hypervisor it is running on. + */ +enum rte_hypervisor +rte_hypervisor_get(void); + +/** + * Get the name of a given hypervisor id. + */ +const char * +rte_hypervisor_get_name(enum rte_hypervisor id); + +#endif /* RTE_HYPERVISOR_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_interrupts.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_interrupts.h new file mode 100644 index 00000000..d751a637 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_interrupts.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_INTERRUPTS_H_ +#define _RTE_INTERRUPTS_H_ + +#include + +/** + * @file + * + * The RTE interrupt interface provides functions to register/unregister + * callbacks for a specific interrupt. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** Interrupt handle */ +struct rte_intr_handle; + +/** Function to be registered for the specific interrupt */ +typedef void (*rte_intr_callback_fn)(void *cb_arg); + +#include "rte_eal_interrupts.h" + +/** + * It registers the callback for the specific interrupt. Multiple + * callbacks cal be registered at the same time. + * @param intr_handle + * Pointer to the interrupt handle. + * @param cb + * callback address. + * @param cb_arg + * address of parameter for callback. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg); + +/** + * It unregisters the callback according to the specified interrupt handle. + * + * @param intr_handle + * pointer to the interrupt handle. + * @param cb + * callback address. + * @param cb_arg + * address of parameter for callback, (void *)-1 means to remove all + * registered which has the same callback address. + * + * @return + * - On success, return the number of callback entities removed. + * - On failure, a negative value. + */ +int rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg); + +/** + * It enables the interrupt for the specified handle. + * + * @param intr_handle + * pointer to the interrupt handle. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_intr_enable(const struct rte_intr_handle *intr_handle); + +/** + * It disables the interrupt for the specified handle. + * + * @param intr_handle + * pointer to the interrupt handle. + * + * @return + * - On success, zero. + * - On failure, a negative value. 
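Registering, arming and later removing an interrupt callback with this interface might look like the following sketch; the handle is assumed to come from a driver, and my_intr_cb/my_ctx are illustrative names.

/* Sketch only: intr_handle would be obtained from a device/driver. */
#include <rte_interrupts.h>

struct my_ctx {
	unsigned long events;
};

static void
my_intr_cb(void *cb_arg)
{
	struct my_ctx *ctx = cb_arg;

	ctx->events++;
}

static int
attach_intr(const struct rte_intr_handle *intr_handle, struct my_ctx *ctx)
{
	if (rte_intr_callback_register(intr_handle, my_intr_cb, ctx) < 0)
		return -1;

	/* arm the interrupt once the callback is in place */
	if (rte_intr_enable(intr_handle) < 0) {
		rte_intr_callback_unregister(intr_handle, my_intr_cb, ctx);
		return -1;
	}
	return 0;
}

static void
detach_intr(const struct rte_intr_handle *intr_handle, struct my_ctx *ctx)
{
	rte_intr_disable(intr_handle);
	/* passing the exact cb_arg removes only this registration */
	rte_intr_callback_unregister(intr_handle, my_intr_cb, ctx);
}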
+ */ +int rte_intr_disable(const struct rte_intr_handle *intr_handle); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_keepalive.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_keepalive.h new file mode 100644 index 00000000..e9f8f083 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_keepalive.h @@ -0,0 +1,170 @@ +/*- + * BSD LICENSE + * + * Copyright 2015-2016 Intel Shannon Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file rte_keepalive.h + * DPDK RTE LCore Keepalive Monitor. + * + **/ + +#ifndef _KEEPALIVE_H_ +#define _KEEPALIVE_H_ + +#include +#include + +#ifndef RTE_KEEPALIVE_MAXCORES +/** + * Number of cores to track. + * @note Must be larger than the highest core id. */ +#define RTE_KEEPALIVE_MAXCORES RTE_MAX_LCORE +#endif + +enum rte_keepalive_state { + RTE_KA_STATE_UNUSED = 0, + RTE_KA_STATE_ALIVE = 1, + RTE_KA_STATE_MISSING = 4, + RTE_KA_STATE_DEAD = 2, + RTE_KA_STATE_GONE = 3, + RTE_KA_STATE_DOZING = 5, + RTE_KA_STATE_SLEEP = 6 +}; + +/** + * Keepalive failure callback. + * + * Receives a data pointer passed to rte_keepalive_create() and the id of the + * failed core. + * @param data Data pointer passed to rte_keepalive_create() + * @param id_core ID of the core that has failed + */ +typedef void (*rte_keepalive_failure_callback_t)( + void *data, + const int id_core); + +/** + * Keepalive relay callback. + * + * Receives a data pointer passed to rte_keepalive_register_relay_callback(), + * the id of the core for which state is to be forwarded, and details of the + * current core state. 
+ * @param data Data pointer passed to rte_keepalive_register_relay_callback() + * @param id_core ID of the core for which state is being reported + * @param core_state The current state of the core + * @param Timestamp of when core was last seen alive + */ +typedef void (*rte_keepalive_relay_callback_t)( + void *data, + const int id_core, + enum rte_keepalive_state core_state, + uint64_t last_seen + ); + +/** + * Keepalive state structure. + * @internal + */ +struct rte_keepalive; + +/** + * Initialise keepalive sub-system. + * @param callback + * Function called upon detection of a dead core. + * @param data + * Data pointer to be passed to function callback. + * @return + * Keepalive structure success, NULL on failure. + */ +struct rte_keepalive *rte_keepalive_create( + rte_keepalive_failure_callback_t callback, + void *data); + +/** + * Checks & handles keepalive state of monitored cores. + * @param *ptr_timer Triggering timer (unused) + * @param *ptr_data Data pointer (keepalive structure) + */ +void rte_keepalive_dispatch_pings(void *ptr_timer, void *ptr_data); + +/** + * Registers a core for keepalive checks. + * @param *keepcfg + * Keepalive structure pointer + * @param id_core + * ID number of core to register. + */ +void rte_keepalive_register_core(struct rte_keepalive *keepcfg, + const int id_core); + +/** + * Per-core keepalive check. + * @param *keepcfg + * Keepalive structure pointer + * + * This function needs to be called from within the main process loop of + * the LCore to be checked. + */ +void +rte_keepalive_mark_alive(struct rte_keepalive *keepcfg); + +/** + * Per-core sleep-time indication. + * @param *keepcfg + * Keepalive structure pointer + * + * If CPU idling is enabled, this function needs to be called from within + * the main process loop of the LCore going to sleep, in order to avoid + * the LCore being mis-detected as dead. + */ +void +rte_keepalive_mark_sleep(struct rte_keepalive *keepcfg); + +/** + * Registers a 'live core' callback. + * + * The complement of the 'dead core' callback. This is called when a + * core is known to be alive, and is intended for cases when an app + * needs to know 'liveness' beyond just knowing when a core has died. + * + * @param *keepcfg + * Keepalive structure pointer + * @param callback + * Function called upon detection of a dead core. + * @param data + * Data pointer to be passed to function callback. + */ +void +rte_keepalive_register_relay_callback(struct rte_keepalive *keepcfg, + rte_keepalive_relay_callback_t callback, + void *data); + +#endif /* _KEEPALIVE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_launch.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_launch.h new file mode 100644 index 00000000..06a67175 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_launch.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LAUNCH_H_ +#define _RTE_LAUNCH_H_ + +/** + * @file + * + * Launch tasks on other lcores + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * State of an lcore. + */ +enum rte_lcore_state_t { + WAIT, /**< waiting a new command */ + RUNNING, /**< executing command */ + FINISHED, /**< command executed */ +}; + +/** + * Definition of a remote launch function. + */ +typedef int (lcore_function_t)(void *); + +/** + * Launch a function on another lcore. + * + * To be executed on the MASTER lcore only. 
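A keepalive setup might be wired together roughly as below: a failure handler passed to rte_keepalive_create(), mark_alive() called from the worker loop, and dispatch_pings() driven periodically from the monitoring core (in practice usually via a timer). All names other than the rte_keepalive_* calls are illustrative.

/* Sketch of keepalive wiring; dead_core_cb and the loop bodies are made up. */
#include <stdio.h>
#include <rte_keepalive.h>

static void
dead_core_cb(void *data, const int id_core)
{
	(void)data;
	fprintf(stderr, "lcore %d stopped responding\n", id_core);
}

static struct rte_keepalive *
keepalive_setup(unsigned int worker_lcore)
{
	struct rte_keepalive *ka = rte_keepalive_create(dead_core_cb, NULL);

	if (ka != NULL)
		rte_keepalive_register_core(ka, (int)worker_lcore);
	return ka;
}

/* called from inside the worker lcore's main loop */
static void
worker_iteration(struct rte_keepalive *ka)
{
	rte_keepalive_mark_alive(ka);
	/* ... one unit of work ... */
}

/* called periodically on the monitoring core; the timer argument is unused */
static void
monitor_tick(struct rte_keepalive *ka)
{
	rte_keepalive_dispatch_pings(NULL, ka);
}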
+ * + * Sends a message to a slave lcore (identified by the slave_id) that + * is in the WAIT state (this is true after the first call to + * rte_eal_init()). This can be checked by first calling + * rte_eal_wait_lcore(slave_id). + * + * When the remote lcore receives the message, it switches to + * the RUNNING state, then calls the function f with argument arg. Once the + * execution is done, the remote lcore switches to a FINISHED state and + * the return value of f is stored in a local variable to be read using + * rte_eal_wait_lcore(). + * + * The MASTER lcore returns as soon as the message is sent and knows + * nothing about the completion of f. + * + * Note: This function is not designed to offer optimum + * performance. It is just a practical way to launch a function on + * another lcore at initialization time. + * + * @param f + * The function to be called. + * @param arg + * The argument for the function. + * @param slave_id + * The identifier of the lcore on which the function should be executed. + * @return + * - 0: Success. Execution of function f started on the remote lcore. + * - (-EBUSY): The remote lcore is not in a WAIT state. + */ +int rte_eal_remote_launch(lcore_function_t *f, void *arg, unsigned slave_id); + +/** + * This enum indicates whether the master core must execute the handler + * launched on all logical cores. + */ +enum rte_rmt_call_master_t { + SKIP_MASTER = 0, /**< lcore handler not executed by master core. */ + CALL_MASTER, /**< lcore handler executed by master core. */ +}; + +/** + * Launch a function on all lcores. + * + * Check that each SLAVE lcore is in a WAIT state, then call + * rte_eal_remote_launch() for each lcore. + * + * @param f + * The function to be called. + * @param arg + * The argument for the function. + * @param call_master + * If call_master set to SKIP_MASTER, the MASTER lcore does not call + * the function. If call_master is set to CALL_MASTER, the function + * is also called on master before returning. In any case, the master + * lcore returns as soon as it finished its job and knows nothing + * about the completion of f on the other lcores. + * @return + * - 0: Success. Execution of function f started on all remote lcores. + * - (-EBUSY): At least one remote lcore is not in a WAIT state. In this + * case, no message is sent to any of the lcores. + */ +int rte_eal_mp_remote_launch(lcore_function_t *f, void *arg, + enum rte_rmt_call_master_t call_master); + +/** + * Get the state of the lcore identified by slave_id. + * + * To be executed on the MASTER lcore only. + * + * @param slave_id + * The identifier of the lcore. + * @return + * The state of the lcore. + */ +enum rte_lcore_state_t rte_eal_get_lcore_state(unsigned slave_id); + +/** + * Wait until an lcore finishes its job. + * + * To be executed on the MASTER lcore only. + * + * If the slave lcore identified by the slave_id is in a FINISHED state, + * switch to the WAIT state. If the lcore is in RUNNING state, wait until + * the lcore finishes its job and moves to the FINISHED state. + * + * @param slave_id + * The identifier of the lcore. + * @return + * - 0: If the lcore identified by the slave_id is in a WAIT state. + * - The value that was returned by the previous remote launch + * function call if the lcore identified by the slave_id was in a + * FINISHED or RUNNING state. In this case, it changes the state + * of the lcore to WAIT. + */ +int rte_eal_wait_lcore(unsigned slave_id); + +/** + * Wait until all lcores finish their jobs. 
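Putting rte_eal_remote_launch() and rte_eal_wait_lcore() together, a minimal launch-and-join sketch could look like this; hello() and the message are made up, and the slave id would normally come from rte_get_next_lcore().

/* Sketch: run hello() on one slave lcore and collect its return value. */
#include <stdio.h>
#include <rte_launch.h>
#include <rte_lcore.h>

static int
hello(void *arg)
{
	const char *msg = arg;

	printf("lcore %u says: %s\n", rte_lcore_id(), msg);
	return 0;
}

static int
run_on_slave(unsigned int slave_id)
{
	static char msg[] = "ping";

	/* the slave must be in WAIT state, e.g. right after rte_eal_init() */
	if (rte_eal_remote_launch(hello, msg, slave_id) != 0)
		return -1;	/* -EBUSY: lcore not in WAIT state */

	/* blocks until hello() returns, then moves the lcore back to WAIT */
	return rte_eal_wait_lcore(slave_id);
}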
+ * + * To be executed on the MASTER lcore only. Issue an + * rte_eal_wait_lcore() for every lcore. The return values are + * ignored. + * + * After a call to rte_eal_mp_wait_lcore(), the caller can assume + * that all slave lcores are in a WAIT state. + */ +void rte_eal_mp_wait_lcore(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LAUNCH_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_lcore.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_lcore.h new file mode 100644 index 00000000..6e09d918 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_lcore.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LCORE_H_ +#define _RTE_LCORE_H_ + +/** + * @file + * + * API for lcore and socket manipulation + * + */ +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define LCORE_ID_ANY UINT32_MAX /**< Any lcore. */ + +#if defined(__linux__) + typedef cpu_set_t rte_cpuset_t; +#elif defined(__FreeBSD__) +#include + typedef cpuset_t rte_cpuset_t; +#endif + +/** + * Structure storing internal configuration (per-lcore) + */ +struct lcore_config { + unsigned detected; /**< true if lcore was detected */ + pthread_t thread_id; /**< pthread identifier */ + int pipe_master2slave[2]; /**< communication pipe with master */ + int pipe_slave2master[2]; /**< communication pipe with master */ + lcore_function_t * volatile f; /**< function to call */ + void * volatile arg; /**< argument of function */ + volatile int ret; /**< return value of function */ + volatile enum rte_lcore_state_t state; /**< lcore state */ + unsigned socket_id; /**< physical socket id for this lcore */ + unsigned core_id; /**< core number on socket for this lcore */ + int core_index; /**< relative index, starting from 0 */ + rte_cpuset_t cpuset; /**< cpu set which the lcore affinity to */ + uint8_t core_role; /**< role of core eg: OFF, RTE, SERVICE */ +}; + +/** + * Internal configuration (per-lcore) + */ +extern struct lcore_config lcore_config[RTE_MAX_LCORE]; + +RTE_DECLARE_PER_LCORE(unsigned, _lcore_id); /**< Per thread "lcore id". */ +RTE_DECLARE_PER_LCORE(rte_cpuset_t, _cpuset); /**< Per thread "cpuset". */ + +/** + * Return the Application thread ID of the execution unit. + * + * Note: in most cases the lcore id returned here will also correspond + * to the processor id of the CPU on which the thread is pinned, this + * will not be the case if the user has explicitly changed the thread to + * core affinities using --lcores EAL argument e.g. --lcores '(0-3)@10' + * to run threads with lcore IDs 0, 1, 2 and 3 on physical core 10.. + * + * @return + * Logical core ID (in EAL thread) or LCORE_ID_ANY (in non-EAL thread) + */ +static inline unsigned +rte_lcore_id(void) +{ + return RTE_PER_LCORE(_lcore_id); +} + +/** + * Get the id of the master lcore + * + * @return + * the id of the master lcore + */ +static inline unsigned +rte_get_master_lcore(void) +{ + return rte_eal_get_configuration()->master_lcore; +} + +/** + * Return the number of execution units (lcores) on the system. + * + * @return + * the number of execution units (lcores) on the system. + */ +static inline unsigned +rte_lcore_count(void) +{ + const struct rte_config *cfg = rte_eal_get_configuration(); + return cfg->lcore_count; +} + +/** + * Return the index of the lcore starting from zero. + * + * When option -c or -l is given, the index corresponds + * to the order in the list. 
+ * For example: + * -c 0x30, lcore 4 has index 0, and 5 has index 1. + * -l 22,18 lcore 22 has index 0, and 18 has index 1. + * + * @param lcore_id + * The targeted lcore, or -1 for the current one. + * @return + * The relative index, or -1 if not enabled. + */ +static inline int +rte_lcore_index(int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) + return -1; + if (lcore_id < 0) + lcore_id = (int)rte_lcore_id(); + return lcore_config[lcore_id].core_index; +} + +/** + * Return the ID of the physical socket of the logical core we are + * running on. + * @return + * the ID of current lcoreid's physical socket + */ +unsigned rte_socket_id(void); + +/** + * Return number of physical sockets detected on the system. + * + * Note that number of nodes may not be correspondent to their physical id's: + * for example, a system may report two socket id's, but the actual socket id's + * may be 0 and 8. + * + * @return + * the number of physical sockets as recognized by EAL + */ +unsigned int __rte_experimental +rte_socket_count(void); + +/** + * Return socket id with a particular index. + * + * This will return socket id at a particular position in list of all detected + * physical socket id's. For example, on a machine with sockets [0, 8], passing + * 1 as a parameter will return 8. + * + * @param idx + * index of physical socket id to return + * + * @return + * - physical socket id as recognized by EAL + * - -1 on error, with errno set to EINVAL + */ +int __rte_experimental +rte_socket_id_by_idx(unsigned int idx); + +/** + * Get the ID of the physical socket of the specified lcore + * + * @param lcore_id + * the targeted lcore, which MUST be between 0 and RTE_MAX_LCORE-1. + * @return + * the ID of lcoreid's physical socket + */ +static inline unsigned +rte_lcore_to_socket_id(unsigned lcore_id) +{ + return lcore_config[lcore_id].socket_id; +} + +/** + * Test if an lcore is enabled. + * + * @param lcore_id + * The identifier of the lcore, which MUST be between 0 and + * RTE_MAX_LCORE-1. + * @return + * True if the given lcore is enabled; false otherwise. + */ +static inline int +rte_lcore_is_enabled(unsigned lcore_id) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + if (lcore_id >= RTE_MAX_LCORE) + return 0; + return cfg->lcore_role[lcore_id] == ROLE_RTE; +} + +/** + * Get the next enabled lcore ID. + * + * @param i + * The current lcore (reference). + * @param skip_master + * If true, do not return the ID of the master lcore. + * @param wrap + * If true, go back to 0 when RTE_MAX_LCORE is reached; otherwise, + * return RTE_MAX_LCORE. + * @return + * The next lcore_id or RTE_MAX_LCORE if not found. + */ +static inline unsigned +rte_get_next_lcore(unsigned i, int skip_master, int wrap) +{ + i++; + if (wrap) + i %= RTE_MAX_LCORE; + + while (i < RTE_MAX_LCORE) { + if (!rte_lcore_is_enabled(i) || + (skip_master && (i == rte_get_master_lcore()))) { + i++; + if (wrap) + i %= RTE_MAX_LCORE; + continue; + } + break; + } + return i; +} +/** + * Macro to browse all running lcores. + */ +#define RTE_LCORE_FOREACH(i) \ + for (i = rte_get_next_lcore(-1, 0, 0); \ + i +#include +#include +#include + +#include +#include + +struct rte_log_dynamic_type; + +/** The rte_log structure. */ +struct rte_logs { + uint32_t type; /**< Bitfield with enabled logs. */ + uint32_t level; /**< Log level. */ + FILE *file; /**< Output file set by rte_openlog_stream, or NULL. 
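A small sketch of iterating the enabled lcores with the helpers above, skipping the master and reporting each lcore's relative index and physical socket:

/* Sketch: walk each slave lcore once and print basic topology information. */
#include <stdio.h>
#include <rte_lcore.h>

static void
print_lcore_topology(void)
{
	unsigned int i;

	printf("master lcore: %u, total lcores: %u\n",
	       rte_get_master_lcore(), rte_lcore_count());

	/* skip_master=1, wrap=0: visit each enabled slave lcore exactly once */
	for (i = rte_get_next_lcore(-1, 1, 0); i < RTE_MAX_LCORE;
	     i = rte_get_next_lcore(i, 1, 0)) {
		printf("lcore %u: index %d, socket %u\n",
		       i, rte_lcore_index((int)i), rte_lcore_to_socket_id(i));
	}
}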
*/ + size_t dynamic_types_len; + struct rte_log_dynamic_type *dynamic_types; +}; + +/** Global log informations */ +extern struct rte_logs rte_logs; + +/* SDK log type */ +#define RTE_LOGTYPE_EAL 0 /**< Log related to eal. */ +#define RTE_LOGTYPE_MALLOC 1 /**< Log related to malloc. */ +#define RTE_LOGTYPE_RING 2 /**< Log related to ring. */ +#define RTE_LOGTYPE_MEMPOOL 3 /**< Log related to mempool. */ +#define RTE_LOGTYPE_TIMER 4 /**< Log related to timers. */ +#define RTE_LOGTYPE_PMD 5 /**< Log related to poll mode driver. */ +#define RTE_LOGTYPE_HASH 6 /**< Log related to hash table. */ +#define RTE_LOGTYPE_LPM 7 /**< Log related to LPM. */ +#define RTE_LOGTYPE_KNI 8 /**< Log related to KNI. */ +#define RTE_LOGTYPE_ACL 9 /**< Log related to ACL. */ +#define RTE_LOGTYPE_POWER 10 /**< Log related to power. */ +#define RTE_LOGTYPE_METER 11 /**< Log related to QoS meter. */ +#define RTE_LOGTYPE_SCHED 12 /**< Log related to QoS port scheduler. */ +#define RTE_LOGTYPE_PORT 13 /**< Log related to port. */ +#define RTE_LOGTYPE_TABLE 14 /**< Log related to table. */ +#define RTE_LOGTYPE_PIPELINE 15 /**< Log related to pipeline. */ +#define RTE_LOGTYPE_MBUF 16 /**< Log related to mbuf. */ +#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */ +#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */ +#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */ +#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */ + +/* these log types can be used in an application */ +#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */ +#define RTE_LOGTYPE_USER2 25 /**< User-defined log type 2. */ +#define RTE_LOGTYPE_USER3 26 /**< User-defined log type 3. */ +#define RTE_LOGTYPE_USER4 27 /**< User-defined log type 4. */ +#define RTE_LOGTYPE_USER5 28 /**< User-defined log type 5. */ +#define RTE_LOGTYPE_USER6 29 /**< User-defined log type 6. */ +#define RTE_LOGTYPE_USER7 30 /**< User-defined log type 7. */ +#define RTE_LOGTYPE_USER8 31 /**< User-defined log type 8. */ + +/** First identifier for extended logs */ +#define RTE_LOGTYPE_FIRST_EXT_ID 32 + +/* Can't use 0, as it gives compiler warnings */ +#define RTE_LOG_EMERG 1U /**< System is unusable. */ +#define RTE_LOG_ALERT 2U /**< Action must be taken immediately. */ +#define RTE_LOG_CRIT 3U /**< Critical conditions. */ +#define RTE_LOG_ERR 4U /**< Error conditions. */ +#define RTE_LOG_WARNING 5U /**< Warning conditions. */ +#define RTE_LOG_NOTICE 6U /**< Normal but significant condition. */ +#define RTE_LOG_INFO 7U /**< Informational. */ +#define RTE_LOG_DEBUG 8U /**< Debug-level messages. */ + +/** + * Change the stream that will be used by the logging system. + * + * This can be done at any time. The f argument represents the stream + * to be used to send the logs. If f is NULL, the default output is + * used (stderr). + * + * @param f + * Pointer to the stream. + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_openlog_stream(FILE *f); + +/** + * Set the global log level. + * + * After this call, logs with a level lower or equal than the level + * passed as argument will be displayed. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + */ +void rte_log_set_global_level(uint32_t level); + +/** + * Get the global log level. + * + * @return + * The current global log level. + */ +uint32_t rte_log_get_global_level(void); + +/** + * Get the log level for a given type. + * + * @param logtype + * The log type identifier. 
+ * @return + * 0 on success, a negative value if logtype is invalid. + */ +int rte_log_get_level(uint32_t logtype); + +/** + * Set the log level for a given type based on shell pattern. + * + * @param pattern + * The match pattern identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_pattern(const char *pattern, uint32_t level); + +/** + * Set the log level for a given type based on regular expression. + * + * @param regex + * The regular expression identifying the log type. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_regexp(const char *regex, uint32_t level); + +/** + * Set the log level for a given type. + * + * @param logtype + * The log type identifier. + * @param level + * The level to be set. + * @return + * 0 on success, a negative value if logtype or level is invalid. + */ +int rte_log_set_level(uint32_t logtype, uint32_t level); + +/** + * Get the current loglevel for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The loglevel of the message being processed. + */ +int rte_log_cur_msg_loglevel(void); + +/** + * Get the current logtype for the message being processed. + * + * Before calling the user-defined stream for logging, the log + * subsystem sets a per-lcore variable containing the loglevel and the + * logtype of the message being processed. This information can be + * accessed by the user-defined log output function through this + * function. + * + * @return + * The logtype of the message being processed. + */ +int rte_log_cur_msg_logtype(void); + +/** + * Register a dynamic log type + * + * If a log is already registered with the same type, the returned value + * is the same than the previous one. + * + * @param name + * The string identifying the log type. + * @return + * - >0: success, the returned value is the log type identifier. + * - (-ENOMEM): cannot allocate memory. + */ +int rte_log_register(const char *name); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register a dynamic log type and try to pick its level from EAL options + * + * rte_log_register() is called inside. If successful, the function tries + * to search for matching regexp in the list of EAL log level options and + * pick the level from the last matching entry. If nothing can be applied + * from the list, the level will be set to the user-defined default value. + * + * @param name + * Name for the log type to be registered + * @param level_def + * Fallback level to be set if the global list has no matching options + * @return + * - >=0: the newly registered log type + * - <0: rte_log_register() error value + */ +int rte_log_register_type_and_pick_level(const char *name, uint32_t level_def); + +/** + * Dump log information. + * + * Dump the global level and the registered log types. + * + * @param f + * The output stream where the dump should be sent. + */ +void rte_log_dump(FILE *f); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). 
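Registering a dynamic log type and emitting a message through it might look like the sketch below; the component name "myapp.init" is illustrative.

/* Sketch: dynamic log type registration plus one message through rte_log(). */
#include <stdint.h>
#include <rte_log.h>

static int myapp_logtype;

static void
myapp_log_init(void)
{
	myapp_logtype = rte_log_register("myapp.init");
	if (myapp_logtype < 0)
		return;

	/* show INFO and more severe messages for this type only */
	rte_log_set_level((uint32_t)myapp_logtype, RTE_LOG_INFO);

	rte_log(RTE_LOG_INFO, (uint32_t)myapp_logtype,
		"myapp: logging initialised (global level %u)\n",
		(unsigned int)rte_log_get_global_level());
}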
+ * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_log(uint32_t level, uint32_t logtype, const char *format, ...) +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __attribute__((cold)) +#endif +#endif + __attribute__((format(printf, 3, 4))); + +/** + * Generates a log message. + * + * The message will be sent in the stream defined by the previous call + * to rte_openlog_stream(). + * + * The level argument determines if the log should be displayed or + * not, depending on the global rte_logs variable. A trailing + * newline may be added if needed. + * + * The preferred alternative is the RTE_LOG() because it adds the + * level and type in the logged string. + * + * @param level + * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8). + * @param logtype + * The log type, for example, RTE_LOGTYPE_EAL. + * @param format + * The format string, as in printf(3), followed by the variable arguments + * required by the format. + * @param ap + * The va_list of the variable arguments required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +int rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) + __attribute__((format(printf,3,0))); + +/** + * Generates a log message. + * + * The RTE_LOG() is a helper that prefixes the string with the log level + * and type, and call rte_log(). + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG(l, t, ...) \ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) + +/** + * Generates a log message for data path. + * + * Similar to RTE_LOG(), except that it is removed at compilation time + * if the RTE_LOG_DP_LEVEL configuration option is lower than the log + * level argument. + * + * @param l + * Log level. A value between EMERG (1) and DEBUG (8). The short name is + * expanded by the macro, so it cannot be an integer value. + * @param t + * The log type, for example, EAL. The short name is expanded by the + * macro, so it cannot be an integer value. + * @param ... + * The fmt string, as in printf(3), followed by the variable arguments + * required by the format. + * @return + * - 0: Success. + * - Negative on error. + */ +#define RTE_LOG_DP(l, t, ...) \ + (void)((RTE_LOG_ ## l <= RTE_LOG_DP_LEVEL) ? 
\ + rte_log(RTE_LOG_ ## l, \ + RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) : \ + 0) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LOG_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_malloc.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_malloc.h new file mode 100644 index 00000000..a9fb7e45 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_malloc.h @@ -0,0 +1,330 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MALLOC_H_ +#define _RTE_MALLOC_H_ + +/** + * @file + * RTE Malloc. This library provides methods for dynamically allocating memory + * from hugepages. + */ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Structure to hold heap statistics obtained from rte_malloc_get_socket_stats function. + */ +struct rte_malloc_socket_stats { + size_t heap_totalsz_bytes; /**< Total bytes on heap */ + size_t heap_freesz_bytes; /**< Total free bytes on heap */ + size_t greatest_free_size; /**< Size in bytes of largest free block */ + unsigned free_count; /**< Number of free elements on heap */ + unsigned alloc_count; /**< Number of allocated elements on heap */ + size_t heap_allocsz_bytes; /**< Total allocated bytes on heap */ +}; + +/** + * This function allocates memory from the huge-page area of memory. The memory + * is not cleared. In NUMA systems, the memory allocated resides on the same + * NUMA socket as the core that calls this function. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param size + * Size (in bytes) to be allocated. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_malloc(const char *type, size_t size, unsigned align); + +/** + * Allocate zero'ed memory from the heap. + * + * Equivalent to rte_malloc() except that the memory zone is + * initialised with zeros. In NUMA systems, the memory allocated resides on the + * same NUMA socket as the core that calls this function. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param size + * Size (in bytes) to be allocated. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_zmalloc(const char *type, size_t size, unsigned align); + +/** + * Replacement function for calloc(), using huge-page memory. Memory area is + * initialised with zeros. 
In NUMA systems, the memory allocated resides on the + * same NUMA socket as the core that calls this function. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param num + * Number of elements to be allocated. + * @param size + * Size (in bytes) of a single element. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_calloc(const char *type, size_t num, size_t size, unsigned align); + +/** + * Replacement function for realloc(), using huge-page memory. Reserved area + * memory is resized, preserving contents. In NUMA systems, the new area + * resides on the same NUMA socket as the old area. + * + * @param ptr + * Pointer to already allocated memory + * @param size + * Size (in bytes) of new area. If this is 0, memory is freed. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the reallocated memory. + */ +void * +rte_realloc(void *ptr, size_t size, unsigned align); + +/** + * This function allocates memory from the huge-page area of memory. The memory + * is not cleared. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param size + * Size (in bytes) to be allocated. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @param socket + * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this function + * will behave the same as rte_malloc(). + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_malloc_socket(const char *type, size_t size, unsigned align, int socket); + +/** + * Allocate zero'ed memory from the heap. + * + * Equivalent to rte_malloc() except that the memory zone is + * initialised with zeros. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param size + * Size (in bytes) to be allocated. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. 
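A brief sketch of allocating, growing and releasing a hugepage-backed table with rte_zmalloc(), rte_realloc() and rte_free() (the latter is declared further down in this header); the element type and sizes are illustrative.

/* Sketch: zero-filled allocation, growth preserving contents, and release. */
#include <stddef.h>
#include <stdint.h>
#include <rte_malloc.h>

static uint32_t *
grow_table(uint32_t *tbl, size_t new_n)
{
	if (tbl == NULL)
		/* align 0: suitably aligned for any variable, zero-filled */
		return rte_zmalloc("example_table", new_n * sizeof(*tbl), 0);

	/* resized (possibly moved); existing contents are preserved */
	return rte_realloc(tbl, new_n * sizeof(*tbl), 0);
}

static void
free_table(uint32_t *tbl)
{
	rte_free(tbl);	/* a NULL pointer is accepted and ignored */
}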
In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @param socket + * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this function + * will behave the same as rte_zmalloc(). + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_zmalloc_socket(const char *type, size_t size, unsigned align, int socket); + +/** + * Replacement function for calloc(), using huge-page memory. Memory area is + * initialised with zeros. + * + * @param type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param num + * Number of elements to be allocated. + * @param size + * Size (in bytes) of a single element. + * @param align + * If 0, the return is a pointer that is suitably aligned for any kind of + * variable (in the same manner as malloc()). + * Otherwise, the return is a pointer that is a multiple of *align*. In + * this case, it must obviously be a power of two. (Minimum alignment is the + * cacheline size, i.e. 64-bytes) + * @param socket + * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this function + * will behave the same as rte_calloc(). + * @return + * - NULL on error. Not enough memory, or invalid arguments (size is 0, + * align is not a power of two). + * - Otherwise, the pointer to the allocated object. + */ +void * +rte_calloc_socket(const char *type, size_t num, size_t size, unsigned align, int socket); + +/** + * Frees the memory space pointed to by the provided pointer. + * + * This pointer must have been returned by a previous call to + * rte_malloc(), rte_zmalloc(), rte_calloc() or rte_realloc(). The behaviour of + * rte_free() is undefined if the pointer does not match this requirement. + * + * If the pointer is NULL, the function does nothing. + * + * @param ptr + * The pointer to memory to be freed. + */ +void +rte_free(void *ptr); + +/** + * If malloc debug is enabled, check a memory block for header + * and trailer markers to indicate that all is well with the block. + * If size is non-null, also return the size of the block. + * + * @param ptr + * pointer to the start of a data block, must have been returned + * by a previous call to rte_malloc(), rte_zmalloc(), rte_calloc() + * or rte_realloc() + * @param size + * if non-null, and memory block pointer is valid, returns the size + * of the memory block + * @return + * -1 on error, invalid pointer passed or header and trailer markers + * are missing or corrupted + * 0 on success + */ +int +rte_malloc_validate(const void *ptr, size_t *size); + +/** + * Get heap statistics for the specified heap. + * + * @param socket + * An unsigned integer specifying the socket to get heap statistics for + * @param socket_stats + * A structure which provides memory to store statistics + * @return + * Null on error + * Pointer to structure storing statistics on success + */ +int +rte_malloc_get_socket_stats(int socket, + struct rte_malloc_socket_stats *socket_stats); + +/** + * Dump statistics. + * + * Dump for the specified type to a file. If the type argument is + * NULL, all memory types will be dumped. + * + * @param f + * A pointer to a file for output + * @param type + * A string identifying the type of objects to dump, or NULL + * to dump all objects. 
+ */ +void +rte_malloc_dump_stats(FILE *f, const char *type); + +/** + * Dump contents of all malloc heaps to a file. + * + * @param f + * A pointer to a file for output + */ +void __rte_experimental +rte_malloc_dump_heaps(FILE *f); + +/** + * Set the maximum amount of allocated memory for this type. + * + * This is not yet implemented + * + * @param type + * A string identifying the type of allocated objects. + * @param max + * The maximum amount of allocated bytes for this type. + * @return + * - 0: Success. + * - (-1): Error. + */ +int +rte_malloc_set_limit(const char *type, size_t max); + +/** + * Return the IO address of a virtual address obtained through + * rte_malloc + * + * @param addr + * Address obtained from a previous rte_malloc call + * @return + * RTE_BAD_IOVA on error + * otherwise return an address suitable for IO + */ +rte_iova_t +rte_malloc_virt2iova(const void *addr); + +__rte_deprecated +static inline phys_addr_t +rte_malloc_virt2phy(const void *addr) +{ + return rte_malloc_virt2iova(addr); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MALLOC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_malloc_heap.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_malloc_heap.h new file mode 100644 index 00000000..d43fa909 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_malloc_heap.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MALLOC_HEAP_H_ +#define _RTE_MALLOC_HEAP_H_ + +#include +#include +#include +#include + +/* Number of free lists per heap, grouped by size. */ +#define RTE_HEAP_NUM_FREELISTS 13 + +/* dummy definition, for pointers */ +struct malloc_elem; + +/** + * Structure to hold malloc heap + */ +struct malloc_heap { + rte_spinlock_t lock; + LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS]; + struct malloc_elem *volatile first; + struct malloc_elem *volatile last; + + unsigned alloc_count; + size_t total_size; +} __rte_cache_aligned; + +#endif /* _RTE_MALLOC_HEAP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_memory.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_memory.h new file mode 100644 index 00000000..c4b7f4cf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_memory.h @@ -0,0 +1,506 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MEMORY_H_ +#define _RTE_MEMORY_H_ + +/** + * @file + * + * Memory-related RTE API. + */ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/* forward declaration for pointers */ +struct rte_memseg_list; + +__extension__ +enum rte_page_sizes { + RTE_PGSIZE_4K = 1ULL << 12, + RTE_PGSIZE_64K = 1ULL << 16, + RTE_PGSIZE_256K = 1ULL << 18, + RTE_PGSIZE_2M = 1ULL << 21, + RTE_PGSIZE_16M = 1ULL << 24, + RTE_PGSIZE_256M = 1ULL << 28, + RTE_PGSIZE_512M = 1ULL << 29, + RTE_PGSIZE_1G = 1ULL << 30, + RTE_PGSIZE_4G = 1ULL << 32, + RTE_PGSIZE_16G = 1ULL << 34, +}; + +#define SOCKET_ID_ANY -1 /**< Any NUMA socket. */ +#define RTE_CACHE_LINE_MASK (RTE_CACHE_LINE_SIZE-1) /**< Cache line mask. */ + +#define RTE_CACHE_LINE_ROUNDUP(size) \ + (RTE_CACHE_LINE_SIZE * ((size + RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE)) +/**< Return the first cache-aligned value greater or equal to size. 
*/ + +/**< Cache line size in terms of log2 */ +#if RTE_CACHE_LINE_SIZE == 64 +#define RTE_CACHE_LINE_SIZE_LOG2 6 +#elif RTE_CACHE_LINE_SIZE == 128 +#define RTE_CACHE_LINE_SIZE_LOG2 7 +#else +#error "Unsupported cache line size" +#endif + +#define RTE_CACHE_LINE_MIN_SIZE 64 /**< Minimum Cache line size. */ + +/** + * Force alignment to cache line. + */ +#define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE) + +/** + * Force minimum cache line alignment. + */ +#define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE) + +typedef uint64_t phys_addr_t; /**< Physical address. */ +#define RTE_BAD_PHYS_ADDR ((phys_addr_t)-1) +/** + * IO virtual address type. + * When the physical addressing mode (IOVA as PA) is in use, + * the translation from an IO virtual address (IOVA) to a physical address + * is a direct mapping, i.e. the same value. + * Otherwise, in virtual mode (IOVA as VA), an IOMMU may do the translation. + */ +typedef uint64_t rte_iova_t; +#define RTE_BAD_IOVA ((rte_iova_t)-1) + +/** + * Physical memory segment descriptor. + */ +#define RTE_MEMSEG_FLAG_DO_NOT_FREE (1 << 0) +/**< Prevent this segment from being freed back to the OS. */ +struct rte_memseg { + RTE_STD_C11 + union { + phys_addr_t phys_addr; /**< deprecated - Start physical address. */ + rte_iova_t iova; /**< Start IO address. */ + }; + RTE_STD_C11 + union { + void *addr; /**< Start virtual address. */ + uint64_t addr_64; /**< Makes sure addr is always 64 bits */ + }; + size_t len; /**< Length of the segment. */ + uint64_t hugepage_sz; /**< The pagesize of underlying memory */ + int32_t socket_id; /**< NUMA socket ID. */ + uint32_t nchannel; /**< Number of channels. */ + uint32_t nrank; /**< Number of ranks. */ + uint32_t flags; /**< Memseg-specific flags */ +} __rte_packed; + +/** + * Lock page in physical memory and prevent from swapping. + * + * @param virt + * The virtual address. + * @return + * 0 on success, negative on error. + */ +int rte_mem_lock_page(const void *virt); + +/** + * Get physical address of any mapped virtual address in the current process. + * It is found by browsing the /proc/self/pagemap special file. + * The page must be locked. + * + * @param virt + * The virtual address. + * @return + * The physical address or RTE_BAD_IOVA on error. + */ +phys_addr_t rte_mem_virt2phy(const void *virt); + +/** + * Get IO virtual address of any mapped virtual address in the current process. + * + * @param virt + * The virtual address. + * @return + * The IO address or RTE_BAD_IOVA on error. + */ +rte_iova_t rte_mem_virt2iova(const void *virt); + +/** + * Get virtual memory address corresponding to iova address. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param iova + * The iova address. + * @return + * Virtual address corresponding to iova address (or NULL if address does not + * exist within DPDK memory map). + */ +__rte_experimental void * +rte_mem_iova2virt(rte_iova_t iova); + +/** + * Get memseg to which a particular virtual address belongs. + * + * @param virt + * The virtual address. + * @param msl + * The memseg list in which to look up based on ``virt`` address + * (can be NULL). + * @return + * Memseg pointer on success, or NULL on error. + */ +__rte_experimental struct rte_memseg * +rte_mem_virt2memseg(const void *virt, const struct rte_memseg_list *msl); + +/** + * Get memseg list corresponding to virtual memory address. + * + * @param virt + * The virtual address. 
+ * @return + * Memseg list to which this virtual address belongs to. + */ +__rte_experimental struct rte_memseg_list * +rte_mem_virt2memseg_list(const void *virt); + +/** + * Memseg walk function prototype. + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_walk_t)(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg); + +/** + * Memseg contig walk function prototype. This will trigger a callback on every + * VA-contiguous are starting at memseg ``ms``, so total valid VA space at each + * callback call will be [``ms->addr``, ``ms->addr + len``). + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg); + +/** + * Memseg list walk function prototype. This will trigger a callback on every + * allocated memseg list. + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl, + void *arg); + +/** + * Walk list of all memsegs. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_walk(rte_memseg_walk_t func, void *arg); + +/** + * Walk each VA-contiguous area. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg); + +/** + * Walk each allocated memseg list. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg); + +/** + * Walk list of all memsegs without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg); + +/** + * Walk each VA-contiguous area without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. 
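+ *
+ * A walk callback sketch (illustrative; it sums the length of each
+ * VA-contiguous area):
+ * @code
+ * static int
+ * sum_contig(const struct rte_memseg_list *msl __rte_unused,
+ *		const struct rte_memseg *ms __rte_unused, size_t len, void *arg)
+ * {
+ *	*(size_t *)arg += len;
+ *	return 0; // continue the walk
+ * }
+ *
+ * size_t total = 0;
+ * rte_memseg_contig_walk_thread_unsafe(sum_contig, &total);
+ * @endcode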
+ * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg); + +/** + * Walk each allocated memseg list without performing any locking. + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg); + +/** + * Dump the physical memory layout to a file. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param f + * A pointer to a file for output + */ +void rte_dump_physmem_layout(FILE *f); + +/** + * Get the total amount of available physical memory. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @return + * The total amount of available physical memory in bytes. + */ +uint64_t rte_eal_get_physmem_size(void); + +/** + * Get the number of memory channels. + * + * @return + * The number of memory channels on the system. The value is 0 if unknown + * or not the same on all devices. + */ +unsigned rte_memory_get_nchannel(void); + +/** + * Get the number of memory ranks. + * + * @return + * The number of memory ranks on the system. The value is 0 if unknown or + * not the same on all devices. + */ +unsigned rte_memory_get_nrank(void); + +/** + * Drivers based on uio will not load unless physical + * addresses are obtainable. It is only possible to get + * physical addresses when running as a privileged user. + * + * @return + * 1 if the system is able to obtain physical addresses. + * 0 if using DMA addresses through an IOMMU. + */ +int rte_eal_using_phys_addrs(void); + + +/** + * Enum indicating which kind of memory event has happened. Used by callbacks to + * distinguish between memory allocations and deallocations. + */ +enum rte_mem_event { + RTE_MEM_EVENT_ALLOC = 0, /**< Allocation event. */ + RTE_MEM_EVENT_FREE, /**< Deallocation event. */ +}; +#define RTE_MEM_EVENT_CALLBACK_NAME_LEN 64 +/**< maximum length of callback name */ + +/** + * Function typedef used to register callbacks for memory events. + */ +typedef void (*rte_mem_event_callback_t)(enum rte_mem_event event_type, + const void *addr, size_t len, void *arg); + +/** + * Function used to register callbacks for memory events. + * + * @note callbacks will happen while memory hotplug subsystem is write-locked, + * therefore some functions (e.g. `rte_memseg_walk()`) will cause a + * deadlock when called from within such callbacks. + * + * @note mem event callbacks not being supported is an expected error condition, + * so user code needs to handle this situation. In these cases, return + * value will be -1, and rte_errno will be set to ENOTSUP. + * + * @param name + * Name associated with specified callback to be added to the list. + * + * @param clb + * Callback function pointer. + * + * @param arg + * Argument to pass to the callback. 
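+ *
+ * A registration sketch (the callback and the "my-app" name are illustrative):
+ * @code
+ * static void
+ * mem_event_cb(enum rte_mem_event event_type, const void *addr, size_t len,
+ *		void *arg __rte_unused)
+ * {
+ *	printf("%s: %zu bytes at %p\n",
+ *		event_type == RTE_MEM_EVENT_ALLOC ? "alloc" : "free", len, addr);
+ * }
+ *
+ * if (rte_mem_event_callback_register("my-app", mem_event_cb, NULL) < 0 &&
+ *		rte_errno == ENOTSUP)
+ *	printf("memory event callbacks not supported\n");
+ * @endcode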
+ * + * @return + * 0 on successful callback register + * -1 on unsuccessful callback register, with rte_errno value indicating + * reason for failure. + */ +int __rte_experimental +rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb, + void *arg); + +/** + * Function used to unregister callbacks for memory events. + * + * @param name + * Name associated with specified callback to be removed from the list. + * + * @param arg + * Argument to look for among callbacks with specified callback name. + * + * @return + * 0 on successful callback unregister + * -1 on unsuccessful callback unregister, with rte_errno value indicating + * reason for failure. + */ +int __rte_experimental +rte_mem_event_callback_unregister(const char *name, void *arg); + + +#define RTE_MEM_ALLOC_VALIDATOR_NAME_LEN 64 +/**< maximum length of alloc validator name */ +/** + * Function typedef used to register memory allocation validation callbacks. + * + * Returning 0 will allow allocation attempt to continue. Returning -1 will + * prevent allocation from succeeding. + */ +typedef int (*rte_mem_alloc_validator_t)(int socket_id, + size_t cur_limit, size_t new_len); + +/** + * @brief Register validator callback for memory allocations. + * + * Callbacks registered by this function will be called right before memory + * allocator is about to trigger allocation of more pages from the system if + * said allocation will bring total memory usage above specified limit on + * specified socket. User will be able to cancel pending allocation if callback + * returns -1. + * + * @note callbacks will happen while memory hotplug subsystem is write-locked, + * therefore some functions (e.g. `rte_memseg_walk()`) will cause a + * deadlock when called from within such callbacks. + * + * @note validator callbacks not being supported is an expected error condition, + * so user code needs to handle this situation. In these cases, return + * value will be -1, and rte_errno will be set to ENOTSUP. + * + * @param name + * Name associated with specified callback to be added to the list. + * + * @param clb + * Callback function pointer. + * + * @param socket_id + * Socket ID on which to watch for allocations. + * + * @param limit + * Limit above which to trigger callbacks. + * + * @return + * 0 on successful callback register + * -1 on unsuccessful callback register, with rte_errno value indicating + * reason for failure. + */ +int __rte_experimental +rte_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit); + +/** + * @brief Unregister validator callback for memory allocations. + * + * @param name + * Name associated with specified callback to be removed from the list. + * + * @param socket_id + * Socket ID on which to watch for allocations. + * + * @return + * 0 on successful callback unregister + * -1 on unsuccessful callback unregister, with rte_errno value indicating + * reason for failure. 
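+ *
+ * A paired register/unregister sketch (the name, socket 0 and the 1 GB limit
+ * are illustrative):
+ * @code
+ * static int
+ * limit_cb(int socket_id __rte_unused, size_t cur_limit __rte_unused,
+ *		size_t new_len __rte_unused)
+ * {
+ *	return -1; // refuse allocations that would exceed the limit
+ * }
+ *
+ * rte_mem_alloc_validator_register("my-limit", limit_cb, 0, 1UL << 30);
+ * // ... application runs ...
+ * rte_mem_alloc_validator_unregister("my-limit", 0);
+ * @endcode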
+ */ +int __rte_experimental +rte_mem_alloc_validator_unregister(const char *name, int socket_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMORY_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_memzone.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_memzone.h new file mode 100644 index 00000000..f478fa9e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_memzone.h @@ -0,0 +1,320 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_MEMZONE_H_ +#define _RTE_MEMZONE_H_ + +/** + * @file + * RTE Memzone + * + * The goal of the memzone allocator is to reserve contiguous + * portions of physical memory. These zones are identified by a name. + * + * The memzone descriptors are shared by all partitions and are + * located in a known place of physical memory. This zone is accessed + * using rte_eal_get_configuration(). The lookup (by name) of a + * memory zone can be done in any partition and returns the same + * physical address. + * + * A reserved memory zone cannot be unreserved. The reservation shall + * be done at initialization time only. + */ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_MEMZONE_2MB 0x00000001 /**< Use 2MB pages. */ +#define RTE_MEMZONE_1GB 0x00000002 /**< Use 1GB pages. */ +#define RTE_MEMZONE_16MB 0x00000100 /**< Use 16MB pages. */ +#define RTE_MEMZONE_16GB 0x00000200 /**< Use 16GB pages. */ +#define RTE_MEMZONE_256KB 0x00010000 /**< Use 256KB pages. */ +#define RTE_MEMZONE_256MB 0x00020000 /**< Use 256MB pages. */ +#define RTE_MEMZONE_512MB 0x00040000 /**< Use 512MB pages. */ +#define RTE_MEMZONE_4GB 0x00080000 /**< Use 4GB pages. */ +#define RTE_MEMZONE_SIZE_HINT_ONLY 0x00000004 /**< Use available page size */ +#define RTE_MEMZONE_IOVA_CONTIG 0x00100000 /**< Ask for IOVA-contiguous memzone. */ + +/** + * A structure describing a memzone, which is a contiguous portion of + * physical memory identified by a name. + */ +struct rte_memzone { + +#define RTE_MEMZONE_NAMESIZE 32 /**< Maximum length of memory zone name.*/ + char name[RTE_MEMZONE_NAMESIZE]; /**< Name of the memory zone. */ + + RTE_STD_C11 + union { + phys_addr_t phys_addr; /**< deprecated - Start physical address. */ + rte_iova_t iova; /**< Start IO address. */ + }; + RTE_STD_C11 + union { + void *addr; /**< Start virtual address. */ + uint64_t addr_64; /**< Makes sure addr is always 64-bits */ + }; + size_t len; /**< Length of the memzone. */ + + uint64_t hugepage_sz; /**< The page size of underlying memory */ + + int32_t socket_id; /**< NUMA socket ID. */ + + uint32_t flags; /**< Characteristics of this memzone. */ +} __attribute__((__packed__)); + +/** + * Reserve a portion of physical memory. + * + * This function reserves some memory and returns a pointer to a + * correctly filled memzone descriptor. If the allocation cannot be + * done, return NULL. + * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note: When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather biggest + * memzone available on socket id corresponding to an lcore from which + * reservation was called. 
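+ *
+ * A reservation sketch (the zone name and size are illustrative):
+ * @code
+ * const struct rte_memzone *mz;
+ *
+ * mz = rte_memzone_reserve("hw_ring_mz", 4096, rte_socket_id(),
+ *	RTE_MEMZONE_IOVA_CONTIG);
+ * if (mz == NULL)
+ *	printf("reserve failed: %s\n", rte_strerror(rte_errno));
+ * @endcode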
+ * + * @param name + * The name of the memzone. If it already exists, the function will + * fail and return NULL. + * @param len + * The size of the memory to be reserved. If it + * is 0, the biggest contiguous zone will be reserved. + * @param socket_id + * The socket identifier in the case of + * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA + * constraint for the reserved zone. + * @param flags + * The flags parameter is used to request memzones to be + * taken from specifically sized hugepages. + * - RTE_MEMZONE_2MB - Reserved from 2MB pages + * - RTE_MEMZONE_1GB - Reserved from 1GB pages + * - RTE_MEMZONE_16MB - Reserved from 16MB pages + * - RTE_MEMZONE_16GB - Reserved from 16GB pages + * - RTE_MEMZONE_256KB - Reserved from 256KB pages + * - RTE_MEMZONE_256MB - Reserved from 256MB pages + * - RTE_MEMZONE_512MB - Reserved from 512MB pages + * - RTE_MEMZONE_4GB - Reserved from 4GB pages + * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if + * the requested page size is unavailable. + * If this flag is not set, the function + * will return error on an unavailable size + * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. + * @return + * A pointer to a correctly-filled read-only memzone descriptor, or NULL + * on error. + * On error case, rte_errno will be set appropriately: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + * - EINVAL - invalid parameters + */ +const struct rte_memzone *rte_memzone_reserve(const char *name, + size_t len, int socket_id, + unsigned flags); + +/** + * Reserve a portion of physical memory with alignment on a specified + * boundary. + * + * This function reserves some memory with alignment on a specified + * boundary, and returns a pointer to a correctly filled memzone + * descriptor. If the allocation cannot be done or if the alignment + * is not a power of 2, returns NULL. + * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note: When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather biggest + * memzone available on socket id corresponding to an lcore from which + * reservation was called. + * + * @param name + * The name of the memzone. If it already exists, the function will + * fail and return NULL. + * @param len + * The size of the memory to be reserved. If it + * is 0, the biggest contiguous zone will be reserved. + * @param socket_id + * The socket identifier in the case of + * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA + * constraint for the reserved zone. + * @param flags + * The flags parameter is used to request memzones to be + * taken from specifically sized hugepages. 
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages + * - RTE_MEMZONE_1GB - Reserved from 1GB pages + * - RTE_MEMZONE_16MB - Reserved from 16MB pages + * - RTE_MEMZONE_16GB - Reserved from 16GB pages + * - RTE_MEMZONE_256KB - Reserved from 256KB pages + * - RTE_MEMZONE_256MB - Reserved from 256MB pages + * - RTE_MEMZONE_512MB - Reserved from 512MB pages + * - RTE_MEMZONE_4GB - Reserved from 4GB pages + * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if + * the requested page size is unavailable. + * If this flag is not set, the function + * will return error on an unavailable size + * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. + * @param align + * Alignment for resulting memzone. Must be a power of 2. + * @return + * A pointer to a correctly-filled read-only memzone descriptor, or NULL + * on error. + * On error case, rte_errno will be set appropriately: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + * - EINVAL - invalid parameters + */ +const struct rte_memzone *rte_memzone_reserve_aligned(const char *name, + size_t len, int socket_id, + unsigned flags, unsigned align); + +/** + * Reserve a portion of physical memory with specified alignment and + * boundary. + * + * This function reserves some memory with specified alignment and + * boundary, and returns a pointer to a correctly filled memzone + * descriptor. If the allocation cannot be done or if the alignment + * or boundary are not a power of 2, returns NULL. + * Memory buffer is reserved in a way, that it wouldn't cross specified + * boundary. That implies that requested length should be less or equal + * then boundary. + * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note: When reserving memzones with len set to 0, it is preferable to also + * set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but + * will likely not yield expected results. Specifically, the resulting memzone + * may not necessarily be the biggest memzone available, but rather biggest + * memzone available on socket id corresponding to an lcore from which + * reservation was called. + * + * @param name + * The name of the memzone. If it already exists, the function will + * fail and return NULL. + * @param len + * The size of the memory to be reserved. If it + * is 0, the biggest contiguous zone will be reserved. + * @param socket_id + * The socket identifier in the case of + * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA + * constraint for the reserved zone. + * @param flags + * The flags parameter is used to request memzones to be + * taken from specifically sized hugepages. 
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages + * - RTE_MEMZONE_1GB - Reserved from 1GB pages + * - RTE_MEMZONE_16MB - Reserved from 16MB pages + * - RTE_MEMZONE_16GB - Reserved from 16GB pages + * - RTE_MEMZONE_256KB - Reserved from 256KB pages + * - RTE_MEMZONE_256MB - Reserved from 256MB pages + * - RTE_MEMZONE_512MB - Reserved from 512MB pages + * - RTE_MEMZONE_4GB - Reserved from 4GB pages + * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if + * the requested page size is unavailable. + * If this flag is not set, the function + * will return error on an unavailable size + * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. + * @param align + * Alignment for resulting memzone. Must be a power of 2. + * @param bound + * Boundary for resulting memzone. Must be a power of 2 or zero. + * Zero value implies no boundary condition. + * @return + * A pointer to a correctly-filled read-only memzone descriptor, or NULL + * on error. + * On error case, rte_errno will be set appropriately: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + * - EINVAL - invalid parameters + */ +const struct rte_memzone *rte_memzone_reserve_bounded(const char *name, + size_t len, int socket_id, + unsigned flags, unsigned align, unsigned bound); + +/** + * Free a memzone. + * + * @param mz + * A pointer to the memzone + * @return + * -EINVAL - invalid parameter. + * 0 - success + */ +int rte_memzone_free(const struct rte_memzone *mz); + +/** + * Lookup for a memzone. + * + * Get a pointer to a descriptor of an already reserved memory + * zone identified by the name given as an argument. + * + * @param name + * The name of the memzone. + * @return + * A pointer to a read-only memzone descriptor. + */ +const struct rte_memzone *rte_memzone_lookup(const char *name); + +/** + * Dump all reserved memzones to a file. 
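+ *
+ * Typically used for debugging, e.g.:
+ * @code
+ * rte_memzone_dump(stdout);
+ * @endcode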
+ * + * @param f + * A pointer to a file for output + */ +void rte_memzone_dump(FILE *f); + +/** + * Walk list of all memzones + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + */ +void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *arg), + void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMZONE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_pci_dev_feature_defs.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_pci_dev_feature_defs.h new file mode 100644 index 00000000..e12c2208 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_pci_dev_feature_defs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0) + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_PCI_DEV_DEFS_H_ +#define _RTE_PCI_DEV_DEFS_H_ + +/* interrupt mode */ +enum rte_intr_mode { + RTE_INTR_MODE_NONE = 0, + RTE_INTR_MODE_LEGACY, + RTE_INTR_MODE_MSI, + RTE_INTR_MODE_MSIX +}; + +#endif /* _RTE_PCI_DEV_DEFS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_pci_dev_features.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_pci_dev_features.h new file mode 100644 index 00000000..6104123d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_pci_dev_features.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0) + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_PCI_DEV_FEATURES_H +#define _RTE_PCI_DEV_FEATURES_H + +#include + +#define RTE_INTR_MODE_NONE_NAME "none" +#define RTE_INTR_MODE_LEGACY_NAME "legacy" +#define RTE_INTR_MODE_MSI_NAME "msi" +#define RTE_INTR_MODE_MSIX_NAME "msix" + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_per_lcore.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_per_lcore.h new file mode 100644 index 00000000..eaedf0cb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_per_lcore.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_PER_LCORE_H_ +#define _RTE_PER_LCORE_H_ + +/** + * @file + * + * Per-lcore variables in RTE + * + * This file defines an API for instantiating per-lcore "global + * variables" that are environment-specific. Note that in all + * environments, a "shared variable" is the default when you use a + * global variable. + * + * Parts of this are execution environment specific. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * Macro to define a per lcore variable "var" of type "type", don't + * use keywords like "static" or "volatile" in type, just prefix the + * whole macro. 
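+ *
+ * For example (the variable name is illustrative):
+ * @code
+ * RTE_DEFINE_PER_LCORE(int, my_counter);
+ *
+ * static void bump(void)
+ * {
+ *	RTE_PER_LCORE(my_counter)++;
+ * }
+ * @endcode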
+ */ +#define RTE_DEFINE_PER_LCORE(type, name) \ + __thread __typeof__(type) per_lcore_##name + +/** + * Macro to declare an extern per lcore variable "var" of type "type" + */ +#define RTE_DECLARE_PER_LCORE(type, name) \ + extern __thread __typeof__(type) per_lcore_##name + +/** + * Read/write the per-lcore variable value + */ +#define RTE_PER_LCORE(name) (per_lcore_##name) + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PER_LCORE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_random.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_random.h new file mode 100644 index 00000000..b2ca1c20 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_random.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_RANDOM_H_ +#define _RTE_RANDOM_H_ + +/** + * @file + * + * Pseudo-random Generators in RTE + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/** + * Seed the pseudo-random generator. + * + * The generator is automatically seeded by the EAL init with a timer + * value. It may need to be re-seeded by the user with a real random + * value. + * + * @param seedval + * The value of the seed. + */ +static inline void +rte_srand(uint64_t seedval) +{ + srand48((long)seedval); +} + +/** + * Get a pseudo-random value. + * + * This function generates pseudo-random numbers using the linear + * congruential algorithm and 48-bit integer arithmetic, called twice + * to generate a 64-bit value. + * + * @return + * A pseudo-random value between 0 and (1<<64)-1. + */ +static inline uint64_t +rte_rand(void) +{ + uint64_t val; + val = (uint64_t)lrand48(); + val <<= 32; + val += (uint64_t)lrand48(); + return val; +} + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_RANDOM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_reciprocal.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_reciprocal.h new file mode 100644 index 00000000..3492c73b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_reciprocal.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ +/* + * Reciprocal divide + * + * Used with permission from original authors + * Hannes Frederic Sowa and Daniel Borkmann + * + * This algorithm is based on the paper "Division by Invariant + * Integers Using Multiplication" by Torbjörn Granlund and Peter + * L. Montgomery. + * + * The assembler implementation from Agner Fog, which this code is + * based on, can be found here: + * http://www.agner.org/optimize/asmlib.zip + * + * This optimization for A/B is helpful if the divisor B is mostly + * runtime invariant. The reciprocal of B is calculated in the + * slow-path with reciprocal_value(). The fast-path can then just use + * a much faster multiplication operation with a variable dividend A + * to calculate the division A/B. 
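+ *
+ * A usage sketch (the divisor 1000 is illustrative; x is any uint32_t
+ * dividend):
+ *
+ *	struct rte_reciprocal div = rte_reciprocal_value(1000);
+ *	uint32_t q = rte_reciprocal_divide(x, div);	// q == x / 1000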
+ */ + +#ifndef _RTE_RECIPROCAL_H_ +#define _RTE_RECIPROCAL_H_ + +#include + +struct rte_reciprocal { + uint32_t m; + uint8_t sh1, sh2; +}; + +struct rte_reciprocal_u64 { + uint64_t m; + uint8_t sh1, sh2; +}; + +static inline uint32_t rte_reciprocal_divide(uint32_t a, struct rte_reciprocal R) +{ + uint32_t t = (uint32_t)(((uint64_t)a * R.m) >> 32); + + return (t + ((a - t) >> R.sh1)) >> R.sh2; +} + +static __rte_always_inline uint64_t +mullhi_u64(uint64_t x, uint64_t y) +{ +#ifdef __SIZEOF_INT128__ + __uint128_t xl = x; + __uint128_t rl = xl * y; + + return (rl >> 64); +#else + uint64_t u0, u1, v0, v1, k, t; + uint64_t w1, w2; + uint64_t whi; + + u1 = x >> 32; u0 = x & 0xFFFFFFFF; + v1 = y >> 32; v0 = y & 0xFFFFFFFF; + + t = u0*v0; + k = t >> 32; + + t = u1*v0 + k; + w1 = t & 0xFFFFFFFF; + w2 = t >> 32; + + t = u0*v1 + w1; + k = t >> 32; + + whi = u1*v1 + w2 + k; + + return whi; +#endif +} + +static __rte_always_inline uint64_t +rte_reciprocal_divide_u64(uint64_t a, struct rte_reciprocal_u64 *R) +{ + uint64_t t = mullhi_u64(a, R->m); + + return (t + ((a - t) >> R->sh1)) >> R->sh2; +} + +struct rte_reciprocal rte_reciprocal_value(uint32_t d); +struct rte_reciprocal_u64 rte_reciprocal_value_u64(uint64_t d); + +#endif /* _RTE_RECIPROCAL_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_service.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_service.h new file mode 100644 index 00000000..34b41aff --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_service.h @@ -0,0 +1,427 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_SERVICE_H_ +#define _RTE_SERVICE_H_ + +/** + * @file + * + * Service functions + * + * The service functionality provided by this header allows a DPDK component + * to indicate that it requires a function call in order for it to perform + * its processing. + * + * An example usage of this functionality would be a component that registers + * a service to perform a particular packet processing duty: for example the + * eventdev software PMD. At startup the application requests all services + * that have been registered, and the cores in the service-coremask run the + * required services. The EAL removes these number of cores from the available + * runtime cores, and dedicates them to performing service-core workloads. The + * application has access to the remaining lcores as normal. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include +#include + +#define RTE_SERVICE_NAME_MAX 32 + +/* Capabilities of a service. + * + * Use the *rte_service_probe_capability* function to check if a service is + * capable of a specific capability. + */ +/** When set, the service is capable of having multiple threads run it at the + * same time. + */ +#define RTE_SERVICE_CAP_MT_SAFE (1 << 0) + +/** + * Return the number of services registered. + * + * The number of services registered can be passed to *rte_service_get_by_id*, + * enabling the application to retrieve the specification of each service. + * + * @return The number of services registered. + */ +uint32_t rte_service_get_count(void); + +/** + * Return the id of a service by name. + * + * This function provides the id of the service using the service name as + * lookup key. The service id is to be passed to other functions in the + * rte_service_* API. 
+ * + * Example usage: + * @code + * uint32_t service_id; + * int32_t ret = rte_service_get_by_name("service_X", &service_id); + * if (ret) { + * // handle error + * } + * @endcode + * + * @param name The name of the service to retrieve + * @param[out] service_id A pointer to a uint32_t, to be filled in with the id. + * @retval 0 Success. The service id is provided in *service_id*. + * @retval -EINVAL Null *service_id* pointer provided + * @retval -ENODEV No such service registered + */ +int32_t rte_service_get_by_name(const char *name, uint32_t *service_id); + +/** + * Return the name of the service. + * + * @return A pointer to the name of the service. The returned pointer remains + * in ownership of the service, and the application must not free it. + */ +const char *rte_service_get_name(uint32_t id); + +/** + * Check if a service has a specific capability. + * + * This function returns if *service* has implements *capability*. + * See RTE_SERVICE_CAP_* defines for a list of valid capabilities. + * @retval 1 Capability supported by this service instance + * @retval 0 Capability not supported by this service instance + */ +int32_t rte_service_probe_capability(uint32_t id, uint32_t capability); + +/** + * Map or unmap a lcore to a service. + * + * Each core can be added or removed from running a specific service. This + * function enables or disables *lcore* to run *service_id*. + * + * If multiple cores are enabled on a service, an atomic is used to ensure that + * only one cores runs the service at a time. The exception to this is when + * a service indicates that it is multi-thread safe by setting the capability + * called RTE_SERVICE_CAP_MT_SAFE. With the multi-thread safe capability set, + * the service function can be run on multiple threads at the same time. + * + * @param service_id the service to apply the lcore to + * @param lcore The lcore that will be mapped to service + * @param enable Zero to unmap or disable the core, non-zero to enable + * + * @retval 0 lcore map updated successfully + * @retval -EINVAL An invalid service or lcore was provided. + */ +int32_t rte_service_map_lcore_set(uint32_t service_id, uint32_t lcore, + uint32_t enable); + +/** + * Retrieve the mapping of an lcore to a service. + * + * @param service_id the service to apply the lcore to + * @param lcore The lcore that will be mapped to service + * + * @retval 1 lcore is mapped to service + * @retval 0 lcore is not mapped to service + * @retval -EINVAL An invalid service or lcore was provided. + */ +int32_t rte_service_map_lcore_get(uint32_t service_id, uint32_t lcore); + +/** + * Set the runstate of the service. + * + * Each service is either running or stopped. Setting a non-zero runstate + * enables the service to run, while setting runstate zero disables it. + * + * @param id The id of the service + * @param runstate The run state to apply to the service + * + * @retval 0 The service was successfully started + * @retval -EINVAL Invalid service id + */ +int32_t rte_service_runstate_set(uint32_t id, uint32_t runstate); + +/** + * Get the runstate for the service with *id*. See *rte_service_runstate_set* + * for details of runstates. A service can call this function to ensure that + * the application has indicated that it will receive CPU cycles. 
Either a + * service-core is mapped (default case), or the application has explicitly + * disabled the check that a service-cores is mapped to the service and takes + * responsibility to run the service manually using the available function + * *rte_service_run_iter_on_app_lcore* to do so. + * + * @retval 1 Service is running + * @retval 0 Service is stopped + * @retval -EINVAL Invalid service id + */ +int32_t rte_service_runstate_get(uint32_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * This function returns whether the service may be currently executing on + * at least one lcore, or definitely is not. This function can be used to + * determine if, after setting the service runstate to stopped, the service + * is still executing a service lcore. + * + * Care must be taken if calling this function when the service runstate is + * running, since the result of this function may be incorrect by the time the + * function returns due to service cores running in parallel. + * + * @retval 1 Service may be running on one or more lcores + * @retval 0 Service is not running on any lcore + * @retval -EINVAL Invalid service id + */ +int32_t __rte_experimental +rte_service_may_be_active(uint32_t id); + +/** + * Enable or disable the check for a service-core being mapped to the service. + * An application can disable the check when takes the responsibility to run a + * service itself using *rte_service_run_iter_on_app_lcore*. + * + * @param id The id of the service to set the check on + * @param enable When zero, the check is disabled. Non-zero enables the check. + * + * @retval 0 Success + * @retval -EINVAL Invalid service ID + */ +int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enable); + +/** + * This function runs a service callback from a non-service lcore. + * + * This function is designed to enable gradual porting to service cores, and + * to enable unit tests to verify a service behaves as expected. + * + * When called, this function ensures that the service identified by *id* is + * safe to run on this lcore. Multi-thread safe services are invoked even if + * other cores are simultaneously running them as they are multi-thread safe. + * + * Multi-thread unsafe services are handled depending on the variable + * *serialize_multithread_unsafe*: + * - When set, the function will check if a service is already being invoked + * on another lcore, refusing to run it and returning -EBUSY. + * - When zero, the application takes responsibility to ensure that the service + * indicated by *id* is not going to be invoked by another lcore. This setting + * avoids atomic operations, so is likely to be more performant. + * + * @param id The ID of the service to run + * @param serialize_multithread_unsafe This parameter indicates to the service + * cores library if it is required to use atomics to serialize access + * to mult-thread unsafe services. As there is an overhead in using + * atomics, applications can choose to enable or disable this feature + * + * Note that any thread calling this function MUST be a DPDK EAL thread, as + * the *rte_lcore_id* function is used to access internal data structures. 
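+ *
+ * A sketch of running one iteration from an application lcore (the service
+ * name "service_X" is illustrative):
+ * @code
+ * uint32_t id;
+ *
+ * if (rte_service_get_by_name("service_X", &id) == 0)
+ *	rte_service_run_iter_on_app_lcore(id, 1);
+ * @endcode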
+ * + * @retval 0 Service was run on the calling thread successfully + * @retval -EBUSY Another lcore is executing the service, and it is not a + * multi-thread safe service, so the service was not run on this lcore + * @retval -ENOEXEC Service is not in a run-able state + * @retval -EINVAL Invalid service id + */ +int32_t rte_service_run_iter_on_app_lcore(uint32_t id, + uint32_t serialize_multithread_unsafe); + +/** + * Start a service core. + * + * Starting a core makes the core begin polling. Any services assigned to it + * will be run as fast as possible. The application must ensure that the lcore + * is in a launchable state: e.g. call *rte_eal_lcore_wait* on the lcore_id + * before calling this function. + * + * @retval 0 Success + * @retval -EINVAL Failed to start core. The *lcore_id* passed in is not + * currently assigned to be a service core. + */ +int32_t rte_service_lcore_start(uint32_t lcore_id); + +/** + * Stop a service core. + * + * Stopping a core makes the core become idle, but remains assigned as a + * service core. + * + * @retval 0 Success + * @retval -EINVAL Invalid *lcore_id* provided + * @retval -EALREADY Already stopped core + * @retval -EBUSY Failed to stop core, as it would cause a service to not + * be run, as this is the only core currently running the service. + * The application must stop the service first, and then stop the + * lcore. + */ +int32_t rte_service_lcore_stop(uint32_t lcore_id); + +/** + * Adds lcore to the list of service cores. + * + * This functions can be used at runtime in order to modify the service core + * mask. + * + * @retval 0 Success + * @retval -EBUSY lcore is busy, and not available for service core duty + * @retval -EALREADY lcore is already added to the service core list + * @retval -EINVAL Invalid lcore provided + */ +int32_t rte_service_lcore_add(uint32_t lcore); + +/** + * Removes lcore from the list of service cores. + * + * This can fail if the core is not stopped, see *rte_service_core_stop*. + * + * @retval 0 Success + * @retval -EBUSY Lcore is not stopped, stop service core before removing. + * @retval -EINVAL failed to add lcore to service core mask. + */ +int32_t rte_service_lcore_del(uint32_t lcore); + +/** + * Retrieve the number of service cores currently available. + * + * This function returns the integer count of service cores available. The + * service core count can be used in mapping logic when creating mappings + * from service cores to services. + * + * See *rte_service_lcore_list* for details on retrieving the lcore_id of each + * service core. + * + * @return The number of service cores currently configured. + */ +int32_t rte_service_lcore_count(void); + +/** + * Resets all service core mappings. This does not remove the service cores + * from duty, just unmaps all services / cores, and stops() the service cores. + * The runstate of services is not modified. + * + * @retval 0 Success + */ +int32_t rte_service_lcore_reset_all(void); + +/** + * Enable or disable statistics collection for *service*. + * + * This function enables per core, per-service cycle count collection. + * @param id The service to enable statistics gathering on. + * @param enable Zero to disable statistics, non-zero to enable. + * @retval 0 Success + * @retval -EINVAL Invalid service pointer passed + */ +int32_t rte_service_set_stats_enable(uint32_t id, int32_t enable); + +/** + * Retrieve the list of currently enabled service cores. 
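+ *
+ * A retrieval sketch using a statically sized buffer:
+ * @code
+ * uint32_t ids[RTE_MAX_LCORE];
+ * int32_t n = rte_service_lcore_list(ids, RTE_MAX_LCORE);
+ * @endcode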
+ * + * This function fills in an application supplied array, with each element + * indicating the lcore_id of a service core. + * + * Adding and removing service cores can be performed using + * *rte_service_lcore_add* and *rte_service_lcore_del*. + * @param [out] array An array of at least *rte_service_lcore_count* items. + * If statically allocating the buffer, use RTE_MAX_LCORE. + * @param [out] n The size of *array*. + * @retval >=0 Number of service cores that have been populated in the array + * @retval -ENOMEM The provided array is not large enough to fill in the + * service core list. No items have been populated, call this function + * with a size of at least *rte_service_core_count* items. + */ +int32_t rte_service_lcore_list(uint32_t array[], uint32_t n); + +/** + * Get the numer of services running on the supplied lcore. + * + * @param lcore Id of the service core. + * @retval >=0 Number of services registered to this core. + * @retval -EINVAL Invalid lcore provided + * @retval -ENOTSUP The provided lcore is not a service core. + */ +int32_t rte_service_lcore_count_services(uint32_t lcore); + +/** + * Dumps any information available about the service. When id is UINT32_MAX, + * this function dumps info for all services. + * + * @retval 0 Statistics have been successfully dumped + * @retval -EINVAL Invalid service id provided + */ +int32_t rte_service_dump(FILE *f, uint32_t id); + +/** + * Returns the number of cycles that this service has consumed + */ +#define RTE_SERVICE_ATTR_CYCLES 0 + +/** + * Returns the count of invocations of this service function + */ +#define RTE_SERVICE_ATTR_CALL_COUNT 1 + +/** + * Get an attribute from a service. + * + * @retval 0 Success, the attribute value has been written to *attr_value*. + * -EINVAL Invalid id, attr_id or attr_value was NULL. + */ +int32_t rte_service_attr_get(uint32_t id, uint32_t attr_id, + uint32_t *attr_value); + +/** + * Reset all attribute values of a service. + * + * @param id The service to reset all statistics of + * @retval 0 Successfully reset attributes + * -EINVAL Invalid service id provided + */ +int32_t rte_service_attr_reset_all(uint32_t id); + +/** + * Returns the number of times the service runner has looped. + */ +#define RTE_SERVICE_LCORE_ATTR_LOOPS 0 + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Get an attribute from a service core. + * + * @param lcore Id of the service core. + * @param attr_id Id of the attribute to be retrieved. + * @param [out] attr_value Pointer to storage in which to write retrieved value. + * @retval 0 Success, the attribute value has been written to *attr_value*. + * -EINVAL Invalid lcore, attr_id or attr_value was NULL. + * -ENOTSUP lcore is not a service core. + */ +int32_t __rte_experimental +rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id, + uint64_t *attr_value); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reset all attribute values of a service core. + * + * @param lcore The service core to reset all the statistics of + * @retval 0 Successfully reset attributes + * -EINVAL Invalid service id provided + * -ENOTSUP lcore is not a service core. 
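+ *
+ * For example, to read and then clear the loop count of service core 2
+ * (the lcore id is illustrative):
+ * @code
+ * uint64_t loops;
+ *
+ * rte_service_lcore_attr_get(2, RTE_SERVICE_LCORE_ATTR_LOOPS, &loops);
+ * rte_service_lcore_attr_reset_all(2);
+ * @endcode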
+ */ +int32_t __rte_experimental +rte_service_lcore_attr_reset_all(uint32_t lcore); + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_SERVICE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_service_component.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_service_component.h new file mode 100644 index 00000000..c12adbc2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_service_component.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_SERVICE_PRIVATE_H_ +#define _RTE_SERVICE_PRIVATE_H_ + +/* This file specifies the internal service specification. + * Include this file if you are writing a component that requires CPU cycles to + * operate, and you wish to run the component using service cores + */ +#include +#include + +/** + * Signature of callback function to run a service. + */ +typedef int32_t (*rte_service_func)(void *args); + +/** + * The specification of a service. + * + * This struct contains metadata about the service itself, the callback + * function to run one iteration of the service, a userdata pointer, flags etc. + */ +struct rte_service_spec { + /** The name of the service. This should be used by the application to + * understand what purpose this service provides. + */ + char name[RTE_SERVICE_NAME_MAX]; + /** The callback to invoke to run one iteration of the service. */ + rte_service_func callback; + /** The userdata pointer provided to the service callback. */ + void *callback_userdata; + /** Flags to indicate the capabilities of this service. See defines in + * the public header file for values of RTE_SERVICE_CAP_* + */ + uint32_t capabilities; + /** NUMA socket ID that this service is affinitized to */ + int socket_id; +}; + +/** + * Register a new service. + * + * A service represents a component that the requires CPU time periodically to + * achieve its purpose. + * + * For example the eventdev SW PMD requires CPU cycles to perform its + * scheduling. This can be achieved by registering it as a service, and the + * application can then assign CPU resources to that service. + * + * Note that when a service component registers itself, it is not permitted to + * add or remove service-core threads, or modify lcore-to-service mappings. The + * only API that may be called by the service-component is + * *rte_service_component_runstate_set*, which indicates that the service + * component is ready to be executed. + * + * @param spec The specification of the service to register + * @param[out] service_id A pointer to a uint32_t, which will be filled in + * during registration of the service. It is set to the integers + * service number given to the service. This parameter may be NULL. + * @retval 0 Successfully registered the service. + * -EINVAL Attempted to register an invalid service (eg, no callback + * set) + */ +int32_t rte_service_component_register(const struct rte_service_spec *spec, + uint32_t *service_id); + +/** + * Unregister a service component. + * + * The service being removed must be stopped before calling this function. + * + * @retval 0 The service was successfully unregistered. + * @retval -EBUSY The service is currently running, stop the service before + * calling unregister. No action has been taken. + */ +int32_t rte_service_component_unregister(uint32_t id); + +/** + * Private function to allow EAL to initialized default mappings. + * + * This function iterates all the services, and maps then to the available + * cores. 
Based on the capabilities of the services, they are set to run on the + * available cores in a round-robin manner. + * + * @retval 0 Success + * @retval -ENOTSUP No service lcores in use + * @retval -EINVAL Error while iterating over services + * @retval -ENODEV Error in enabling service lcore on a service + * @retval -ENOEXEC Error when starting services + */ +int32_t rte_service_start_with_defaults(void); + +/** + * Set the backend runstate of a component. + * + * This function allows services to be registered at startup, but not yet + * enabled to run by default. When the service has been configured (via the + * usual method; eg rte_eventdev_configure, the service can mark itself as + * ready to run. The differentiation between backend runstate and + * service_runstate is that the backend runstate is set by the service + * component while the service runstate is reserved for application usage. + * + * @retval 0 Success + */ +int32_t rte_service_component_runstate_set(uint32_t id, uint32_t runstate); + +/** + * Initialize the service library. + * + * In order to use the service library, it must be initialized. EAL initializes + * the library at startup. + * + * @retval 0 Success + * @retval -EALREADY Service library is already initialized + */ +int32_t rte_service_init(void); + +/** + * @internal Free up the memory that has been initialized. + * This routine is to be invoked prior to process termination. + * + * @retval None + */ +void rte_service_finalize(void); + +#endif /* _RTE_SERVICE_PRIVATE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_string_fns.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_string_fns.h new file mode 100644 index 00000000..97597a14 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_string_fns.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * + * String-related functions as replacement for libc equivalents + */ + +#ifndef _RTE_STRING_FNS_H_ +#define _RTE_STRING_FNS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * Takes string "string" parameter and splits it at character "delim" + * up to maxtokens-1 times - to give "maxtokens" resulting tokens. Like + * strtok or strsep functions, this modifies its input string, by replacing + * instances of "delim" with '\\0'. All resultant tokens are returned in the + * "tokens" array which must have enough entries to hold "maxtokens". + * + * @param string + * The input string to be split into tokens + * + * @param stringlen + * The max length of the input buffer + * + * @param tokens + * The array to hold the pointers to the tokens in the string + * + * @param maxtokens + * The number of elements in the tokens array. At most, maxtokens-1 splits + * of the string will be done. + * + * @param delim + * The character on which the split of the data will be done + * + * @return + * The number of tokens in the tokens array. 
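+ *
+ * For example, splitting a comma-separated list (the buffer contents are
+ * illustrative):
+ * @code
+ * char buf[] = "eth0,eth1,eth2";
+ * char *tok[4];
+ * int n = rte_strsplit(buf, sizeof(buf), tok, 4, ',');
+ * // n == 3; tok[0] is "eth0", tok[1] is "eth1", tok[2] is "eth2"
+ * @endcode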
+ */ +int +rte_strsplit(char *string, int stringlen, + char **tokens, int maxtokens, char delim); + +/** + * @internal + * DPDK-specific version of strlcpy for systems without + * libc or libbsd copies of the function + */ +static inline size_t +rte_strlcpy(char *dst, const char *src, size_t size) +{ + return (size_t)snprintf(dst, size, "%s", src); +} + +/* pull in a strlcpy function */ +#ifdef RTE_EXEC_ENV_BSDAPP +#include +#ifndef __BSD_VISIBLE /* non-standard functions are hidden */ +#define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) +#endif + + +#else /* non-BSD platforms */ +#ifdef RTE_USE_LIBBSD +#include + +#else /* no BSD header files, create own */ +#define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) + +#endif /* RTE_USE_LIBBSD */ +#endif /* BSDAPP */ + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_STRING_FNS_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_tailq.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_tailq.h new file mode 100644 index 00000000..9b01abb2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_tailq.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_TAILQ_H_ +#define _RTE_TAILQ_H_ + +/** + * @file + * Here defines rte_tailq APIs for only internal use + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/** dummy structure type used by the rte_tailq APIs */ +struct rte_tailq_entry { + TAILQ_ENTRY(rte_tailq_entry) next; /**< Pointer entries for a tailq list */ + void *data; /**< Pointer to the data referenced by this tailq entry */ +}; +/** dummy */ +TAILQ_HEAD(rte_tailq_entry_head, rte_tailq_entry); + +#define RTE_TAILQ_NAMESIZE 32 + +/** + * The structure defining a tailq header entry for storing + * in the rte_config structure in shared memory. Each tailq + * is identified by name. + * Any library storing a set of objects e.g. rings, mempools, hash-tables, + * is recommended to use an entry here, so as to make it easy for + * a multi-process app to find already-created elements in shared memory. + */ +struct rte_tailq_head { + struct rte_tailq_entry_head tailq_head; /**< NOTE: must be first element */ + char name[RTE_TAILQ_NAMESIZE]; +}; + +struct rte_tailq_elem { + /** + * Reference to head in shared mem, updated at init time by + * rte_eal_tailqs_init() + */ + struct rte_tailq_head *head; + TAILQ_ENTRY(rte_tailq_elem) next; + const char name[RTE_TAILQ_NAMESIZE]; +}; + +/** + * Return the first tailq entry casted to the right struct. + */ +#define RTE_TAILQ_CAST(tailq_entry, struct_name) \ + (struct struct_name *)&(tailq_entry)->tailq_head + +/** + * Utility macro to make looking up a tailqueue for a particular struct easier. + * + * @param name + * The name of tailq + * + * @param struct_name + * The name of the list type we are using. (Generally this is the same as the + * first parameter passed to TAILQ_HEAD macro) + * + * @return + * The return value from rte_eal_tailq_lookup, typecast to the appropriate + * structure pointer type. + * NULL on error, since the tailq_head is the first + * element in the rte_tailq_head structure. + */ +#define RTE_TAILQ_LOOKUP(name, struct_name) \ + RTE_TAILQ_CAST(rte_eal_tailq_lookup(name), struct_name) + +/** + * Dump tail queues to a file. + * + * @param f + * A pointer to a file for output + */ +void rte_dump_tailq(FILE *f); + +/** + * Lookup for a tail queue. 
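+ *
+ * This is commonly used via the RTE_TAILQ_LOOKUP() convenience macro, e.g.
+ * (the tailq name and list type below are illustrative):
+ * @code
+ * struct my_obj_list *list =
+ *	RTE_TAILQ_LOOKUP("MY_OBJ_TAILQ", my_obj_list);
+ * @endcode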
+ * + * Get a pointer to a tail queue header of a tail + * queue identified by the name given as an argument. + * Note: this function is not multi-thread safe, and should only be called from + * a single thread at a time + * + * @param name + * The name of the queue. + * @return + * A pointer to the tail queue head structure. + */ +struct rte_tailq_head *rte_eal_tailq_lookup(const char *name); + +/** + * Register a tail queue. + * + * Register a tail queue from shared memory. + * This function is mainly used by EAL_REGISTER_TAILQ macro which is used to + * register tailq from the different dpdk libraries. Since this macro is a + * constructor, the function has no access to dpdk shared memory, so the + * registered tailq can not be used before call to rte_eal_init() which calls + * rte_eal_tailqs_init(). + * + * @param t + * The tailq element which contains the name of the tailq you want to + * create (/retrieve when in secondary process). + * @return + * 0 on success or -1 in case of an error. + */ +int rte_eal_tailq_register(struct rte_tailq_elem *t); + +#define EAL_REGISTER_TAILQ(t) \ +RTE_INIT(tailqinitfn_ ##t) \ +{ \ + if (rte_eal_tailq_register(&t) < 0) \ + rte_panic("Cannot initialize tailq: %s\n", t.name); \ +} + +/* This macro permits both remove and free var within the loop safely.*/ +#ifndef TAILQ_FOREACH_SAFE +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TAILQ_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_test.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_test.h new file mode 100644 index 00000000..89e47f47 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_test.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_TEST_H_ +#define _RTE_TEST_H_ + +#include + +/* Before including rte_test.h file you can define + * RTE_TEST_TRACE_FAILURE(_file, _line, _func) macro to better trace/debug test + * failures. Mostly useful in development phase. + */ +#ifndef RTE_TEST_TRACE_FAILURE +#define RTE_TEST_TRACE_FAILURE(_file, _line, _func) +#endif + + +#define RTE_TEST_ASSERT(cond, msg, ...) do { \ + if (!(cond)) { \ + RTE_LOG(DEBUG, EAL, "Test assert %s line %d failed: " \ + msg "\n", __func__, __LINE__, ##__VA_ARGS__); \ + RTE_TEST_TRACE_FAILURE(__FILE__, __LINE__, __func__); \ + return -1; \ + } \ +} while (0) + +#define RTE_TEST_ASSERT_EQUAL(a, b, msg, ...) \ + RTE_TEST_ASSERT(a == b, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_NOT_EQUAL(a, b, msg, ...) \ + RTE_TEST_ASSERT(a != b, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_SUCCESS(val, msg, ...) \ + RTE_TEST_ASSERT(val == 0, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_FAIL(val, msg, ...) \ + RTE_TEST_ASSERT(val != 0, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_NULL(val, msg, ...) \ + RTE_TEST_ASSERT(val == NULL, msg, ##__VA_ARGS__) + +#define RTE_TEST_ASSERT_NOT_NULL(val, msg, ...) 
\ + RTE_TEST_ASSERT(val != NULL, msg, ##__VA_ARGS__) + +#endif /* _RTE_TEST_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_time.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_time.h new file mode 100644 index 00000000..5ad7c884 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_time.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Intel Corporation + */ + +#ifndef _RTE_TIME_H_ +#define _RTE_TIME_H_ + +#include +#include + +#define NSEC_PER_SEC 1000000000L + +/** + * Structure to hold the parameters of a running cycle counter to assist + * in converting cycles to nanoseconds. + */ +struct rte_timecounter { + /** Last cycle counter value read. */ + uint64_t cycle_last; + /** Nanoseconds count. */ + uint64_t nsec; + /** Bitmask separating nanosecond and sub-nanoseconds. */ + uint64_t nsec_mask; + /** Sub-nanoseconds count. */ + uint64_t nsec_frac; + /** Bitmask for two's complement subtraction of non-64 bit counters. */ + uint64_t cc_mask; + /** Cycle to nanosecond divisor (power of two). */ + uint32_t cc_shift; +}; + +/** + * Converts cyclecounter cycles to nanoseconds. + */ +static inline uint64_t +rte_cyclecounter_cycles_to_ns(struct rte_timecounter *tc, uint64_t cycles) +{ + uint64_t ns; + + /* Add fractional nanoseconds. */ + ns = cycles + tc->nsec_frac; + tc->nsec_frac = ns & tc->nsec_mask; + + /* Shift to get only nanoseconds. */ + return ns >> tc->cc_shift; +} + +/** + * Update the internal nanosecond count in the structure. + */ +static inline uint64_t +rte_timecounter_update(struct rte_timecounter *tc, uint64_t cycle_now) +{ + uint64_t cycle_delta, ns_offset; + + /* Calculate the delta since the last call. */ + if (tc->cycle_last <= cycle_now) + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc_mask; + else + /* Handle cycle counts that have wrapped around . */ + cycle_delta = (~(tc->cycle_last - cycle_now) & tc->cc_mask) + 1; + + /* Convert to nanoseconds. */ + ns_offset = rte_cyclecounter_cycles_to_ns(tc, cycle_delta); + + /* Store current cycle counter for next call. */ + tc->cycle_last = cycle_now; + + /* Update the nanosecond count. */ + tc->nsec += ns_offset; + + return tc->nsec; +} + +/** + * Convert from timespec structure into nanosecond units. + */ +static inline uint64_t +rte_timespec_to_ns(const struct timespec *ts) +{ + return ((uint64_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; +} + +/** + * Convert from nanosecond units into timespec structure. + */ +static inline struct timespec +rte_ns_to_timespec(uint64_t nsec) +{ + struct timespec ts = {0, 0}; + + if (nsec == 0) + return ts; + + ts.tv_sec = nsec / NSEC_PER_SEC; + ts.tv_nsec = nsec % NSEC_PER_SEC; + + return ts; +} + +#endif /* _RTE_TIME_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_uuid.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_uuid.h new file mode 100644 index 00000000..2c846b5f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_uuid.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) 1996, 1997, 1998 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ +/** + * @file + * + * UUID related functions originally from libuuid + */ + +#ifndef _RTE_UUID_H_ +#define _RTE_UUID_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * Struct describing a Universal Unique Identifer + */ +typedef unsigned char rte_uuid_t[16]; + +/** + * Helper for defining UUID values for id tables. + */ +#define RTE_UUID_INIT(a, b, c, d, e) { \ + ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, \ + ((a) >> 8) & 0xff, (a) & 0xff, \ + ((b) >> 8) & 0xff, (b) & 0xff, \ + ((c) >> 8) & 0xff, (c) & 0xff, \ + ((d) >> 8) & 0xff, (d) & 0xff, \ + ((e) >> 40) & 0xff, ((e) >> 32) & 0xff, \ + ((e) >> 24) & 0xff, ((e) >> 16) & 0xff, \ + ((e) >> 8) & 0xff, (e) & 0xff \ +} + +/** + * Test if UUID is all zeros. + * + * @param uu + * The uuid to check. + * @return + * true if uuid is NULL value, false otherwise + */ +bool rte_uuid_is_null(const rte_uuid_t uu); + +/** + * Copy uuid. + * + * @param dst + * Destination uuid + * @param src + * Source uuid + */ +static inline void rte_uuid_copy(rte_uuid_t dst, const rte_uuid_t src) +{ + memcpy(dst, src, sizeof(rte_uuid_t)); +} + +/** + * Compare two UUID's + * + * @param a + * A UUID to compare + * @param b + * A UUID to compare + * @return + * returns an integer less than, equal to, or greater than zero if UUID a is + * is less than, equal, or greater than UUID b. + */ +int rte_uuid_compare(const rte_uuid_t a, const rte_uuid_t b); + +/** + * Extract UUID from string + * + * @param in + * Pointer to string of characters to convert + * @param uu + * Destination UUID + * @return + * Returns 0 on succes, and -1 if string is not a valid UUID. 
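+ * The input is expected in the canonical 36-character form, e.g. + * "1b4e28ba-2fa1-11d2-883f-b9a761bde3fb" (value shown for illustration only).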
+ */ +int rte_uuid_parse(const char *in, rte_uuid_t uu); + +/** + * Convert UUID to string + * + * @param uu + * UUID to format + * @param out + * Resulting string buffer + * @param len + * Sizeof the available string buffer + */ +#define RTE_UUID_STRLEN (36 + 1) +void rte_uuid_unparse(const rte_uuid_t uu, char *out, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_UUID_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_version.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_version.h new file mode 100644 index 00000000..7c6714a2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_version.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Definitions of DPDK version numbers + */ + +#ifndef _RTE_VERSION_H_ +#define _RTE_VERSION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +/** + * String that appears before the version number + */ +#define RTE_VER_PREFIX "DPDK" + +/** + * Major version/year number i.e. the yy in yy.mm.z + */ +#define RTE_VER_YEAR 18 + +/** + * Minor version/month number i.e. the mm in yy.mm.z + */ +#define RTE_VER_MONTH 8 + +/** + * Patch level number i.e. the z in yy.mm.z + */ +#define RTE_VER_MINOR 0 + +/** + * Extra string to be appended to version number + */ +#define RTE_VER_SUFFIX "" + +/** + * Patch release number + * 0-15 = release candidates + * 16 = release + */ +#define RTE_VER_RELEASE 16 + +/** + * Macro to compute a version number usable for comparisons + */ +#define RTE_VERSION_NUM(a,b,c,d) ((a) << 24 | (b) << 16 | (c) << 8 | (d)) + +/** + * All version numbers in one to compare with RTE_VERSION_NUM() + */ +#define RTE_VERSION RTE_VERSION_NUM( \ + RTE_VER_YEAR, \ + RTE_VER_MONTH, \ + RTE_VER_MINOR, \ + RTE_VER_RELEASE) + +/** + * Function returning version string + * @return + * string + */ +static inline const char * +rte_version(void) +{ + static char version[32]; + if (version[0] != 0) + return version; + if (strlen(RTE_VER_SUFFIX) == 0) + snprintf(version, sizeof(version), "%s %d.%02d.%d", + RTE_VER_PREFIX, + RTE_VER_YEAR, + RTE_VER_MONTH, + RTE_VER_MINOR); + else + snprintf(version, sizeof(version), "%s %d.%02d.%d%s%d", + RTE_VER_PREFIX, + RTE_VER_YEAR, + RTE_VER_MONTH, + RTE_VER_MINOR, + RTE_VER_SUFFIX, + RTE_VER_RELEASE < 16 ? + RTE_VER_RELEASE : + RTE_VER_RELEASE - 16); + return version; +} + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_VERSION_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/include/rte_vfio.h b/src/spdk/dpdk/lib/librte_eal/common/include/rte_vfio.h new file mode 100644 index 00000000..5ca13fcc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/include/rte_vfio.h @@ -0,0 +1,367 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 6WIND S.A. + */ + +#ifndef _RTE_VFIO_H_ +#define _RTE_VFIO_H_ + +/** + * @file + * RTE VFIO. This library provides various VFIO related utility functions. 
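+ * It covers device setup/release, container and IOMMU group management, and DMA + * mappings; as the per-function notes below state, the calls are only relevant on + * Linux and return an error on BSD.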
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * determine if VFIO is present on the system + */ +#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO) +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_PRESENT +#endif /* kernel version >= 3.6.0 */ +#endif /* RTE_EAL_VFIO */ + +#ifdef VFIO_PRESENT + +#include + +#define VFIO_DIR "/dev/vfio" +#define VFIO_CONTAINER_PATH "/dev/vfio/vfio" +#define VFIO_GROUP_FMT "/dev/vfio/%u" +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" +#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) +#define VFIO_GET_REGION_IDX(x) (x >> 40) +#define VFIO_NOIOMMU_MODE \ + "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode" + +/* NOIOMMU is defined from kernel version 4.5 onwards */ +#ifdef VFIO_NOIOMMU_IOMMU +#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU +#else +#define RTE_VFIO_NOIOMMU 8 +#endif + +#else /* not VFIO_PRESENT */ + +/* we don't need an actual definition, only pointer is used */ +struct vfio_device_info; + +#endif /* VFIO_PRESENT */ + +/** + * Setup vfio_cfg for the device identified by its address. + * It discovers the configured I/O MMU groups or sets a new one for the device. + * If a new groups is assigned, the DMA mapping is performed. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param sysfs_base + * sysfs path prefix. + * + * @param dev_addr + * device location. + * + * @param vfio_dev_fd + * VFIO fd. + * + * @param device_info + * Device information. + * + * @return + * 0 on success. + * <0 on failure. + * >1 if the device cannot be managed this way. + */ +int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info); + +/** + * Release a device mapped to a VFIO-managed I/O MMU group. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param sysfs_base + * sysfs path prefix. + * + * @param dev_addr + * device location. + * + * @param fd + * VFIO fd. + * + * @return + * 0 on success. + * <0 on failure. + */ +int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd); + +/** + * Enable a VFIO-related kmod. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param modname + * kernel module name. + * + * @return + * 0 on success. + * <0 on failure. + */ +int rte_vfio_enable(const char *modname); + +/** + * Check whether a VFIO-related kmod is enabled. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param modname + * kernel module name. + * + * @return + * !0 if true. + * 0 otherwise. + */ +int rte_vfio_is_enabled(const char *modname); + +/** + * Whether VFIO NOIOMMU mode is enabled. + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @return + * !0 if true. + * 0 otherwise. + */ +int rte_vfio_noiommu_is_enabled(void); + +/** + * Remove group fd from internal VFIO group fd array/ + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param vfio_group_fd + * VFIO Grouup FD. + * + * @return + * 0 on success. + * <0 on failure. + */ +int +rte_vfio_clear_group(int vfio_group_fd); + +/** + * Map memory region for use with VFIO. + * + * @note Require at least one device to be attached at the time of + * mapping. DMA maps done via this API will only apply to default + * container and will not apply to any of the containers created + * via rte_vfio_container_create(). 
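+ * To map memory for a container created with rte_vfio_container_create(), use + * rte_vfio_container_dma_map() instead.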
+ * + * @param vaddr + * Starting virtual address of memory to be mapped. + * + * @param iova + * Starting IOVA address of memory to be mapped. + * + * @param len + * Length of memory segment being mapped. + * + * @return + * 0 if success. + * -1 on error. + */ +int +rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len); + + +/** + * Unmap memory region from VFIO. + * + * @param vaddr + * Starting virtual address of memory to be unmapped. + * + * @param iova + * Starting IOVA address of memory to be unmapped. + * + * @param len + * Length of memory segment being unmapped. + * + * @return + * 0 if success. + * -1 on error. + */ + +int +rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len); +/** + * Parse IOMMU group number for a device + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param sysfs_base + * sysfs path prefix. + * + * @param dev_addr + * device location. + * + * @param iommu_group_num + * iommu group number + * + * @return + * >0 on success + * 0 for non-existent group or VFIO + * <0 for errors + */ +int +rte_vfio_get_group_num(const char *sysfs_base, + const char *dev_addr, int *iommu_group_num); + +/** + * Open VFIO container fd or get an existing one + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @return + * > 0 container fd + * < 0 for errors + */ +int +rte_vfio_get_container_fd(void); + +/** + * Open VFIO group fd or get an existing one + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param iommu_group_num + * iommu group number + * + * @return + * > 0 group fd + * < 0 for errors + */ +int +rte_vfio_get_group_fd(int iommu_group_num); + +/** + * Create a new container for device binding. + * + * @note Any newly allocated DPDK memory will not be mapped into these + * containers by default, user needs to manage DMA mappings for + * any container created by this API. + * + * @return + * the container fd if successful + * <0 if failed + */ +int +rte_vfio_container_create(void); + +/** + * Destroy the container, unbind all vfio groups within it. + * + * @param container_fd + * the container fd to destroy + * + * @return + * 0 if successful + * <0 if failed + */ +int +rte_vfio_container_destroy(int container_fd); + +/** + * Bind a IOMMU group to a container. + * + * @param container_fd + * the container's fd + * + * @param iommu_group_num + * the iommu group number to bind to container + * + * @return + * group fd if successful + * <0 if failed + */ +int +rte_vfio_container_group_bind(int container_fd, int iommu_group_num); + +/** + * Unbind a IOMMU group from a container. + * + * @param container_fd + * the container fd of container + * + * @param iommu_group_num + * the iommu group number to delete from container + * + * @return + * 0 if successful + * <0 if failed + */ +int +rte_vfio_container_group_unbind(int container_fd, int iommu_group_num); + +/** + * Perform DMA mapping for devices in a container. + * + * @param container_fd + * the specified container fd + * + * @param vaddr + * Starting virtual address of memory to be mapped. + * + * @param iova + * Starting IOVA address of memory to be mapped. + * + * @param len + * Length of memory segment being mapped. + * + * @return + * 0 if successful + * <0 if failed + */ +int +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, + uint64_t iova, uint64_t len); + +/** + * Perform DMA unmapping for devices in a container. 
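+ * The vaddr/iova/len values should describe a region previously mapped with + * rte_vfio_container_dma_map().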
+ * + * @param container_fd + * the specified container fd + * + * @param vaddr + * Starting virtual address of memory to be unmapped. + * + * @param iova + * Starting IOVA address of memory to be unmapped. + * + * @param len + * Length of memory segment being unmapped. + * + * @return + * 0 if successful + * <0 if failed + */ +int +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, + uint64_t iova, uint64_t len); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_VFIO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c new file mode 100644 index 00000000..e0a8ed15 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c @@ -0,0 +1,643 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "malloc_elem.h" +#include "malloc_heap.h" + +size_t +malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align) +{ + void *cur_page, *contig_seg_start, *page_end, *cur_seg_end; + void *data_start, *data_end; + rte_iova_t expected_iova; + struct rte_memseg *ms; + size_t page_sz, cur, max; + + page_sz = (size_t)elem->msl->page_sz; + data_start = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN); + data_end = RTE_PTR_ADD(elem, elem->size - MALLOC_ELEM_TRAILER_LEN); + /* segment must start after header and with specified alignment */ + contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align); + + /* if we're in IOVA as VA mode, or if we're in legacy mode with + * hugepages, all elements are IOVA-contiguous. + */ + if (rte_eal_iova_mode() == RTE_IOVA_VA || + (internal_config.legacy_mem && rte_eal_has_hugepages())) + return RTE_PTR_DIFF(data_end, contig_seg_start); + + cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz); + ms = rte_mem_virt2memseg(cur_page, elem->msl); + + /* do first iteration outside the loop */ + page_end = RTE_PTR_ADD(cur_page, page_sz); + cur_seg_end = RTE_MIN(page_end, data_end); + cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) - + MALLOC_ELEM_TRAILER_LEN; + max = cur; + expected_iova = ms->iova + page_sz; + /* memsegs are contiguous in memory */ + ms++; + + cur_page = RTE_PTR_ADD(cur_page, page_sz); + + while (cur_page < data_end) { + page_end = RTE_PTR_ADD(cur_page, page_sz); + cur_seg_end = RTE_MIN(page_end, data_end); + + /* reset start of contiguous segment if unexpected iova */ + if (ms->iova != expected_iova) { + /* next contiguous segment must start at specified + * alignment. + */ + contig_seg_start = RTE_PTR_ALIGN(cur_page, align); + /* new segment start may be on a different page, so find + * the page and skip to next iteration to make sure + * we're not blowing past data end. + */ + ms = rte_mem_virt2memseg(contig_seg_start, elem->msl); + cur_page = ms->addr; + /* don't trigger another recalculation */ + expected_iova = ms->iova; + continue; + } + /* cur_seg_end ends on a page boundary or on data end. if we're + * looking at data end, then malloc trailer is already included + * in the calculations. if we're looking at page end, then we + * know there's more data past this page and thus there's space + * for malloc element trailer, so don't count it here. 
+ */ + cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start); + /* update max if cur value is bigger */ + if (cur > max) + max = cur; + + /* move to next page */ + cur_page = page_end; + expected_iova = ms->iova + page_sz; + /* memsegs are contiguous in memory */ + ms++; + } + + return max; +} + +/* + * Initialize a general malloc_elem header structure + */ +void +malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap, + struct rte_memseg_list *msl, size_t size) +{ + elem->heap = heap; + elem->msl = msl; + elem->prev = NULL; + elem->next = NULL; + memset(&elem->free_list, 0, sizeof(elem->free_list)); + elem->state = ELEM_FREE; + elem->size = size; + elem->pad = 0; + set_header(elem); + set_trailer(elem); +} + +void +malloc_elem_insert(struct malloc_elem *elem) +{ + struct malloc_elem *prev_elem, *next_elem; + struct malloc_heap *heap = elem->heap; + + /* first and last elements must be both NULL or both non-NULL */ + if ((heap->first == NULL) != (heap->last == NULL)) { + RTE_LOG(ERR, EAL, "Heap is probably corrupt\n"); + return; + } + + if (heap->first == NULL && heap->last == NULL) { + /* if empty heap */ + heap->first = elem; + heap->last = elem; + prev_elem = NULL; + next_elem = NULL; + } else if (elem < heap->first) { + /* if lower than start */ + prev_elem = NULL; + next_elem = heap->first; + heap->first = elem; + } else if (elem > heap->last) { + /* if higher than end */ + prev_elem = heap->last; + next_elem = NULL; + heap->last = elem; + } else { + /* the new memory is somewhere inbetween start and end */ + uint64_t dist_from_start, dist_from_end; + + dist_from_end = RTE_PTR_DIFF(heap->last, elem); + dist_from_start = RTE_PTR_DIFF(elem, heap->first); + + /* check which is closer, and find closest list entries */ + if (dist_from_start < dist_from_end) { + prev_elem = heap->first; + while (prev_elem->next < elem) + prev_elem = prev_elem->next; + next_elem = prev_elem->next; + } else { + next_elem = heap->last; + while (next_elem->prev > elem) + next_elem = next_elem->prev; + prev_elem = next_elem->prev; + } + } + + /* insert new element */ + elem->prev = prev_elem; + elem->next = next_elem; + if (prev_elem) + prev_elem->next = elem; + if (next_elem) + next_elem->prev = elem; +} + +/* + * Attempt to find enough physically contiguous memory in this block to store + * our data. Assume that element has at least enough space to fit in the data, + * so we just check the page addresses. + */ +static bool +elem_check_phys_contig(const struct rte_memseg_list *msl, + void *start, size_t size) +{ + return eal_memalloc_is_contig(msl, start, size); +} + +/* + * calculate the starting point of where data of the requested size + * and alignment would fit in the current element. If the data doesn't + * fit, return NULL. + */ +static void * +elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + size_t elem_size = elem->size; + + /* + * we're allocating from the end, so adjust the size of element by + * alignment size. 
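+ * If a physically contiguous block was requested and the contiguity check fails, + * elem_size is reduced by the alignment and the search retries at lower addresses.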
+ */ + while (elem_size >= size) { + const size_t bmask = ~(bound - 1); + uintptr_t end_pt = (uintptr_t)elem + + elem_size - MALLOC_ELEM_TRAILER_LEN; + uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), + align); + uintptr_t new_elem_start; + + /* check boundary */ + if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) { + end_pt = RTE_ALIGN_FLOOR(end_pt, bound); + new_data_start = RTE_ALIGN_FLOOR((end_pt - size), + align); + end_pt = new_data_start + size; + + if (((end_pt - 1) & bmask) != (new_data_start & bmask)) + return NULL; + } + + new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN; + + /* if the new start point is before the exist start, + * it won't fit + */ + if (new_elem_start < (uintptr_t)elem) + return NULL; + + if (contig) { + size_t new_data_size = end_pt - new_data_start; + + /* + * if physical contiguousness was requested and we + * couldn't fit all data into one physically contiguous + * block, try again with lower addresses. + */ + if (!elem_check_phys_contig(elem->msl, + (void *)new_data_start, + new_data_size)) { + elem_size -= align; + continue; + } + } + return (void *)new_elem_start; + } + return NULL; +} + +/* + * use elem_start_pt to determine if we get meet the size and + * alignment request from the current element + */ +int +malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + return elem_start_pt(elem, size, align, bound, contig) != NULL; +} + +/* + * split an existing element into two smaller elements at the given + * split_pt parameter. + */ +static void +split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt) +{ + struct malloc_elem *next_elem = elem->next; + const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem; + const size_t new_elem_size = elem->size - old_elem_size; + + malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size); + split_pt->prev = elem; + split_pt->next = next_elem; + if (next_elem) + next_elem->prev = split_pt; + else + elem->heap->last = split_pt; + elem->next = split_pt; + elem->size = old_elem_size; + set_trailer(elem); +} + +/* + * our malloc heap is a doubly linked list, so doubly remove our element. + */ +static void __rte_unused +remove_elem(struct malloc_elem *elem) +{ + struct malloc_elem *next, *prev; + next = elem->next; + prev = elem->prev; + + if (next) + next->prev = prev; + else + elem->heap->last = prev; + if (prev) + prev->next = next; + else + elem->heap->first = next; + + elem->prev = NULL; + elem->next = NULL; +} + +static int +next_elem_is_adjacent(struct malloc_elem *elem) +{ + return elem->next == RTE_PTR_ADD(elem, elem->size); +} + +static int +prev_elem_is_adjacent(struct malloc_elem *elem) +{ + return elem == RTE_PTR_ADD(elem->prev, elem->prev->size); +} + +/* + * Given an element size, compute its freelist index. + * We free an element into the freelist containing similarly-sized elements. + * We try to allocate elements starting with the freelist containing + * similarly-sized elements, and if necessary, we search freelists + * containing larger elements. 
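+ * For instance, with the constants used below (MALLOC_MINSIZE_LOG2 = 8, + * MALLOC_LOG2_INCREMENT = 2), a 3000-byte element rounds up to 2^12 and maps to + * free_head[2].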
+ * + * Example element size ranges for a heap with five free lists: + * heap->free_head[0] - (0 , 2^8] + * heap->free_head[1] - (2^8 , 2^10] + * heap->free_head[2] - (2^10 ,2^12] + * heap->free_head[3] - (2^12, 2^14] + * heap->free_head[4] - (2^14, MAX_SIZE] + */ +size_t +malloc_elem_free_list_index(size_t size) +{ +#define MALLOC_MINSIZE_LOG2 8 +#define MALLOC_LOG2_INCREMENT 2 + + size_t log2; + size_t index; + + if (size <= (1UL << MALLOC_MINSIZE_LOG2)) + return 0; + + /* Find next power of 2 >= size. */ + log2 = sizeof(size) * 8 - __builtin_clzl(size-1); + + /* Compute freelist index, based on log2(size). */ + index = (log2 - MALLOC_MINSIZE_LOG2 + MALLOC_LOG2_INCREMENT - 1) / + MALLOC_LOG2_INCREMENT; + + return index <= RTE_HEAP_NUM_FREELISTS-1? + index: RTE_HEAP_NUM_FREELISTS-1; +} + +/* + * Add the specified element to its heap's free list. + */ +void +malloc_elem_free_list_insert(struct malloc_elem *elem) +{ + size_t idx; + + idx = malloc_elem_free_list_index(elem->size - MALLOC_ELEM_HEADER_LEN); + elem->state = ELEM_FREE; + LIST_INSERT_HEAD(&elem->heap->free_head[idx], elem, free_list); +} + +/* + * Remove the specified element from its heap's free list. + */ +void +malloc_elem_free_list_remove(struct malloc_elem *elem) +{ + LIST_REMOVE(elem, free_list); +} + +/* + * reserve a block of data in an existing malloc_elem. If the malloc_elem + * is much larger than the data block requested, we split the element in two. + * This function is only called from malloc_heap_alloc so parameter checking + * is not done here, as it's done there previously. + */ +struct malloc_elem * +malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound, + contig); + const size_t old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem; + const size_t trailer_size = elem->size - old_elem_size - size - + MALLOC_ELEM_OVERHEAD; + + malloc_elem_free_list_remove(elem); + + if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split it, too much free space after elem */ + struct malloc_elem *new_free_elem = + RTE_PTR_ADD(new_elem, size + MALLOC_ELEM_OVERHEAD); + + split_elem(elem, new_free_elem); + malloc_elem_free_list_insert(new_free_elem); + + if (elem == elem->heap->last) + elem->heap->last = new_free_elem; + } + + if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* don't split it, pad the element instead */ + elem->state = ELEM_BUSY; + elem->pad = old_elem_size; + + /* put a dummy header in padding, to point to real element header */ + if (elem->pad > 0) { /* pad will be at least 64-bytes, as everything + * is cache-line aligned */ + new_elem->pad = elem->pad; + new_elem->state = ELEM_PAD; + new_elem->size = elem->size - elem->pad; + set_header(new_elem); + } + + return new_elem; + } + + /* we are going to split the element in two. The original element + * remains free, and the new element is the one allocated. + * Re-insert original element, in case its new size makes it + * belong on a different list. + */ + split_elem(elem, new_elem); + new_elem->state = ELEM_BUSY; + malloc_elem_free_list_insert(elem); + + return new_elem; +} + +/* + * join two struct malloc_elem together. elem1 and elem2 must + * be contiguous in memory. 
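+ * elem1 absorbs elem2: elem2's size is added to elem1 and the next/prev links + * (or heap->last) are updated so that elem2 drops out of the element chain.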
+ */ +static inline void +join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2) +{ + struct malloc_elem *next = elem2->next; + elem1->size += elem2->size; + if (next) + next->prev = elem1; + else + elem1->heap->last = elem1; + elem1->next = next; +} + +struct malloc_elem * +malloc_elem_join_adjacent_free(struct malloc_elem *elem) +{ + /* + * check if next element exists, is adjacent and is free, if so join + * with it, need to remove from free list. + */ + if (elem->next != NULL && elem->next->state == ELEM_FREE && + next_elem_is_adjacent(elem)) { + void *erase; + size_t erase_len; + + /* we will want to erase the trailer and header */ + erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN); + erase_len = MALLOC_ELEM_OVERHEAD + elem->next->pad; + + /* remove from free list, join to this one */ + malloc_elem_free_list_remove(elem->next); + join_elem(elem, elem->next); + + /* erase header, trailer and pad */ + memset(erase, 0, erase_len); + } + + /* + * check if prev element exists, is adjacent and is free, if so join + * with it, need to remove from free list. + */ + if (elem->prev != NULL && elem->prev->state == ELEM_FREE && + prev_elem_is_adjacent(elem)) { + struct malloc_elem *new_elem; + void *erase; + size_t erase_len; + + /* we will want to erase trailer and header */ + erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN); + erase_len = MALLOC_ELEM_OVERHEAD + elem->pad; + + /* remove from free list, join to this one */ + malloc_elem_free_list_remove(elem->prev); + + new_elem = elem->prev; + join_elem(new_elem, elem); + + /* erase header, trailer and pad */ + memset(erase, 0, erase_len); + + elem = new_elem; + } + + return elem; +} + +/* + * free a malloc_elem block by adding it to the free list. If the + * blocks either immediately before or immediately after newly freed block + * are also free, the blocks are merged together. 
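+ * The data area of the freed block is zeroed and the heap's allocation count is + * decremented; the returned pointer is the (possibly merged) free element.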
+ */ +struct malloc_elem * +malloc_elem_free(struct malloc_elem *elem) +{ + void *ptr; + size_t data_len; + + ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN); + data_len = elem->size - MALLOC_ELEM_OVERHEAD; + + elem = malloc_elem_join_adjacent_free(elem); + + malloc_elem_free_list_insert(elem); + + elem->pad = 0; + + /* decrease heap's count of allocated elements */ + elem->heap->alloc_count--; + + memset(ptr, 0, data_len); + + return elem; +} + +/* assume all checks were already done */ +void +malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len) +{ + struct malloc_elem *hide_start, *hide_end, *prev, *next; + size_t len_before, len_after; + + hide_start = start; + hide_end = RTE_PTR_ADD(start, len); + + prev = elem->prev; + next = elem->next; + + /* we cannot do anything with non-adjacent elements */ + if (next && next_elem_is_adjacent(elem)) { + len_after = RTE_PTR_DIFF(next, hide_end); + if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split after */ + split_elem(elem, hide_end); + + malloc_elem_free_list_insert(hide_end); + } else if (len_after > 0) { + RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n"); + return; + } + } + + /* we cannot do anything with non-adjacent elements */ + if (prev && prev_elem_is_adjacent(elem)) { + len_before = RTE_PTR_DIFF(hide_start, elem); + if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split before */ + split_elem(elem, hide_start); + + prev = elem; + elem = hide_start; + + malloc_elem_free_list_insert(prev); + } else if (len_before > 0) { + RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n"); + return; + } + } + + remove_elem(elem); +} + +/* + * attempt to resize a malloc_elem by expanding into any free space + * immediately after it in memory. + */ +int +malloc_elem_resize(struct malloc_elem *elem, size_t size) +{ + const size_t new_size = size + elem->pad + MALLOC_ELEM_OVERHEAD; + + /* if we request a smaller size, then always return ok */ + if (elem->size >= new_size) + return 0; + + /* check if there is a next element, it's free and adjacent */ + if (!elem->next || elem->next->state != ELEM_FREE || + !next_elem_is_adjacent(elem)) + return -1; + if (elem->size + elem->next->size < new_size) + return -1; + + /* we now know the element fits, so remove from free list, + * join the two + */ + malloc_elem_free_list_remove(elem->next); + join_elem(elem, elem->next); + + if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) { + /* now we have a big block together. 
Lets cut it down a bit, by splitting */ + struct malloc_elem *split_pt = RTE_PTR_ADD(elem, new_size); + split_pt = RTE_PTR_ALIGN_CEIL(split_pt, RTE_CACHE_LINE_SIZE); + split_elem(elem, split_pt); + malloc_elem_free_list_insert(split_pt); + } + return 0; +} + +static inline const char * +elem_state_to_str(enum elem_state state) +{ + switch (state) { + case ELEM_PAD: + return "PAD"; + case ELEM_BUSY: + return "BUSY"; + case ELEM_FREE: + return "FREE"; + } + return "ERROR"; +} + +void +malloc_elem_dump(const struct malloc_elem *elem, FILE *f) +{ + fprintf(f, "Malloc element at %p (%s)\n", elem, + elem_state_to_str(elem->state)); + fprintf(f, " len: 0x%zx pad: 0x%" PRIx32 "\n", elem->size, elem->pad); + fprintf(f, " prev: %p next: %p\n", elem->prev, elem->next); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h new file mode 100644 index 00000000..e2bda4c0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef MALLOC_ELEM_H_ +#define MALLOC_ELEM_H_ + +#include + +#include + +#define MIN_DATA_SIZE (RTE_CACHE_LINE_SIZE) + +/* dummy definition of struct so we can use pointers to it in malloc_elem struct */ +struct malloc_heap; + +enum elem_state { + ELEM_FREE = 0, + ELEM_BUSY, + ELEM_PAD /* element is a padding-only header */ +}; + +struct malloc_elem { + struct malloc_heap *heap; + struct malloc_elem *volatile prev; + /**< points to prev elem in memseg */ + struct malloc_elem *volatile next; + /**< points to next elem in memseg */ + LIST_ENTRY(malloc_elem) free_list; + /**< list of free elements in heap */ + struct rte_memseg_list *msl; + volatile enum elem_state state; + uint32_t pad; + size_t size; +#ifdef RTE_MALLOC_DEBUG + uint64_t header_cookie; /* Cookie marking start of data */ + /* trailer cookie at start + size */ +#endif +} __rte_cache_aligned; + +#ifndef RTE_MALLOC_DEBUG +static const unsigned MALLOC_ELEM_TRAILER_LEN = 0; + +/* dummy function - just check if pointer is non-null */ +static inline int +malloc_elem_cookies_ok(const struct malloc_elem *elem){ return elem != NULL; } + +/* dummy function - no header if malloc_debug is not enabled */ +static inline void +set_header(struct malloc_elem *elem __rte_unused){ } + +/* dummy function - no trailer if malloc_debug is not enabled */ +static inline void +set_trailer(struct malloc_elem *elem __rte_unused){ } + + +#else +static const unsigned MALLOC_ELEM_TRAILER_LEN = RTE_CACHE_LINE_SIZE; + +#define MALLOC_HEADER_COOKIE 0xbadbadbadadd2e55ULL /**< Header cookie. 
*/ +#define MALLOC_TRAILER_COOKIE 0xadd2e55badbadbadULL /**< Trailer cookie.*/ + +/* define macros to make referencing the header and trailer cookies easier */ +#define MALLOC_ELEM_TRAILER(elem) (*((uint64_t*)RTE_PTR_ADD(elem, \ + elem->size - MALLOC_ELEM_TRAILER_LEN))) +#define MALLOC_ELEM_HEADER(elem) (elem->header_cookie) + +static inline void +set_header(struct malloc_elem *elem) +{ + if (elem != NULL) + MALLOC_ELEM_HEADER(elem) = MALLOC_HEADER_COOKIE; +} + +static inline void +set_trailer(struct malloc_elem *elem) +{ + if (elem != NULL) + MALLOC_ELEM_TRAILER(elem) = MALLOC_TRAILER_COOKIE; +} + +/* check that the header and trailer cookies are set correctly */ +static inline int +malloc_elem_cookies_ok(const struct malloc_elem *elem) +{ + return elem != NULL && + MALLOC_ELEM_HEADER(elem) == MALLOC_HEADER_COOKIE && + MALLOC_ELEM_TRAILER(elem) == MALLOC_TRAILER_COOKIE; +} + +#endif + +static const unsigned MALLOC_ELEM_HEADER_LEN = sizeof(struct malloc_elem); +#define MALLOC_ELEM_OVERHEAD (MALLOC_ELEM_HEADER_LEN + MALLOC_ELEM_TRAILER_LEN) + +/* + * Given a pointer to the start of a memory block returned by malloc, get + * the actual malloc_elem header for that block. + */ +static inline struct malloc_elem * +malloc_elem_from_data(const void *data) +{ + if (data == NULL) + return NULL; + + struct malloc_elem *elem = RTE_PTR_SUB(data, MALLOC_ELEM_HEADER_LEN); + if (!malloc_elem_cookies_ok(elem)) + return NULL; + return elem->state != ELEM_PAD ? elem: RTE_PTR_SUB(elem, elem->pad); +} + +/* + * initialise a malloc_elem header + */ +void +malloc_elem_init(struct malloc_elem *elem, + struct malloc_heap *heap, + struct rte_memseg_list *msl, + size_t size); + +void +malloc_elem_insert(struct malloc_elem *elem); + +/* + * return true if the current malloc_elem can hold a block of data + * of the requested size and with the requested alignment + */ +int +malloc_elem_can_hold(struct malloc_elem *elem, size_t size, + unsigned int align, size_t bound, bool contig); + +/* + * reserve a block of data in an existing malloc_elem. If the malloc_elem + * is much larger than the data block requested, we split the element in two. + */ +struct malloc_elem * +malloc_elem_alloc(struct malloc_elem *elem, size_t size, + unsigned int align, size_t bound, bool contig); + +/* + * free a malloc_elem block by adding it to the free list. If the + * blocks either immediately before or immediately after newly freed block + * are also free, the blocks are merged together. + */ +struct malloc_elem * +malloc_elem_free(struct malloc_elem *elem); + +struct malloc_elem * +malloc_elem_join_adjacent_free(struct malloc_elem *elem); + +/* + * attempt to resize a malloc_elem by expanding into any free space + * immediately after it in memory. + */ +int +malloc_elem_resize(struct malloc_elem *elem, size_t size); + +void +malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len); + +void +malloc_elem_free_list_remove(struct malloc_elem *elem); + +/* + * dump contents of malloc elem to a file. + */ +void +malloc_elem_dump(const struct malloc_elem *elem, FILE *f); + +/* + * Given an element size, compute its freelist index. + */ +size_t +malloc_elem_free_list_index(size_t size); + +/* + * Add element to its heap's free list. + */ +void +malloc_elem_free_list_insert(struct malloc_elem *elem); + +/* + * Find biggest IOVA-contiguous zone within an element with specified alignment. 
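+ * Returns the length of that zone in bytes.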
+ */ +size_t +malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align); + +#endif /* MALLOC_ELEM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c new file mode 100644 index 00000000..02a5385d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c @@ -0,0 +1,1001 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "malloc_elem.h" +#include "malloc_heap.h" +#include "malloc_mp.h" + +static unsigned +check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) +{ + unsigned check_flag = 0; + + if (!(flags & ~RTE_MEMZONE_SIZE_HINT_ONLY)) + return 1; + + switch (hugepage_sz) { + case RTE_PGSIZE_256K: + check_flag = RTE_MEMZONE_256KB; + break; + case RTE_PGSIZE_2M: + check_flag = RTE_MEMZONE_2MB; + break; + case RTE_PGSIZE_16M: + check_flag = RTE_MEMZONE_16MB; + break; + case RTE_PGSIZE_256M: + check_flag = RTE_MEMZONE_256MB; + break; + case RTE_PGSIZE_512M: + check_flag = RTE_MEMZONE_512MB; + break; + case RTE_PGSIZE_1G: + check_flag = RTE_MEMZONE_1GB; + break; + case RTE_PGSIZE_4G: + check_flag = RTE_MEMZONE_4GB; + break; + case RTE_PGSIZE_16G: + check_flag = RTE_MEMZONE_16GB; + } + + return check_flag & flags; +} + +/* + * Expand the heap with a memory area. + */ +static struct malloc_elem * +malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl, + void *start, size_t len) +{ + struct malloc_elem *elem = start; + + malloc_elem_init(elem, heap, msl, len); + + malloc_elem_insert(elem); + + elem = malloc_elem_join_adjacent_free(elem); + + malloc_elem_free_list_insert(elem); + + return elem; +} + +static int +malloc_add_seg(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *found_msl; + struct malloc_heap *heap; + int msl_idx; + + heap = &mcfg->malloc_heaps[msl->socket_id]; + + /* msl is const, so find it */ + msl_idx = msl - mcfg->memsegs; + + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + found_msl = &mcfg->memsegs[msl_idx]; + + malloc_heap_add_memory(heap, found_msl, ms->addr, len); + + heap->total_size += len; + + RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20, + msl->socket_id); + return 0; +} + +/* + * Iterates through the freelist for a heap to find a free element + * which can store data of the required size and with the requested alignment. + * If size is 0, find the biggest available elem. + * Returns null on failure, or pointer to element on success. 
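+ * An element whose page size does not satisfy the requested flags is remembered as + * a fallback and returned only when RTE_MEMZONE_SIZE_HINT_ONLY is set.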
+ */ +static struct malloc_elem * +find_suitable_element(struct malloc_heap *heap, size_t size, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + size_t idx; + struct malloc_elem *elem, *alt_elem = NULL; + + for (idx = malloc_elem_free_list_index(size); + idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) { + if (malloc_elem_can_hold(elem, size, align, bound, + contig)) { + if (check_hugepage_sz(flags, + elem->msl->page_sz)) + return elem; + if (alt_elem == NULL) + alt_elem = elem; + } + } + } + + if ((alt_elem != NULL) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY)) + return alt_elem; + + return NULL; +} + +/* + * Iterates through the freelist for a heap to find a free element with the + * biggest size and requested alignment. Will also set size to whatever element + * size that was found. + * Returns null on failure, or pointer to element on success. + */ +static struct malloc_elem * +find_biggest_element(struct malloc_heap *heap, size_t *size, + unsigned int flags, size_t align, bool contig) +{ + struct malloc_elem *elem, *max_elem = NULL; + size_t idx, max_size = 0; + + for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) { + size_t cur_size; + if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 && + !check_hugepage_sz(flags, + elem->msl->page_sz)) + continue; + if (contig) { + cur_size = + malloc_elem_find_max_iova_contig(elem, + align); + } else { + void *data_start = RTE_PTR_ADD(elem, + MALLOC_ELEM_HEADER_LEN); + void *data_end = RTE_PTR_ADD(elem, elem->size - + MALLOC_ELEM_TRAILER_LEN); + void *aligned = RTE_PTR_ALIGN_CEIL(data_start, + align); + /* check if aligned data start is beyond end */ + if (aligned >= data_end) + continue; + cur_size = RTE_PTR_DIFF(data_end, aligned); + } + if (cur_size > max_size) { + max_size = cur_size; + max_elem = elem; + } + } + } + + *size = max_size; + return max_elem; +} + +/* + * Main function to allocate a block of memory from the heap. + * It locks the free list, scans it, and adds a new memseg if the + * scan fails. Once the new memseg is added, it re-scans and should return + * the new element after releasing the lock. + */ +static void * +heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct malloc_elem *elem; + + size = RTE_CACHE_LINE_ROUNDUP(size); + align = RTE_CACHE_LINE_ROUNDUP(align); + + elem = find_suitable_element(heap, size, flags, align, bound, contig); + if (elem != NULL) { + elem = malloc_elem_alloc(elem, size, align, bound, contig); + + /* increase heap's count of allocated elements */ + heap->alloc_count++; + } + + return elem == NULL ? NULL : (void *)(&elem[1]); +} + +static void * +heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused, + unsigned int flags, size_t align, bool contig) +{ + struct malloc_elem *elem; + size_t size; + + align = RTE_CACHE_LINE_ROUNDUP(align); + + elem = find_biggest_element(heap, &size, flags, align, contig); + if (elem != NULL) { + elem = malloc_elem_alloc(elem, size, align, 0, contig); + + /* increase heap's count of allocated elements */ + heap->alloc_count++; + } + + return elem == NULL ? 
NULL : (void *)(&elem[1]); +} + +/* this function is exposed in malloc_mp.h */ +void +rollback_expand_heap(struct rte_memseg **ms, int n_segs, + struct malloc_elem *elem, void *map_addr, size_t map_len) +{ + if (elem != NULL) { + malloc_elem_free_list_remove(elem); + malloc_elem_hide_region(elem, map_addr, map_len); + } + + eal_memalloc_free_seg_bulk(ms, n_segs); +} + +/* this function is exposed in malloc_mp.h */ +struct malloc_elem * +alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig, struct rte_memseg **ms, int n_segs) +{ + struct rte_memseg_list *msl; + struct malloc_elem *elem = NULL; + size_t alloc_sz; + int allocd_pages; + void *ret, *map_addr; + + alloc_sz = (size_t)pg_sz * n_segs; + + /* first, check if we're allowed to allocate this memory */ + if (eal_memalloc_mem_alloc_validate(socket, + heap->total_size + alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n"); + return NULL; + } + + allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz, + socket, true); + + /* make sure we've allocated our pages... */ + if (allocd_pages < 0) + return NULL; + + map_addr = ms[0]->addr; + msl = rte_mem_virt2memseg_list(map_addr); + + /* check if we wanted contiguous memory but didn't get it */ + if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n", + __func__); + goto fail; + } + + /* add newly minted memsegs to malloc heap */ + elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz); + + /* try once more, as now we have allocated new memory */ + ret = find_suitable_element(heap, elt_size, flags, align, bound, + contig); + + if (ret == NULL) + goto fail; + + return elem; + +fail: + rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz); + return NULL; +} + +static int +try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz, + size_t elt_size, int socket, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct malloc_elem *elem; + struct rte_memseg **ms; + void *map_addr; + size_t alloc_sz; + int n_segs; + bool callback_triggered = false; + + alloc_sz = RTE_ALIGN_CEIL(align + elt_size + + MALLOC_ELEM_TRAILER_LEN, pg_sz); + n_segs = alloc_sz / pg_sz; + + /* we can't know in advance how many pages we'll need, so we malloc */ + ms = malloc(sizeof(*ms) * n_segs); + + memset(ms, 0, sizeof(*ms) * n_segs); + + if (ms == NULL) + return -1; + + elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align, + bound, contig, ms, n_segs); + + if (elem == NULL) + goto free_ms; + + map_addr = ms[0]->addr; + + /* notify user about changes in memory map */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz); + + /* notify other processes that this has happened */ + if (request_sync()) { + /* we couldn't ensure all processes have mapped memory, + * so free it back and notify everyone that it's been + * freed back. + * + * technically, we could've avoided adding memory addresses to + * the map, but that would've led to inconsistent behavior + * between primary and secondary processes, as those get + * callbacks during sync. therefore, force primary process to + * do alloc-and-rollback syncs as well. 
+ */ + callback_triggered = true; + goto free_elem; + } + heap->total_size += alloc_sz; + + RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n", + socket, alloc_sz >> 20ULL); + + free(ms); + + return 0; + +free_elem: + if (callback_triggered) + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + map_addr, alloc_sz); + + rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz); + + request_sync(); +free_ms: + free(ms); + + return -1; +} + +static int +try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz, + size_t elt_size, int socket, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct malloc_mp_req req; + int req_result; + + memset(&req, 0, sizeof(req)); + + req.t = REQ_TYPE_ALLOC; + req.alloc_req.align = align; + req.alloc_req.bound = bound; + req.alloc_req.contig = contig; + req.alloc_req.flags = flags; + req.alloc_req.elt_size = elt_size; + req.alloc_req.page_sz = pg_sz; + req.alloc_req.socket = socket; + req.alloc_req.heap = heap; /* it's in shared memory */ + + req_result = request_to_primary(&req); + + if (req_result != 0) + return -1; + + if (req.result != REQ_RESULT_SUCCESS) + return -1; + + return 0; +} + +static int +try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret; + + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket, + flags, align, bound, contig); + } else { + ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket, + flags, align, bound, contig); + } + + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + return ret; +} + +static int +compare_pagesz(const void *a, const void *b) +{ + const struct rte_memseg_list * const*mpa = a; + const struct rte_memseg_list * const*mpb = b; + const struct rte_memseg_list *msla = *mpa; + const struct rte_memseg_list *mslb = *mpb; + uint64_t pg_sz_a = msla->page_sz; + uint64_t pg_sz_b = mslb->page_sz; + + if (pg_sz_a < pg_sz_b) + return -1; + if (pg_sz_a > pg_sz_b) + return 1; + return 0; +} + +static int +alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS]; + struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS]; + uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS]; + uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS]; + uint64_t prev_pg_sz; + int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz; + bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0; + unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + void *ret; + + memset(requested_msls, 0, sizeof(requested_msls)); + memset(other_msls, 0, sizeof(other_msls)); + memset(requested_pg_sz, 0, sizeof(requested_pg_sz)); + memset(other_pg_sz, 0, sizeof(other_pg_sz)); + + /* + * go through memseg list and take note of all the page sizes available, + * and if any of them were specifically requested by the user. 
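+ * Both lists are then sorted smallest page size first, and heap expansion is + * attempted with the explicitly requested page sizes before falling back to the others.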
+ */ + n_requested_msls = 0; + n_other_msls = 0; + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->socket_id != socket) + continue; + + if (msl->base_va == NULL) + continue; + + /* if pages of specific size were requested */ + if (size_flags != 0 && check_hugepage_sz(size_flags, + msl->page_sz)) + requested_msls[n_requested_msls++] = msl; + else if (size_flags == 0 || size_hint) + other_msls[n_other_msls++] = msl; + } + + /* sort the lists, smallest first */ + qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]), + compare_pagesz); + qsort(other_msls, n_other_msls, sizeof(other_msls[0]), + compare_pagesz); + + /* now, extract page sizes we are supposed to try */ + prev_pg_sz = 0; + n_requested_pg_sz = 0; + for (i = 0; i < n_requested_msls; i++) { + uint64_t pg_sz = requested_msls[i]->page_sz; + + if (prev_pg_sz != pg_sz) { + requested_pg_sz[n_requested_pg_sz++] = pg_sz; + prev_pg_sz = pg_sz; + } + } + prev_pg_sz = 0; + n_other_pg_sz = 0; + for (i = 0; i < n_other_msls; i++) { + uint64_t pg_sz = other_msls[i]->page_sz; + + if (prev_pg_sz != pg_sz) { + other_pg_sz[n_other_pg_sz++] = pg_sz; + prev_pg_sz = pg_sz; + } + } + + /* finally, try allocating memory of specified page sizes, starting from + * the smallest sizes + */ + for (i = 0; i < n_requested_pg_sz; i++) { + uint64_t pg_sz = requested_pg_sz[i]; + + /* + * do not pass the size hint here, as user expects other page + * sizes first, before resorting to best effort allocation. + */ + if (!try_expand_heap(heap, pg_sz, size, socket, size_flags, + align, bound, contig)) + return 0; + } + if (n_other_pg_sz == 0) + return -1; + + /* now, check if we can reserve anything with size hint */ + ret = find_suitable_element(heap, size, flags, align, bound, contig); + if (ret != NULL) + return 0; + + /* + * we still couldn't reserve memory, so try expanding heap with other + * page sizes, if there are any + */ + for (i = 0; i < n_other_pg_sz; i++) { + uint64_t pg_sz = other_pg_sz[i]; + + if (!try_expand_heap(heap, pg_sz, size, socket, flags, + align, bound, contig)) + return 0; + } + return -1; +} + +/* this will try lower page sizes first */ +static void * +heap_alloc_on_socket(const char *type, size_t size, int socket, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + void *ret; + + rte_spinlock_lock(&(heap->lock)); + + align = align == 0 ? 1 : align; + + /* for legacy mode, try once and with all flags */ + if (internal_config.legacy_mem) { + ret = heap_alloc(heap, type, size, flags, align, bound, contig); + goto alloc_unlock; + } + + /* + * we do not pass the size hint here, because even if allocation fails, + * we may still be able to allocate memory from appropriate page sizes, + * we just need to request more memory first. 
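+ * (The first attempt therefore uses size_flags, i.e. the caller's flags with + * RTE_MEMZONE_SIZE_HINT_ONLY cleared.)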
+ */ + ret = heap_alloc(heap, type, size, size_flags, align, bound, contig); + if (ret != NULL) + goto alloc_unlock; + + if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound, + contig)) { + ret = heap_alloc(heap, type, size, flags, align, bound, contig); + + /* this should have succeeded */ + if (ret == NULL) + RTE_LOG(ERR, EAL, "Error allocating from heap\n"); + } +alloc_unlock: + rte_spinlock_unlock(&(heap->lock)); + return ret; +} + +void * +malloc_heap_alloc(const char *type, size_t size, int socket_arg, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + int socket, i, cur_socket; + void *ret; + + /* return NULL if size is 0 or alignment is not power-of-2 */ + if (size == 0 || (align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + if (socket_arg == SOCKET_ID_ANY) + socket = malloc_get_numa_socket(); + else + socket = socket_arg; + + /* Check socket parameter */ + if (socket >= RTE_MAX_NUMA_NODES) + return NULL; + + ret = heap_alloc_on_socket(type, size, socket, flags, align, bound, + contig); + if (ret != NULL || socket_arg != SOCKET_ID_ANY) + return ret; + + /* try other heaps */ + for (i = 0; i < (int) rte_socket_count(); i++) { + cur_socket = rte_socket_id_by_idx(i); + if (cur_socket == socket) + continue; + ret = heap_alloc_on_socket(type, size, cur_socket, flags, + align, bound, contig); + if (ret != NULL) + return ret; + } + return NULL; +} + +static void * +heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags, + size_t align, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + void *ret; + + rte_spinlock_lock(&(heap->lock)); + + align = align == 0 ? 
1 : align; + + ret = heap_alloc_biggest(heap, type, flags, align, contig); + + rte_spinlock_unlock(&(heap->lock)); + + return ret; +} + +void * +malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, + size_t align, bool contig) +{ + int socket, i, cur_socket; + void *ret; + + /* return NULL if align is not power-of-2 */ + if ((align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + if (socket_arg == SOCKET_ID_ANY) + socket = malloc_get_numa_socket(); + else + socket = socket_arg; + + /* Check socket parameter */ + if (socket >= RTE_MAX_NUMA_NODES) + return NULL; + + ret = heap_alloc_biggest_on_socket(type, socket, flags, align, + contig); + if (ret != NULL || socket_arg != SOCKET_ID_ANY) + return ret; + + /* try other heaps */ + for (i = 0; i < (int) rte_socket_count(); i++) { + cur_socket = rte_socket_id_by_idx(i); + if (cur_socket == socket) + continue; + ret = heap_alloc_biggest_on_socket(type, cur_socket, flags, + align, contig); + if (ret != NULL) + return ret; + } + return NULL; +} + +/* this function is exposed in malloc_mp.h */ +int +malloc_heap_free_pages(void *aligned_start, size_t aligned_len) +{ + int n_segs, seg_idx, max_seg_idx; + struct rte_memseg_list *msl; + size_t page_sz; + + msl = rte_mem_virt2memseg_list(aligned_start); + if (msl == NULL) + return -1; + + page_sz = (size_t)msl->page_sz; + n_segs = aligned_len / page_sz; + seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz; + max_seg_idx = seg_idx + n_segs; + + for (; seg_idx < max_seg_idx; seg_idx++) { + struct rte_memseg *ms; + + ms = rte_fbarray_get(&msl->memseg_arr, seg_idx); + eal_memalloc_free_seg(ms); + } + return 0; +} + +int +malloc_heap_free(struct malloc_elem *elem) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap; + void *start, *aligned_start, *end, *aligned_end; + size_t len, aligned_len, page_sz; + struct rte_memseg_list *msl; + unsigned int i, n_segs, before_space, after_space; + int ret; + + if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) + return -1; + + /* elem may be merged with previous element, so keep heap address */ + heap = elem->heap; + msl = elem->msl; + page_sz = (size_t)msl->page_sz; + + rte_spinlock_lock(&(heap->lock)); + + /* mark element as free */ + elem->state = ELEM_FREE; + + elem = malloc_elem_free(elem); + + /* anything after this is a bonus */ + ret = 0; + + /* ...of which we can't avail if we are in legacy mode */ + if (internal_config.legacy_mem) + goto free_unlock; + + /* check if we can free any memory back to the system */ + if (elem->size < page_sz) + goto free_unlock; + + /* probably, but let's make sure, as we may not be using up full page */ + start = elem; + len = elem->size; + aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz); + end = RTE_PTR_ADD(elem, len); + aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz); + + aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); + + /* can't free anything */ + if (aligned_len < page_sz) + goto free_unlock; + + /* we can free something. 
however, some of these pages may be marked as + * unfreeable, so also check that as well + */ + n_segs = aligned_len / page_sz; + for (i = 0; i < n_segs; i++) { + const struct rte_memseg *tmp = + rte_mem_virt2memseg(aligned_start, msl); + + if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + /* this is an unfreeable segment, so move start */ + aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len); + } + } + + /* recalculate length and number of segments */ + aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); + n_segs = aligned_len / page_sz; + + /* check if we can still free some pages */ + if (n_segs == 0) + goto free_unlock; + + /* We're not done yet. We also have to check if by freeing space we will + * be leaving free elements that are too small to store new elements. + * Check if we have enough space in the beginning and at the end, or if + * start/end are exactly page aligned. + */ + before_space = RTE_PTR_DIFF(aligned_start, elem); + after_space = RTE_PTR_DIFF(end, aligned_end); + if (before_space != 0 && + before_space < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* There is not enough space before start, but we may be able to + * move the start forward by one page. + */ + if (n_segs == 1) + goto free_unlock; + + /* move start */ + aligned_start = RTE_PTR_ADD(aligned_start, page_sz); + aligned_len -= page_sz; + n_segs--; + } + if (after_space != 0 && after_space < + MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* There is not enough space after end, but we may be able to + * move the end backwards by one page. + */ + if (n_segs == 1) + goto free_unlock; + + /* move end */ + aligned_end = RTE_PTR_SUB(aligned_end, page_sz); + aligned_len -= page_sz; + n_segs--; + } + + /* now we can finally free us some pages */ + + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + /* + * we allow secondary processes to clear the heap of this allocated + * memory because it is safe to do so, as even if notifications about + * unmapped pages don't make it to other processes, heap is shared + * across all processes, and will become empty of this memory anyway, + * and nothing can allocate it back unless primary process will be able + * to deliver allocation message to every single running process. + */ + + malloc_elem_free_list_remove(elem); + + malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len); + + heap->total_size -= aligned_len; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* notify user about changes in memory map */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + aligned_start, aligned_len); + + /* don't care if any of this fails */ + malloc_heap_free_pages(aligned_start, aligned_len); + + request_sync(); + } else { + struct malloc_mp_req req; + + memset(&req, 0, sizeof(req)); + + req.t = REQ_TYPE_FREE; + req.free_req.addr = aligned_start; + req.free_req.len = aligned_len; + + /* + * we request primary to deallocate pages, but we don't do it + * in this thread. instead, we notify primary that we would like + * to deallocate pages, and this process will receive another + * request (in parallel) that will do it for us on another + * thread. + * + * we also don't really care if this succeeds - the data is + * already removed from the heap, so it is, for all intents and + * purposes, hidden from the rest of DPDK even if some other + * process (including this one) may have these pages mapped. + * + * notifications about deallocated memory happen during sync. 
+ */ + request_to_primary(&req); + } + + RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n", + msl->socket_id, aligned_len >> 20ULL); + + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); +free_unlock: + rte_spinlock_unlock(&(heap->lock)); + return ret; +} + +int +malloc_heap_resize(struct malloc_elem *elem, size_t size) +{ + int ret; + + if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) + return -1; + + rte_spinlock_lock(&(elem->heap->lock)); + + ret = malloc_elem_resize(elem, size); + + rte_spinlock_unlock(&(elem->heap->lock)); + + return ret; +} + +/* + * Function to retrieve data for heap on given socket + */ +int +malloc_heap_get_stats(struct malloc_heap *heap, + struct rte_malloc_socket_stats *socket_stats) +{ + size_t idx; + struct malloc_elem *elem; + + rte_spinlock_lock(&heap->lock); + + /* Initialise variables for heap */ + socket_stats->free_count = 0; + socket_stats->heap_freesz_bytes = 0; + socket_stats->greatest_free_size = 0; + + /* Iterate through free list */ + for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) + { + socket_stats->free_count++; + socket_stats->heap_freesz_bytes += elem->size; + if (elem->size > socket_stats->greatest_free_size) + socket_stats->greatest_free_size = elem->size; + } + } + /* Get stats on overall heap and allocated memory on this heap */ + socket_stats->heap_totalsz_bytes = heap->total_size; + socket_stats->heap_allocsz_bytes = (socket_stats->heap_totalsz_bytes - + socket_stats->heap_freesz_bytes); + socket_stats->alloc_count = heap->alloc_count; + + rte_spinlock_unlock(&heap->lock); + return 0; +} + +/* + * Function to retrieve data for heap on given socket + */ +void +malloc_heap_dump(struct malloc_heap *heap, FILE *f) +{ + struct malloc_elem *elem; + + rte_spinlock_lock(&heap->lock); + + fprintf(f, "Heap size: 0x%zx\n", heap->total_size); + fprintf(f, "Heap alloc count: %u\n", heap->alloc_count); + + elem = heap->first; + while (elem) { + malloc_elem_dump(elem, f); + elem = elem->next; + } + + rte_spinlock_unlock(&heap->lock); +} + +int +rte_eal_malloc_heap_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + if (register_mp_requests()) { + RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n"); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return -1; + } + + /* unlock mem hotplug here. it's safe for primary as no requests can + * even come before primary itself is fully initialized, and secondaries + * do not need to initialize the heap. 
+ */ + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + /* secondary process does not need to initialize anything */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + /* add all IOVA-contiguous areas to the heap */ + return rte_memseg_contig_walk(malloc_add_seg, NULL); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h new file mode 100644 index 00000000..f52cb555 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef MALLOC_HEAP_H_ +#define MALLOC_HEAP_H_ + +#include + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static inline unsigned +malloc_get_numa_socket(void) +{ + unsigned socket_id = rte_socket_id(); + + if (socket_id == (unsigned)SOCKET_ID_ANY) + return 0; + + return socket_id; +} + +void * +malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags, + size_t align, size_t bound, bool contig); + +void * +malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags, + size_t align, bool contig); + +int +malloc_heap_free(struct malloc_elem *elem); + +int +malloc_heap_resize(struct malloc_elem *elem, size_t size); + +int +malloc_heap_get_stats(struct malloc_heap *heap, + struct rte_malloc_socket_stats *socket_stats); + +void +malloc_heap_dump(struct malloc_heap *heap, FILE *f); + +int +rte_eal_malloc_heap_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* MALLOC_HEAP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c new file mode 100644 index 00000000..931c14bc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c @@ -0,0 +1,743 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include + +#include +#include +#include + +#include "eal_memalloc.h" + +#include "malloc_elem.h" +#include "malloc_mp.h" + +#define MP_ACTION_SYNC "mp_malloc_sync" +/**< request sent by primary process to notify of changes in memory map */ +#define MP_ACTION_ROLLBACK "mp_malloc_rollback" +/**< request sent by primary process to notify of changes in memory map. this is + * essentially a regular sync request, but we cannot send sync requests while + * another one is in progress, and we might have to - therefore, we do this as + * a separate callback. 
+ */ +#define MP_ACTION_REQUEST "mp_malloc_request" +/**< request sent by secondary process to ask for allocation/deallocation */ +#define MP_ACTION_RESPONSE "mp_malloc_response" +/**< response sent to secondary process to indicate result of request */ + +/* forward declarations */ +static int +handle_sync_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); +static int +handle_rollback_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +/* when we're allocating, we need to store some state to ensure that we can + * roll back later + */ +struct primary_alloc_req_state { + struct malloc_heap *heap; + struct rte_memseg **ms; + int ms_len; + struct malloc_elem *elem; + void *map_addr; + size_t map_len; +}; + +enum req_state { + REQ_STATE_INACTIVE = 0, + REQ_STATE_ACTIVE, + REQ_STATE_COMPLETE +}; + +struct mp_request { + TAILQ_ENTRY(mp_request) next; + struct malloc_mp_req user_req; /**< contents of request */ + pthread_cond_t cond; /**< variable we use to time out on this request */ + enum req_state state; /**< indicate status of this request */ + struct primary_alloc_req_state alloc_state; +}; + +/* + * We could've used just a single request, but it may be possible for + * secondaries to timeout earlier than the primary, and send a new request while + * primary is still expecting replies to the old one. Therefore, each new + * request will get assigned a new ID, which is how we will distinguish between + * expected and unexpected messages. + */ +TAILQ_HEAD(mp_request_list, mp_request); +static struct { + struct mp_request_list list; + pthread_mutex_t lock; +} mp_request_list = { + .list = TAILQ_HEAD_INITIALIZER(mp_request_list.list), + .lock = PTHREAD_MUTEX_INITIALIZER +}; + +/** + * General workflow is the following: + * + * Allocation: + * S: send request to primary + * P: attempt to allocate memory + * if failed, sendmsg failure + * if success, send sync request + * S: if received msg of failure, quit + * if received sync request, synchronize memory map and reply with result + * P: if received sync request result + * if success, sendmsg success + * if failure, roll back allocation and send a rollback request + * S: if received msg of success, quit + * if received rollback request, synchronize memory map and reply with result + * P: if received sync request result + * sendmsg sync request result + * S: if received msg, quit + * + * Aside from timeouts, there are three points where we can quit: + * - if allocation failed straight away + * - if allocation and sync request succeeded + * - if allocation succeeded, sync request failed, allocation rolled back and + * rollback request received (irrespective of whether it succeeded or failed) + * + * Deallocation: + * S: send request to primary + * P: attempt to deallocate memory + * if failed, sendmsg failure + * if success, send sync request + * S: if received msg of failure, quit + * if received sync request, synchronize memory map and reply with result + * P: if received sync request result + * sendmsg sync request result + * S: if received msg, quit + * + * There is no "rollback" from deallocation, as it's safe to have some memory + * mapped in some processes - it's absent from the heap, so it won't get used. 
+ */
+
+static struct mp_request *
+find_request_by_id(uint64_t id)
+{
+	struct mp_request *req;
+	TAILQ_FOREACH(req, &mp_request_list.list, next) {
+		if (req->user_req.id == id)
+			break;
+	}
+	return req;
+}
+
+/* this ID is, like, totally guaranteed to be absolutely unique. pinky swear. */
+static uint64_t
+get_unique_id(void)
+{
+	uint64_t id;
+	do {
+		id = rte_rand();
+	} while (find_request_by_id(id) != NULL);
+	return id;
+}
+
+/* secondary will respond to sync requests thusly */
+static int
+handle_sync(const struct rte_mp_msg *msg, const void *peer)
+{
+	struct rte_mp_msg reply;
+	const struct malloc_mp_req *req =
+			(const struct malloc_mp_req *)msg->param;
+	struct malloc_mp_req *resp =
+			(struct malloc_mp_req *)reply.param;
+	int ret;
+
+	if (req->t != REQ_TYPE_SYNC) {
+		RTE_LOG(ERR, EAL, "Unexpected request from primary\n");
+		return -1;
+	}
+
+	memset(&reply, 0, sizeof(reply));
+
+	reply.num_fds = 0;
+	strlcpy(reply.name, msg->name, sizeof(reply.name));
+	reply.len_param = sizeof(*resp);
+
+	ret = eal_memalloc_sync_with_primary();
+
+	resp->t = REQ_TYPE_SYNC;
+	resp->id = req->id;
+	resp->result = ret == 0 ? REQ_RESULT_SUCCESS : REQ_RESULT_FAIL;
+
+	rte_mp_reply(&reply, peer);
+
+	return 0;
+}
+
+static int
+handle_alloc_request(const struct malloc_mp_req *m,
+		struct mp_request *req)
+{
+	const struct malloc_req_alloc *ar = &m->alloc_req;
+	struct malloc_heap *heap;
+	struct malloc_elem *elem;
+	struct rte_memseg **ms;
+	size_t alloc_sz;
+	int n_segs;
+	void *map_addr;
+
+	alloc_sz = RTE_ALIGN_CEIL(ar->align + ar->elt_size +
+			MALLOC_ELEM_TRAILER_LEN, ar->page_sz);
+	n_segs = alloc_sz / ar->page_sz;
+
+	heap = ar->heap;
+
+	/* we can't know in advance how many pages we'll need, so we malloc */
+	ms = malloc(sizeof(*ms) * n_segs);
+	if (ms == NULL) {
+		RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n");
+		goto fail;
+	}
+
+	memset(ms, 0, sizeof(*ms) * n_segs);
+
+	elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket,
+			ar->flags, ar->align, ar->bound, ar->contig, ms,
+			n_segs);
+
+	if (elem == NULL)
+		goto fail;
+
+	map_addr = ms[0]->addr;
+
+	/* we have succeeded in allocating memory, but we still need to sync
+	 * with other processes. however, since DPDK IPC is single-threaded, we
+	 * send an asynchronous request and exit this callback.
+ */ + + req->alloc_state.ms = ms; + req->alloc_state.ms_len = n_segs; + req->alloc_state.map_addr = map_addr; + req->alloc_state.map_len = alloc_sz; + req->alloc_state.elem = elem; + req->alloc_state.heap = heap; + + return 0; +fail: + free(ms); + return -1; +} + +/* first stage of primary handling requests from secondary */ +static int +handle_request(const struct rte_mp_msg *msg, const void *peer __rte_unused) +{ + const struct malloc_mp_req *m = + (const struct malloc_mp_req *)msg->param; + struct mp_request *entry; + int ret; + + /* lock access to request */ + pthread_mutex_lock(&mp_request_list.lock); + + /* make sure it's not a dupe */ + entry = find_request_by_id(m->id); + if (entry != NULL) { + RTE_LOG(ERR, EAL, "Duplicate request id\n"); + goto fail; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate memory for request\n"); + goto fail; + } + + /* erase all data */ + memset(entry, 0, sizeof(*entry)); + + if (m->t == REQ_TYPE_ALLOC) { + ret = handle_alloc_request(m, entry); + } else if (m->t == REQ_TYPE_FREE) { + ret = malloc_heap_free_pages(m->free_req.addr, + m->free_req.len); + } else { + RTE_LOG(ERR, EAL, "Unexpected request from secondary\n"); + goto fail; + } + + if (ret != 0) { + struct rte_mp_msg resp_msg; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)resp_msg.param; + + /* send failure message straight away */ + resp_msg.num_fds = 0; + resp_msg.len_param = sizeof(*resp); + strlcpy(resp_msg.name, MP_ACTION_RESPONSE, + sizeof(resp_msg.name)); + + resp->t = m->t; + resp->result = REQ_RESULT_FAIL; + resp->id = m->id; + + if (rte_mp_sendmsg(&resp_msg)) { + RTE_LOG(ERR, EAL, "Couldn't send response\n"); + goto fail; + } + /* we did not modify the request */ + free(entry); + } else { + struct rte_mp_msg sr_msg; + struct malloc_mp_req *sr = + (struct malloc_mp_req *)sr_msg.param; + struct timespec ts; + + memset(&sr_msg, 0, sizeof(sr_msg)); + + /* we can do something, so send sync request asynchronously */ + sr_msg.num_fds = 0; + sr_msg.len_param = sizeof(*sr); + strlcpy(sr_msg.name, MP_ACTION_SYNC, sizeof(sr_msg.name)); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* sync requests carry no data */ + sr->t = REQ_TYPE_SYNC; + sr->id = m->id; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_async(&sr_msg, &ts, + handle_sync_response); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't send sync request\n"); + if (m->t == REQ_TYPE_ALLOC) + free(entry->alloc_state.ms); + goto fail; + } + + /* mark request as in progress */ + memcpy(&entry->user_req, m, sizeof(*m)); + entry->state = REQ_STATE_ACTIVE; + + TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next); + } + pthread_mutex_unlock(&mp_request_list.lock); + return 0; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + free(entry); + return -1; +} + +/* callback for asynchronous sync requests for primary. this will either do a + * sendmsg with results, or trigger rollback request. 
+ */ +static int +handle_sync_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply) +{ + enum malloc_req_result result; + struct mp_request *entry; + const struct malloc_mp_req *mpreq = + (const struct malloc_mp_req *)request->param; + int i; + + /* lock the request */ + pthread_mutex_lock(&mp_request_list.lock); + + entry = find_request_by_id(mpreq->id); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + goto fail; + } + + result = REQ_RESULT_SUCCESS; + + if (reply->nb_received != reply->nb_sent) + result = REQ_RESULT_FAIL; + + for (i = 0; i < reply->nb_received; i++) { + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply->msgs[i].param; + + if (resp->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected response to sync request\n"); + result = REQ_RESULT_FAIL; + break; + } + if (resp->id != entry->user_req.id) { + RTE_LOG(ERR, EAL, "Response to wrong sync request\n"); + result = REQ_RESULT_FAIL; + break; + } + if (resp->result == REQ_RESULT_FAIL) { + result = REQ_RESULT_FAIL; + break; + } + } + + if (entry->user_req.t == REQ_TYPE_FREE) { + struct rte_mp_msg msg; + struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param; + + memset(&msg, 0, sizeof(msg)); + + /* this is a free request, just sendmsg result */ + resp->t = REQ_TYPE_FREE; + resp->result = result; + resp->id = entry->user_req.id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry); + } else if (entry->user_req.t == REQ_TYPE_ALLOC && + result == REQ_RESULT_SUCCESS) { + struct malloc_heap *heap = entry->alloc_state.heap; + struct rte_mp_msg msg; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)msg.param; + + memset(&msg, 0, sizeof(msg)); + + heap->total_size += entry->alloc_state.map_len; + + /* result is success, so just notify secondary about this */ + resp->t = REQ_TYPE_ALLOC; + resp->result = result; + resp->id = entry->user_req.id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry->alloc_state.ms); + free(entry); + } else if (entry->user_req.t == REQ_TYPE_ALLOC && + result == REQ_RESULT_FAIL) { + struct rte_mp_msg rb_msg; + struct malloc_mp_req *rb = + (struct malloc_mp_req *)rb_msg.param; + struct timespec ts; + struct primary_alloc_req_state *state = + &entry->alloc_state; + int ret; + + memset(&rb_msg, 0, sizeof(rb_msg)); + + /* we've failed to sync, so do a rollback */ + rollback_expand_heap(state->ms, state->ms_len, state->elem, + state->map_addr, state->map_len); + + /* send rollback request */ + rb_msg.num_fds = 0; + rb_msg.len_param = sizeof(*rb); + strlcpy(rb_msg.name, MP_ACTION_ROLLBACK, sizeof(rb_msg.name)); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* sync requests carry no data */ + rb->t = REQ_TYPE_SYNC; + rb->id = entry->user_req.id; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_async(&rb_msg, &ts, + handle_rollback_response); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Could not send rollback request to secondary process\n"); + + /* we couldn't send rollback request, but that's OK - + * secondary will time out, 
and memory has been removed
+			 * from heap anyway.
+			 */
+			TAILQ_REMOVE(&mp_request_list.list, entry, next);
+			free(state->ms);
+			free(entry);
+			goto fail;
+		}
+	} else {
+		RTE_LOG(ERR, EAL, "Unexpected response to sync request of unknown type\n");
+		goto fail;
+	}
+
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return 0;
+fail:
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return -1;
+}
+
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+		const struct rte_mp_reply *reply __rte_unused)
+{
+	struct rte_mp_msg msg;
+	struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+	const struct malloc_mp_req *mpreq =
+			(const struct malloc_mp_req *)request->param;
+	struct mp_request *entry;
+
+	/* lock the request */
+	pthread_mutex_lock(&mp_request_list.lock);
+
+	memset(&msg, 0, sizeof(msg));
+
+	entry = find_request_by_id(mpreq->id);
+	if (entry == NULL) {
+		RTE_LOG(ERR, EAL, "Wrong request ID\n");
+		goto fail;
+	}
+
+	if (entry->user_req.t != REQ_TYPE_ALLOC) {
+		RTE_LOG(ERR, EAL, "Unexpected active request\n");
+		goto fail;
+	}
+
+	/* we don't care if rollback succeeded, request still failed */
+	resp->t = REQ_TYPE_ALLOC;
+	resp->result = REQ_RESULT_FAIL;
+	resp->id = mpreq->id;
+	msg.num_fds = 0;
+	msg.len_param = sizeof(*resp);
+	strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name));
+
+	if (rte_mp_sendmsg(&msg))
+		RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+	/* clean up */
+	TAILQ_REMOVE(&mp_request_list.list, entry, next);
+	free(entry->alloc_state.ms);
+	free(entry);
+
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return 0;
+fail:
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return -1;
+}
+
+/* final stage of the request from secondary */
+static int
+handle_response(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+	const struct malloc_mp_req *m =
+			(const struct malloc_mp_req *)msg->param;
+	struct mp_request *entry;
+
+	pthread_mutex_lock(&mp_request_list.lock);
+
+	entry = find_request_by_id(m->id);
+	if (entry != NULL) {
+		/* update request status */
+		entry->user_req.result = m->result;
+
+		entry->state = REQ_STATE_COMPLETE;
+
+		/* trigger thread wakeup */
+		pthread_cond_signal(&entry->cond);
+	}
+
+	pthread_mutex_unlock(&mp_request_list.lock);
+
+	return 0;
+}
+
+/* synchronously request memory map sync, this is only called whenever primary
+ * process initiates the allocation.
+ */ +int +request_sync(void) +{ + struct rte_mp_msg msg; + struct rte_mp_reply reply; + struct malloc_mp_req *req = (struct malloc_mp_req *)msg.param; + struct timespec ts; + int i, ret; + + memset(&msg, 0, sizeof(msg)); + memset(&reply, 0, sizeof(reply)); + + /* no need to create tailq entries as this is entirely synchronous */ + + msg.num_fds = 0; + msg.len_param = sizeof(*req); + strlcpy(msg.name, MP_ACTION_SYNC, sizeof(msg.name)); + + /* sync request carries no data */ + req->t = REQ_TYPE_SYNC; + req->id = get_unique_id(); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_sync(&msg, &reply, &ts); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Could not send sync request to secondary process\n"); + ret = -1; + goto out; + } + + if (reply.nb_received != reply.nb_sent) { + RTE_LOG(ERR, EAL, "Not all secondaries have responded\n"); + ret = -1; + goto out; + } + + for (i = 0; i < reply.nb_received; i++) { + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply.msgs[i].param; + if (resp->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected response from secondary\n"); + ret = -1; + goto out; + } + if (resp->id != req->id) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + ret = -1; + goto out; + } + if (resp->result != REQ_RESULT_SUCCESS) { + RTE_LOG(ERR, EAL, "Secondary process failed to synchronize\n"); + ret = -1; + goto out; + } + } + + ret = 0; +out: + free(reply.msgs); + return ret; +} + +/* this is a synchronous wrapper around a bunch of asynchronous requests to + * primary process. this will initiate a request and wait until responses come. + */ +int +request_to_primary(struct malloc_mp_req *user_req) +{ + struct rte_mp_msg msg; + struct malloc_mp_req *msg_req = (struct malloc_mp_req *)msg.param; + struct mp_request *entry; + struct timespec ts; + struct timeval now; + int ret; + + memset(&msg, 0, sizeof(msg)); + memset(&ts, 0, sizeof(ts)); + + pthread_mutex_lock(&mp_request_list.lock); + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for request\n"); + goto fail; + } + + memset(entry, 0, sizeof(*entry)); + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + goto fail; + } + + ts.tv_nsec = (now.tv_usec * 1000) % 1000000000; + ts.tv_sec = now.tv_sec + MP_TIMEOUT_S + + (now.tv_usec * 1000) / 1000000000; + + /* initialize the request */ + pthread_cond_init(&entry->cond, NULL); + + msg.num_fds = 0; + msg.len_param = sizeof(*msg_req); + strlcpy(msg.name, MP_ACTION_REQUEST, sizeof(msg.name)); + + /* (attempt to) get a unique id */ + user_req->id = get_unique_id(); + + /* copy contents of user request into the message */ + memcpy(msg_req, user_req, sizeof(*msg_req)); + + if (rte_mp_sendmsg(&msg)) { + RTE_LOG(ERR, EAL, "Cannot send message to primary\n"); + goto fail; + } + + /* copy contents of user request into active request */ + memcpy(&entry->user_req, user_req, sizeof(*user_req)); + + /* mark request as in progress */ + entry->state = REQ_STATE_ACTIVE; + + TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next); + + /* finally, wait on timeout */ + do { + ret = pthread_cond_timedwait(&entry->cond, + &mp_request_list.lock, &ts); + } while (ret != 0 && ret != ETIMEDOUT); + + if (entry->state != REQ_STATE_COMPLETE) { + RTE_LOG(ERR, EAL, "Request timed out\n"); + ret = -1; + } else { + ret = 0; + user_req->result = entry->user_req.result; + } + TAILQ_REMOVE(&mp_request_list.list, 
entry, next); + free(entry); + + pthread_mutex_unlock(&mp_request_list.lock); + return ret; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + free(entry); + return -1; +} + +int +register_mp_requests(void) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + if (rte_mp_action_register(MP_ACTION_REQUEST, handle_request)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_REQUEST); + return -1; + } + } else { + if (rte_mp_action_register(MP_ACTION_SYNC, handle_sync)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_SYNC); + return -1; + } + if (rte_mp_action_register(MP_ACTION_ROLLBACK, handle_sync)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_SYNC); + return -1; + } + if (rte_mp_action_register(MP_ACTION_RESPONSE, + handle_response)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_RESPONSE); + return -1; + } + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h new file mode 100644 index 00000000..2b86b76f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef MALLOC_MP_H +#define MALLOC_MP_H + +#include +#include + +#include +#include +#include +#include + +/* forward declarations */ +struct malloc_heap; +struct rte_memseg; + +/* multiprocess synchronization structures for malloc */ +enum malloc_req_type { + REQ_TYPE_ALLOC, /**< ask primary to allocate */ + REQ_TYPE_FREE, /**< ask primary to free */ + REQ_TYPE_SYNC /**< ask secondary to synchronize its memory map */ +}; + +enum malloc_req_result { + REQ_RESULT_SUCCESS, + REQ_RESULT_FAIL +}; + +struct malloc_req_alloc { + struct malloc_heap *heap; + uint64_t page_sz; + size_t elt_size; + int socket; + unsigned int flags; + size_t align; + size_t bound; + bool contig; +}; + +struct malloc_req_free { + RTE_STD_C11 + union { + void *addr; + uint64_t addr_64; + }; + uint64_t len; +}; + +struct malloc_mp_req { + enum malloc_req_type t; + RTE_STD_C11 + union { + struct malloc_req_alloc alloc_req; + struct malloc_req_free free_req; + }; + uint64_t id; /**< not to be populated by caller */ + enum malloc_req_result result; +}; + +int +register_mp_requests(void); + +int +request_to_primary(struct malloc_mp_req *req); + +/* synchronous memory map sync request */ +int +request_sync(void); + +/* functions from malloc_heap exposed here */ +int +malloc_heap_free_pages(void *aligned_start, size_t aligned_len); + +struct malloc_elem * +alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig, struct rte_memseg **ms, int n_segs); + +void +rollback_expand_heap(struct rte_memseg **ms, int n_segs, + struct malloc_elem *elem, void *map_addr, size_t map_len); + +#endif /* MALLOC_MP_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/meson.build b/src/spdk/dpdk/lib/librte_eal/common/meson.build new file mode 100644 index 00000000..56005bea --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/meson.build @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +eal_inc += include_directories('.', 'include', + join_paths('include/arch', arch_subdir)) + +common_objs = [] +common_sources = files( + 'eal_common_bus.c', + 'eal_common_cpuflags.c', + 'eal_common_class.c', + 'eal_common_devargs.c', + 'eal_common_dev.c', + 
'eal_common_errno.c', + 'eal_common_fbarray.c', + 'eal_common_hexdump.c', + 'eal_common_launch.c', + 'eal_common_lcore.c', + 'eal_common_log.c', + 'eal_common_memalloc.c', + 'eal_common_memory.c', + 'eal_common_memzone.c', + 'eal_common_options.c', + 'eal_common_proc.c', + 'eal_common_string_fns.c', + 'eal_common_tailqs.c', + 'eal_common_thread.c', + 'eal_common_timer.c', + 'eal_common_uuid.c', + 'malloc_elem.c', + 'malloc_heap.c', + 'malloc_mp.c', + 'rte_keepalive.c', + 'rte_malloc.c', + 'rte_reciprocal.c', + 'rte_service.c' +) + +# get architecture specific sources and objs +eal_common_arch_sources = [] +eal_common_arch_objs = [] +subdir(join_paths('arch', arch_subdir)) +common_sources += eal_common_arch_sources +common_objs += eal_common_arch_objs + +common_headers = files( + 'include/rte_alarm.h', + 'include/rte_branch_prediction.h', + 'include/rte_bus.h', + 'include/rte_bitmap.h', + 'include/rte_class.h', + 'include/rte_common.h', + 'include/rte_debug.h', + 'include/rte_devargs.h', + 'include/rte_dev.h', + 'include/rte_eal.h', + 'include/rte_eal_memconfig.h', + 'include/rte_eal_interrupts.h', + 'include/rte_errno.h', + 'include/rte_fbarray.h', + 'include/rte_hexdump.h', + 'include/rte_interrupts.h', + 'include/rte_keepalive.h', + 'include/rte_launch.h', + 'include/rte_lcore.h', + 'include/rte_log.h', + 'include/rte_malloc.h', + 'include/rte_malloc_heap.h', + 'include/rte_memory.h', + 'include/rte_memzone.h', + 'include/rte_pci_dev_feature_defs.h', + 'include/rte_pci_dev_features.h', + 'include/rte_per_lcore.h', + 'include/rte_random.h', + 'include/rte_reciprocal.h', + 'include/rte_service.h', + 'include/rte_service_component.h', + 'include/rte_string_fns.h', + 'include/rte_tailq.h', + 'include/rte_time.h', + 'include/rte_uuid.h', + 'include/rte_version.h') + +# special case install the generic headers, since they go in a subdir +generic_headers = files( + 'include/generic/rte_atomic.h', + 'include/generic/rte_byteorder.h', + 'include/generic/rte_cpuflags.h', + 'include/generic/rte_cycles.h', + 'include/generic/rte_io.h', + 'include/generic/rte_memcpy.h', + 'include/generic/rte_pause.h', + 'include/generic/rte_prefetch.h', + 'include/generic/rte_rwlock.h', + 'include/generic/rte_spinlock.h', + 'include/generic/rte_vect.h') +install_headers(generic_headers, subdir: 'generic') + +# get and install the architecture specific headers +subdir(join_paths('include/arch', arch_subdir)) diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c b/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c new file mode 100644 index 00000000..e0494b20 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015-2016 Intel Corporation + */ + +#include + +#include +#include +#include +#include +#include +#include + +struct rte_keepalive { + /** Core Liveness. */ + struct { + /* + * Each element must be cache aligned to prevent false sharing. + */ + enum rte_keepalive_state core_state __rte_cache_aligned; + } live_data[RTE_KEEPALIVE_MAXCORES]; + + /** Last-seen-alive timestamps */ + uint64_t last_alive[RTE_KEEPALIVE_MAXCORES]; + + /** + * Cores to check. + * Indexed by core id, non-zero if the core should be checked. + */ + uint8_t active_cores[RTE_KEEPALIVE_MAXCORES]; + + /** Dead core handler. */ + rte_keepalive_failure_callback_t callback; + + /** + * Dead core handler app data. + * Pointer is passed to dead core handler. 
+ */ + void *callback_data; + uint64_t tsc_initial; + uint64_t tsc_mhz; + + /** Core state relay handler. */ + rte_keepalive_relay_callback_t relay_callback; + + /** + * Core state relay handler app data. + * Pointer is passed to live core handler. + */ + void *relay_callback_data; +}; + +static void +print_trace(const char *msg, struct rte_keepalive *keepcfg, int idx_core) +{ + RTE_LOG(INFO, EAL, "%sLast seen %" PRId64 "ms ago.\n", + msg, + ((rte_rdtsc() - keepcfg->last_alive[idx_core])*1000) + / rte_get_tsc_hz() + ); +} + +void +rte_keepalive_dispatch_pings(__rte_unused void *ptr_timer, + void *ptr_data) +{ + struct rte_keepalive *keepcfg = ptr_data; + int idx_core; + + for (idx_core = 0; idx_core < RTE_KEEPALIVE_MAXCORES; idx_core++) { + if (keepcfg->active_cores[idx_core] == 0) + continue; + + switch (keepcfg->live_data[idx_core].core_state) { + case RTE_KA_STATE_UNUSED: + break; + case RTE_KA_STATE_ALIVE: /* Alive */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_MISSING; + keepcfg->last_alive[idx_core] = rte_rdtsc(); + break; + case RTE_KA_STATE_MISSING: /* MIA */ + print_trace("Core MIA. ", keepcfg, idx_core); + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_DEAD; + break; + case RTE_KA_STATE_DEAD: /* Dead */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_GONE; + print_trace("Core died. ", keepcfg, idx_core); + if (keepcfg->callback) + keepcfg->callback( + keepcfg->callback_data, + idx_core + ); + break; + case RTE_KA_STATE_GONE: /* Buried */ + break; + case RTE_KA_STATE_DOZING: /* Core going idle */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_SLEEP; + keepcfg->last_alive[idx_core] = rte_rdtsc(); + break; + case RTE_KA_STATE_SLEEP: /* Idled core */ + break; + } + if (keepcfg->relay_callback) + keepcfg->relay_callback( + keepcfg->relay_callback_data, + idx_core, + keepcfg->live_data[idx_core].core_state, + keepcfg->last_alive[idx_core] + ); + } +} + +struct rte_keepalive * +rte_keepalive_create(rte_keepalive_failure_callback_t callback, + void *data) +{ + struct rte_keepalive *keepcfg; + + keepcfg = rte_zmalloc("RTE_EAL_KEEPALIVE", + sizeof(struct rte_keepalive), + RTE_CACHE_LINE_SIZE); + if (keepcfg != NULL) { + keepcfg->callback = callback; + keepcfg->callback_data = data; + keepcfg->tsc_initial = rte_rdtsc(); + keepcfg->tsc_mhz = rte_get_tsc_hz() / 1000; + } + return keepcfg; +} + +void rte_keepalive_register_relay_callback(struct rte_keepalive *keepcfg, + rte_keepalive_relay_callback_t callback, + void *data) +{ + keepcfg->relay_callback = callback; + keepcfg->relay_callback_data = data; +} + +void +rte_keepalive_register_core(struct rte_keepalive *keepcfg, const int id_core) +{ + if (id_core < RTE_KEEPALIVE_MAXCORES) { + keepcfg->active_cores[id_core] = RTE_KA_STATE_ALIVE; + keepcfg->last_alive[id_core] = rte_rdtsc(); + } +} + +void +rte_keepalive_mark_alive(struct rte_keepalive *keepcfg) +{ + keepcfg->live_data[rte_lcore_id()].core_state = RTE_KA_STATE_ALIVE; +} + +void +rte_keepalive_mark_sleep(struct rte_keepalive *keepcfg) +{ + keepcfg->live_data[rte_lcore_id()].core_state = RTE_KA_STATE_DOZING; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c b/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c new file mode 100644 index 00000000..b51a6d11 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include 
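The keepalive module above is driven from two sides: each monitored worker lcore periodically calls rte_keepalive_mark_alive() (or rte_keepalive_mark_sleep() when idling), while a monitor core runs rte_keepalive_dispatch_pings() at a fixed interval, normally from an rte_timer, to advance each core's state machine and invoke the failure callback once a core goes quiet. A minimal usage sketch under those assumptions follows; the dead_core_cb callback, the launch loop and the one-second polling cadence are illustrative and not part of the imported sources.

#include <stdio.h>
#include <rte_keepalive.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_cycles.h>

/* Hypothetical application callback, invoked by the dispatcher when a
 * monitored core reaches RTE_KA_STATE_DEAD. */
static void
dead_core_cb(void *data, const int id_core)
{
	(void)data;
	printf("lcore %d stopped responding\n", id_core);
}

/* Worker body: do real work, then heartbeat for this lcore. */
static int
worker_loop(void *arg)
{
	struct rte_keepalive *ka = arg;

	for (;;) {
		/* ... one unit of application work ... */
		rte_keepalive_mark_alive(ka);
	}
	return 0;
}

static void
setup_keepalive(void)
{
	struct rte_keepalive *ka = rte_keepalive_create(dead_core_cb, NULL);
	unsigned int lcore;

	if (ka == NULL)
		return;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		rte_keepalive_register_core(ka, lcore);
		rte_eal_remote_launch(worker_loop, ka, lcore);
	}

	/* Normally hooked to an rte_timer; a plain delay loop keeps the
	 * sketch self-contained. */
	for (;;) {
		rte_delay_ms(1000);
		rte_keepalive_dispatch_pings(NULL, ka);
	}
}

Note that rte_keepalive_dispatch_pings() takes an unused first parameter precisely so it can be registered directly as a timer callback.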
+#include +#include +#include +#include +#include +#include +#include + +#include +#include "malloc_elem.h" +#include "malloc_heap.h" + + +/* Free the memory space back to heap */ +void rte_free(void *addr) +{ + if (addr == NULL) return; + if (malloc_heap_free(malloc_elem_from_data(addr)) < 0) + RTE_LOG(ERR, EAL, "Error: Invalid memory\n"); +} + +/* + * Allocate memory on specified heap. + */ +void * +rte_malloc_socket(const char *type, size_t size, unsigned int align, + int socket_arg) +{ + /* return NULL if size is 0 or alignment is not power-of-2 */ + if (size == 0 || (align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + /* Check socket parameter */ + if (socket_arg >= RTE_MAX_NUMA_NODES) + return NULL; + + return malloc_heap_alloc(type, size, socket_arg, 0, + align == 0 ? 1 : align, 0, false); +} + +/* + * Allocate memory on default heap. + */ +void * +rte_malloc(const char *type, size_t size, unsigned align) +{ + return rte_malloc_socket(type, size, align, SOCKET_ID_ANY); +} + +/* + * Allocate zero'd memory on specified heap. + */ +void * +rte_zmalloc_socket(const char *type, size_t size, unsigned align, int socket) +{ + return rte_malloc_socket(type, size, align, socket); +} + +/* + * Allocate zero'd memory on default heap. + */ +void * +rte_zmalloc(const char *type, size_t size, unsigned align) +{ + return rte_zmalloc_socket(type, size, align, SOCKET_ID_ANY); +} + +/* + * Allocate zero'd memory on specified heap. + */ +void * +rte_calloc_socket(const char *type, size_t num, size_t size, unsigned align, int socket) +{ + return rte_zmalloc_socket(type, num * size, align, socket); +} + +/* + * Allocate zero'd memory on default heap. + */ +void * +rte_calloc(const char *type, size_t num, size_t size, unsigned align) +{ + return rte_zmalloc(type, num * size, align); +} + +/* + * Resize allocated memory. + */ +void * +rte_realloc(void *ptr, size_t size, unsigned align) +{ + if (ptr == NULL) + return rte_malloc(NULL, size, align); + + struct malloc_elem *elem = malloc_elem_from_data(ptr); + if (elem == NULL) { + RTE_LOG(ERR, EAL, "Error: memory corruption detected\n"); + return NULL; + } + + size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align); + /* check alignment matches first, and if ok, see if we can resize block */ + if (RTE_PTR_ALIGN(ptr,align) == ptr && + malloc_heap_resize(elem, size) == 0) + return ptr; + + /* either alignment is off, or we have no room to expand, + * so move data. */ + void *new_ptr = rte_malloc(NULL, size, align); + if (new_ptr == NULL) + return NULL; + const unsigned old_size = elem->size - MALLOC_ELEM_OVERHEAD; + rte_memcpy(new_ptr, ptr, old_size < size ? 
old_size : size); + rte_free(ptr); + + return new_ptr; +} + +int +rte_malloc_validate(const void *ptr, size_t *size) +{ + const struct malloc_elem *elem = malloc_elem_from_data(ptr); + if (!malloc_elem_cookies_ok(elem)) + return -1; + if (size != NULL) + *size = elem->size - elem->pad - MALLOC_ELEM_OVERHEAD; + return 0; +} + +/* + * Function to retrieve data for heap on given socket + */ +int +rte_malloc_get_socket_stats(int socket, + struct rte_malloc_socket_stats *socket_stats) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + if (socket >= RTE_MAX_NUMA_NODES || socket < 0) + return -1; + + return malloc_heap_get_stats(&mcfg->malloc_heaps[socket], socket_stats); +} + +/* + * Function to dump contents of all heaps + */ +void __rte_experimental +rte_malloc_dump_heaps(FILE *f) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + + for (idx = 0; idx < rte_socket_count(); idx++) { + unsigned int socket = rte_socket_id_by_idx(idx); + fprintf(f, "Heap on socket %i:\n", socket); + malloc_heap_dump(&mcfg->malloc_heaps[socket], f); + } + +} + +/* + * Print stats on memory type. If type is NULL, info on all types is printed + */ +void +rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) +{ + unsigned int socket; + struct rte_malloc_socket_stats sock_stats; + /* Iterate through all initialised heaps */ + for (socket=0; socket< RTE_MAX_NUMA_NODES; socket++) { + if ((rte_malloc_get_socket_stats(socket, &sock_stats) < 0)) + continue; + + fprintf(f, "Socket:%u\n", socket); + fprintf(f, "\tHeap_size:%zu,\n", sock_stats.heap_totalsz_bytes); + fprintf(f, "\tFree_size:%zu,\n", sock_stats.heap_freesz_bytes); + fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes); + fprintf(f, "\tGreatest_free_size:%zu,\n", + sock_stats.greatest_free_size); + fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count); + fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count); + } + return; +} + +/* + * TODO: Set limit to memory that can be allocated to memory type + */ +int +rte_malloc_set_limit(__rte_unused const char *type, + __rte_unused size_t max) +{ + return 0; +} + +/* + * Return the IO address of a virtual address obtained through rte_malloc + */ +rte_iova_t +rte_malloc_virt2iova(const void *addr) +{ + const struct rte_memseg *ms; + struct malloc_elem *elem = malloc_elem_from_data(addr); + + if (elem == NULL) + return RTE_BAD_IOVA; + + if (rte_eal_iova_mode() == RTE_IOVA_VA) + return (uintptr_t) addr; + + ms = rte_mem_virt2memseg(addr, elem->msl); + if (ms == NULL) + return RTE_BAD_IOVA; + + if (ms->iova == RTE_BAD_IOVA) + return RTE_BAD_IOVA; + + return ms->iova + RTE_PTR_DIFF(addr, ms->addr); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c b/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c new file mode 100644 index 00000000..d81b11db --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ +/*- + * BSD LICENSE + * + * Copyright(c) Hannes Frederic Sowa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
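rte_malloc.c above provides the public allocator entry points (rte_malloc, rte_zmalloc, rte_calloc and their _socket variants, rte_realloc, rte_free and rte_malloc_virt2iova), all of which funnel into the malloc_heap code earlier in this patch. A brief usage sketch follows, assuming rte_eal_init() has already completed; the sizes, the "example" tag and the error handling are illustrative.

#include <rte_malloc.h>
#include <rte_lcore.h>
#include <rte_memory.h>

static int
alloc_example(void)
{
	rte_iova_t iova;
	char *bigger;

	/* 1 KB, zeroed, default alignment, on the caller's NUMA socket */
	char *buf = rte_zmalloc_socket("example", 1024, 0, rte_socket_id());

	if (buf == NULL)
		return -1;

	/* grow it; rte_realloc may move the data if in-place resize fails */
	bigger = rte_realloc(buf, 4096, 0);
	if (bigger == NULL) {
		rte_free(buf);
		return -1;
	}
	buf = bigger;

	/* IO address of the buffer, e.g. for handing it to a device */
	iova = rte_malloc_virt2iova(buf);
	(void)iova;

	rte_free(buf);
	return 0;
}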
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include + +#include "rte_reciprocal.h" + +/* find largest set bit. + * portable and slow but does not matter for this usage. + */ +static inline int fls(uint32_t x) +{ + int b; + + for (b = 31; b >= 0; --b) { + if (x & (1u << b)) + return b + 1; + } + + return 0; +} + +struct rte_reciprocal rte_reciprocal_value(uint32_t d) +{ + struct rte_reciprocal R; + uint64_t m; + int l; + + l = fls(d - 1); + m = ((1ULL << 32) * ((1ULL << l) - d)); + m /= d; + + ++m; + R.m = m; + R.sh1 = RTE_MIN(l, 1); + R.sh2 = RTE_MAX(l - 1, 0); + + return R; +} + +/* + * Code taken from Hacker's Delight: + * http://www.hackersdelight.org/hdcodetxt/divlu.c.txt + * License permits inclusion here per: + * http://www.hackersdelight.org/permissions.htm + */ +static uint64_t +divide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) +{ + const uint64_t b = (1ULL << 32); /* Number base (16 bits). */ + uint64_t un1, un0, /* Norm. dividend LSD's. */ + vn1, vn0, /* Norm. divisor digits. */ + q1, q0, /* Quotient digits. */ + un64, un21, un10, /* Dividend digit pairs. */ + rhat; /* A remainder. */ + int s; /* Shift amount for norm. */ + + /* If overflow, set rem. to an impossible value. */ + if (u1 >= v) { + if (r != NULL) + *r = (uint64_t) -1; + return (uint64_t) -1; + } + + /* Count leading zeros. */ + s = __builtin_clzll(v); + if (s > 0) { + v = v << s; + un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31)); + un10 = u0 << s; + } else { + + un64 = u1 | u0; + un10 = u0; + } + + vn1 = v >> 32; + vn0 = v & 0xFFFFFFFF; + + un1 = un10 >> 32; + un0 = un10 & 0xFFFFFFFF; + + q1 = un64/vn1; + rhat = un64 - q1*vn1; +again1: + if (q1 >= b || q1*vn0 > b*rhat + un1) { + q1 = q1 - 1; + rhat = rhat + vn1; + if (rhat < b) + goto again1; + } + + un21 = un64*b + un1 - q1*v; + + q0 = un21/vn1; + rhat = un21 - q0*vn1; +again2: + if (q0 >= b || q0*vn0 > b*rhat + un0) { + q0 = q0 - 1; + rhat = rhat + vn1; + if (rhat < b) + goto again2; + } + + if (r != NULL) + *r = (un21*b + un0 - q0*v) >> s; + return q1*b + q0; +} + +struct rte_reciprocal_u64 +rte_reciprocal_value_u64(uint64_t d) +{ + struct rte_reciprocal_u64 R; + uint64_t m; + int l; + + l = 63 - __builtin_clzll(d); + + m = divide_128_div_64_to_64((1ULL << l), 0, d, NULL) << 1; + m = (1ULL << l) - d ? m + 2 : 1; + R.m = m; + + R.sh1 = l > 1 ? 
1 : l; + R.sh2 = (l > 0) ? l : 0; + R.sh2 -= R.sh2 && (m == 1) ? 1 : 0; + + return R; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_service.c b/src/spdk/dpdk/lib/librte_eal/common/rte_service.c new file mode 100644 index 00000000..8767c722 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_service.c @@ -0,0 +1,892 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include "include/rte_service_component.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define RTE_SERVICE_NUM_MAX 64 + +#define SERVICE_F_REGISTERED (1 << 0) +#define SERVICE_F_STATS_ENABLED (1 << 1) +#define SERVICE_F_START_CHECK (1 << 2) + +/* runstates for services and lcores, denoting if they are active or not */ +#define RUNSTATE_STOPPED 0 +#define RUNSTATE_RUNNING 1 + +/* internal representation of a service */ +struct rte_service_spec_impl { + /* public part of the struct */ + struct rte_service_spec spec; + + /* atomic lock that when set indicates a service core is currently + * running this service callback. When not set, a core may take the + * lock and then run the service callback. + */ + rte_atomic32_t execute_lock; + + /* API set/get-able variables */ + int8_t app_runstate; + int8_t comp_runstate; + uint8_t internal_flags; + + /* per service statistics */ + rte_atomic32_t num_mapped_cores; + uint64_t calls; + uint64_t cycles_spent; + uint8_t active_on_lcore[RTE_MAX_LCORE]; +} __rte_cache_aligned; + +/* the internal values of a service core */ +struct core_state { + /* map of services IDs are run on this core */ + uint64_t service_mask; + uint8_t runstate; /* running or stopped */ + uint8_t is_service_core; /* set if core is currently a service core */ + + uint64_t loops; + uint64_t calls_per_service[RTE_SERVICE_NUM_MAX]; +} __rte_cache_aligned; + +static uint32_t rte_service_count; +static struct rte_service_spec_impl *rte_services; +static struct core_state *lcore_states; +static uint32_t rte_service_library_initialized; + +int32_t rte_service_init(void) +{ + if (rte_service_library_initialized) { + printf("service library init() called, init flag %d\n", + rte_service_library_initialized); + return -EALREADY; + } + + rte_services = rte_calloc("rte_services", RTE_SERVICE_NUM_MAX, + sizeof(struct rte_service_spec_impl), + RTE_CACHE_LINE_SIZE); + if (!rte_services) { + printf("error allocating rte services array\n"); + goto fail_mem; + } + + lcore_states = rte_calloc("rte_service_core_states", RTE_MAX_LCORE, + sizeof(struct core_state), RTE_CACHE_LINE_SIZE); + if (!lcore_states) { + printf("error allocating core states array\n"); + goto fail_mem; + } + + int i; + int count = 0; + struct rte_config *cfg = rte_eal_get_configuration(); + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_config[i].core_role == ROLE_SERVICE) { + if ((unsigned int)i == cfg->master_lcore) + continue; + rte_service_lcore_add(i); + count++; + } + } + + rte_service_library_initialized = 1; + return 0; +fail_mem: + if (rte_services) + rte_free(rte_services); + if (lcore_states) + rte_free(lcore_states); + return -ENOMEM; +} + +void +rte_service_finalize(void) +{ + if (!rte_service_library_initialized) + return; + + if (rte_services) + rte_free(rte_services); + + if (lcore_states) + rte_free(lcore_states); + + rte_service_library_initialized = 0; +} + +/* returns 1 if service is registered and has not been unregistered + * Returns 0 if service never registered, or 
has been unregistered + */ +static inline int +service_valid(uint32_t id) +{ + return !!(rte_services[id].internal_flags & SERVICE_F_REGISTERED); +} + +/* validate ID and retrieve service pointer, or return error value */ +#define SERVICE_VALID_GET_OR_ERR_RET(id, service, retval) do { \ + if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) \ + return retval; \ + service = &rte_services[id]; \ +} while (0) + +/* returns 1 if statistics should be collected for service + * Returns 0 if statistics should not be collected for service + */ +static inline int +service_stats_enabled(struct rte_service_spec_impl *impl) +{ + return !!(impl->internal_flags & SERVICE_F_STATS_ENABLED); +} + +static inline int +service_mt_safe(struct rte_service_spec_impl *s) +{ + return !!(s->spec.capabilities & RTE_SERVICE_CAP_MT_SAFE); +} + +int32_t +rte_service_set_stats_enable(uint32_t id, int32_t enabled) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + + if (enabled) + s->internal_flags |= SERVICE_F_STATS_ENABLED; + else + s->internal_flags &= ~(SERVICE_F_STATS_ENABLED); + + return 0; +} + +int32_t +rte_service_set_runstate_mapped_check(uint32_t id, int32_t enabled) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + + if (enabled) + s->internal_flags |= SERVICE_F_START_CHECK; + else + s->internal_flags &= ~(SERVICE_F_START_CHECK); + + return 0; +} + +uint32_t +rte_service_get_count(void) +{ + return rte_service_count; +} + +int32_t +rte_service_get_by_name(const char *name, uint32_t *service_id) +{ + if (!service_id) + return -EINVAL; + + int i; + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (service_valid(i) && + strcmp(name, rte_services[i].spec.name) == 0) { + *service_id = i; + return 0; + } + } + + return -ENODEV; +} + +const char * +rte_service_get_name(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + return s->spec.name; +} + +int32_t +rte_service_probe_capability(uint32_t id, uint32_t capability) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + return !!(s->spec.capabilities & capability); +} + +int32_t +rte_service_component_register(const struct rte_service_spec *spec, + uint32_t *id_ptr) +{ + uint32_t i; + int32_t free_slot = -1; + + if (spec->callback == NULL || strlen(spec->name) == 0) + return -EINVAL; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) { + free_slot = i; + break; + } + } + + if ((free_slot < 0) || (i == RTE_SERVICE_NUM_MAX)) + return -ENOSPC; + + struct rte_service_spec_impl *s = &rte_services[free_slot]; + s->spec = *spec; + s->internal_flags |= SERVICE_F_REGISTERED | SERVICE_F_START_CHECK; + + rte_smp_wmb(); + rte_service_count++; + + if (id_ptr) + *id_ptr = free_slot; + + return 0; +} + +int32_t +rte_service_component_unregister(uint32_t id) +{ + uint32_t i; + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + rte_service_count--; + rte_smp_wmb(); + + s->internal_flags &= ~(SERVICE_F_REGISTERED); + + /* clear the run-bit in all cores */ + for (i = 0; i < RTE_MAX_LCORE; i++) + lcore_states[i].service_mask &= ~(UINT64_C(1) << id); + + memset(&rte_services[id], 0, sizeof(struct rte_service_spec_impl)); + + return 0; +} + +int32_t +rte_service_component_runstate_set(uint32_t id, uint32_t runstate) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + if (runstate) + s->comp_runstate = RUNSTATE_RUNNING; + else + s->comp_runstate = 
RUNSTATE_STOPPED; + + rte_smp_wmb(); + return 0; +} + +int32_t +rte_service_runstate_set(uint32_t id, uint32_t runstate) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + if (runstate) + s->app_runstate = RUNSTATE_RUNNING; + else + s->app_runstate = RUNSTATE_STOPPED; + + rte_smp_wmb(); + return 0; +} + +int32_t +rte_service_runstate_get(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + rte_smp_rmb(); + + int check_disabled = !(s->internal_flags & SERVICE_F_START_CHECK); + int lcore_mapped = (rte_atomic32_read(&s->num_mapped_cores) > 0); + + return (s->app_runstate == RUNSTATE_RUNNING) && + (s->comp_runstate == RUNSTATE_RUNNING) && + (check_disabled | lcore_mapped); +} + +static inline void +rte_service_runner_do_callback(struct rte_service_spec_impl *s, + struct core_state *cs, uint32_t service_idx) +{ + void *userdata = s->spec.callback_userdata; + + if (service_stats_enabled(s)) { + uint64_t start = rte_rdtsc(); + s->spec.callback(userdata); + uint64_t end = rte_rdtsc(); + s->cycles_spent += end - start; + cs->calls_per_service[service_idx]++; + s->calls++; + } else + s->spec.callback(userdata); +} + + +static inline int32_t +service_run(uint32_t i, int lcore, struct core_state *cs, uint64_t service_mask) +{ + if (!service_valid(i)) + return -EINVAL; + struct rte_service_spec_impl *s = &rte_services[i]; + if (s->comp_runstate != RUNSTATE_RUNNING || + s->app_runstate != RUNSTATE_RUNNING || + !(service_mask & (UINT64_C(1) << i))) { + s->active_on_lcore[lcore] = 0; + return -ENOEXEC; + } + + s->active_on_lcore[lcore] = 1; + + /* check do we need cmpset, if MT safe or <= 1 core + * mapped, atomic ops are not required. + */ + const int use_atomics = (service_mt_safe(s) == 0) && + (rte_atomic32_read(&s->num_mapped_cores) > 1); + if (use_atomics) { + if (!rte_atomic32_cmpset((uint32_t *)&s->execute_lock, 0, 1)) + return -EBUSY; + + rte_service_runner_do_callback(s, cs, i); + rte_atomic32_clear(&s->execute_lock); + } else + rte_service_runner_do_callback(s, cs, i); + + return 0; +} + +int32_t __rte_experimental +rte_service_may_be_active(uint32_t id) +{ + uint32_t ids[RTE_MAX_LCORE] = {0}; + struct rte_service_spec_impl *s = &rte_services[id]; + int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE); + int i; + + if (!service_valid(id)) + return -EINVAL; + + for (i = 0; i < lcore_count; i++) { + if (s->active_on_lcore[ids[i]]) + return 1; + } + + return 0; +} + +int32_t rte_service_run_iter_on_app_lcore(uint32_t id, + uint32_t serialize_mt_unsafe) +{ + /* run service on calling core, using all-ones as the service mask */ + if (!service_valid(id)) + return -EINVAL; + + struct core_state *cs = &lcore_states[rte_lcore_id()]; + struct rte_service_spec_impl *s = &rte_services[id]; + + /* Atomically add this core to the mapped cores first, then examine if + * we can run the service. This avoids a race condition between + * checking the value, and atomically adding to the mapped count. 
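/* A minimal usage sketch (not part of the upstream file) for the
 * rte_service_run_iter_on_app_lcore() call implemented nearby, assuming a
 * service named "my_service" was registered earlier; the name is only an
 * example.  Passing serialize_mt_unsafe = 1 asks the library to take the
 * mapped-core count so an MT-unsafe service never runs concurrently with a
 * service core.
 */
#include <errno.h>
#include <rte_service.h>

static int run_my_service_once(void)
{
	uint32_t id;

	if (rte_service_get_by_name("my_service", &id) != 0)
		return -ENODEV;
	/* -EBUSY here means another lcore is executing the service right now */
	return rte_service_run_iter_on_app_lcore(id, 1);
}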
+ */ + if (serialize_mt_unsafe) + rte_atomic32_inc(&s->num_mapped_cores); + + if (service_mt_safe(s) == 0 && + rte_atomic32_read(&s->num_mapped_cores) > 1) { + if (serialize_mt_unsafe) + rte_atomic32_dec(&s->num_mapped_cores); + return -EBUSY; + } + + int ret = service_run(id, rte_lcore_id(), cs, UINT64_MAX); + + if (serialize_mt_unsafe) + rte_atomic32_dec(&s->num_mapped_cores); + + return ret; +} + +static int32_t +rte_service_runner_func(void *arg) +{ + RTE_SET_USED(arg); + uint32_t i; + const int lcore = rte_lcore_id(); + struct core_state *cs = &lcore_states[lcore]; + + while (lcore_states[lcore].runstate == RUNSTATE_RUNNING) { + const uint64_t service_mask = cs->service_mask; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + /* return value ignored as no change to code flow */ + service_run(i, lcore, cs, service_mask); + } + + cs->loops++; + + rte_smp_rmb(); + } + + lcore_config[lcore].state = WAIT; + + return 0; +} + +int32_t +rte_service_lcore_count(void) +{ + int32_t count = 0; + uint32_t i; + for (i = 0; i < RTE_MAX_LCORE; i++) + count += lcore_states[i].is_service_core; + return count; +} + +int32_t +rte_service_lcore_list(uint32_t array[], uint32_t n) +{ + uint32_t count = rte_service_lcore_count(); + if (count > n) + return -ENOMEM; + + if (!array) + return -EINVAL; + + uint32_t i; + uint32_t idx = 0; + for (i = 0; i < RTE_MAX_LCORE; i++) { + struct core_state *cs = &lcore_states[i]; + if (cs->is_service_core) { + array[idx] = i; + idx++; + } + } + + return count; +} + +int32_t +rte_service_lcore_count_services(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + return __builtin_popcountll(cs->service_mask); +} + +int32_t +rte_service_start_with_defaults(void) +{ + /* create a default mapping from cores to services, then start the + * services to make them transparent to unaware applications. + */ + uint32_t i; + int ret; + uint32_t count = rte_service_get_count(); + + int32_t lcore_iter = 0; + uint32_t ids[RTE_MAX_LCORE] = {0}; + int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE); + + if (lcore_count == 0) + return -ENOTSUP; + + for (i = 0; (int)i < lcore_count; i++) + rte_service_lcore_start(ids[i]); + + for (i = 0; i < count; i++) { + /* do 1:1 core mapping here, with each service getting + * assigned a single core by default. 
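/* A sketch of the explicit equivalent of the default mapping built by
 * rte_service_start_with_defaults(), under the assumption that at least one
 * service core was handed to EAL via its service-core options and that
 * lcore_id identifies one of those cores.  The callback and service name
 * below are placeholders, not part of this patch.
 */
#include <rte_service.h>
#include <rte_service_component.h>

static int32_t example_service_cb(void *userdata)
{
	(void)userdata;   /* poll a ring, a NIC queue, a timer list, ... */
	return 0;
}

static int register_and_map_example(uint32_t lcore_id)
{
	struct rte_service_spec spec = {
		.name = "example_service",
		.callback = example_service_cb,
		.capabilities = 0,              /* not RTE_SERVICE_CAP_MT_SAFE */
	};
	uint32_t id;
	int ret = rte_service_component_register(&spec, &id);

	if (ret)
		return ret;
	rte_service_component_runstate_set(id, 1);  /* component is ready */
	rte_service_runstate_set(id, 1);            /* application wants it run */
	rte_service_map_lcore_set(id, lcore_id, 1); /* run on this service core */
	ret = rte_service_lcore_start(lcore_id);
	return (ret == -EALREADY) ? 0 : ret;        /* core may already be running */
}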
Adding multiple services + * should multiplex to a single core, or 1:1 if there are the + * same amount of services as service-cores + */ + ret = rte_service_map_lcore_set(i, ids[lcore_iter], 1); + if (ret) + return -ENODEV; + + lcore_iter++; + if (lcore_iter >= lcore_count) + lcore_iter = 0; + + ret = rte_service_runstate_set(i, 1); + if (ret) + return -ENOEXEC; + } + + return 0; +} + +static int32_t +service_update(struct rte_service_spec *service, uint32_t lcore, + uint32_t *set, uint32_t *enabled) +{ + uint32_t i; + int32_t sid = -1; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if ((struct rte_service_spec *)&rte_services[i] == service && + service_valid(i)) { + sid = i; + break; + } + } + + if (sid == -1 || lcore >= RTE_MAX_LCORE) + return -EINVAL; + + if (!lcore_states[lcore].is_service_core) + return -EINVAL; + + uint64_t sid_mask = UINT64_C(1) << sid; + if (set) { + uint64_t lcore_mapped = lcore_states[lcore].service_mask & + sid_mask; + + if (*set && !lcore_mapped) { + lcore_states[lcore].service_mask |= sid_mask; + rte_atomic32_inc(&rte_services[sid].num_mapped_cores); + } + if (!*set && lcore_mapped) { + lcore_states[lcore].service_mask &= ~(sid_mask); + rte_atomic32_dec(&rte_services[sid].num_mapped_cores); + } + } + + if (enabled) + *enabled = !!(lcore_states[lcore].service_mask & (sid_mask)); + + rte_smp_wmb(); + + return 0; +} + +int32_t +rte_service_map_lcore_set(uint32_t id, uint32_t lcore, uint32_t enabled) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + uint32_t on = enabled > 0; + return service_update(&s->spec, lcore, &on, 0); +} + +int32_t +rte_service_map_lcore_get(uint32_t id, uint32_t lcore) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + uint32_t enabled; + int ret = service_update(&s->spec, lcore, 0, &enabled); + if (ret == 0) + return enabled; + return ret; +} + +static void +set_lcore_state(uint32_t lcore, int32_t state) +{ + /* mark core state in hugepage backed config */ + struct rte_config *cfg = rte_eal_get_configuration(); + cfg->lcore_role[lcore] = state; + + /* mark state in process local lcore_config */ + lcore_config[lcore].core_role = state; + + /* update per-lcore optimized state tracking */ + lcore_states[lcore].is_service_core = (state == ROLE_SERVICE); +} + +int32_t +rte_service_lcore_reset_all(void) +{ + /* loop over cores, reset all to mask 0 */ + uint32_t i; + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_states[i].is_service_core) { + lcore_states[i].service_mask = 0; + set_lcore_state(i, ROLE_RTE); + lcore_states[i].runstate = RUNSTATE_STOPPED; + } + } + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) + rte_atomic32_set(&rte_services[i].num_mapped_cores, 0); + + rte_smp_wmb(); + + return 0; +} + +int32_t +rte_service_lcore_add(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + if (lcore_states[lcore].is_service_core) + return -EALREADY; + + set_lcore_state(lcore, ROLE_SERVICE); + + /* ensure that after adding a core the mask and state are defaults */ + lcore_states[lcore].service_mask = 0; + lcore_states[lcore].runstate = RUNSTATE_STOPPED; + + rte_smp_wmb(); + + return rte_eal_wait_lcore(lcore); +} + +int32_t +rte_service_lcore_del(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -EINVAL; + + if (cs->runstate != RUNSTATE_STOPPED) + return -EBUSY; + + set_lcore_state(lcore, ROLE_RTE); + + rte_smp_wmb(); + return 0; +} + +int32_t 
+rte_service_lcore_start(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -EINVAL; + + if (cs->runstate == RUNSTATE_RUNNING) + return -EALREADY; + + /* set core to run state first, and then launch otherwise it will + * return immediately as runstate keeps it in the service poll loop + */ + lcore_states[lcore].runstate = RUNSTATE_RUNNING; + + int ret = rte_eal_remote_launch(rte_service_runner_func, 0, lcore); + /* returns -EBUSY if the core is already launched, 0 on success */ + return ret; +} + +int32_t +rte_service_lcore_stop(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + if (lcore_states[lcore].runstate == RUNSTATE_STOPPED) + return -EALREADY; + + uint32_t i; + uint64_t service_mask = lcore_states[lcore].service_mask; + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + int32_t enabled = service_mask & (UINT64_C(1) << i); + int32_t service_running = rte_service_runstate_get(i); + int32_t only_core = (1 == + rte_atomic32_read(&rte_services[i].num_mapped_cores)); + + /* if the core is mapped, and the service is running, and this + * is the only core that is mapped, the service would cease to + * run if this core stopped, so fail instead. + */ + if (enabled && service_running && only_core) + return -EBUSY; + } + + lcore_states[lcore].runstate = RUNSTATE_STOPPED; + + return 0; +} + +int32_t +rte_service_attr_get(uint32_t id, uint32_t attr_id, uint32_t *attr_value) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + if (!attr_value) + return -EINVAL; + + switch (attr_id) { + case RTE_SERVICE_ATTR_CYCLES: + *attr_value = s->cycles_spent; + return 0; + case RTE_SERVICE_ATTR_CALL_COUNT: + *attr_value = s->calls; + return 0; + default: + return -EINVAL; + } +} + +int32_t __rte_experimental +rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id, + uint64_t *attr_value) +{ + struct core_state *cs; + + if (lcore >= RTE_MAX_LCORE || !attr_value) + return -EINVAL; + + cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + switch (attr_id) { + case RTE_SERVICE_LCORE_ATTR_LOOPS: + *attr_value = cs->loops; + return 0; + default: + return -EINVAL; + } +} + +static void +rte_service_dump_one(FILE *f, struct rte_service_spec_impl *s, + uint64_t all_cycles, uint32_t reset) +{ + /* avoid divide by zero */ + if (all_cycles == 0) + all_cycles = 1; + + int calls = 1; + if (s->calls != 0) + calls = s->calls; + + if (reset) { + s->cycles_spent = 0; + s->calls = 0; + return; + } + + fprintf(f, " %s: stats %d\tcalls %"PRIu64"\tcycles %" + PRIu64"\tavg: %"PRIu64"\n", + s->spec.name, service_stats_enabled(s), s->calls, + s->cycles_spent, s->cycles_spent / calls); +} + +int32_t +rte_service_attr_reset_all(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + int reset = 1; + rte_service_dump_one(NULL, s, 0, reset); + return 0; +} + +int32_t __rte_experimental +rte_service_lcore_attr_reset_all(uint32_t lcore) +{ + struct core_state *cs; + + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + cs->loops = 0; + + return 0; +} + +static void +service_dump_calls_per_lcore(FILE *f, uint32_t lcore, uint32_t reset) +{ + uint32_t i; + struct core_state *cs = &lcore_states[lcore]; + + fprintf(f, "%02d\t", lcore); + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + fprintf(f, 
"%"PRIu64"\t", cs->calls_per_service[i]); + if (reset) + cs->calls_per_service[i] = 0; + } + fprintf(f, "\n"); +} + +int32_t +rte_service_dump(FILE *f, uint32_t id) +{ + uint32_t i; + int print_one = (id != UINT32_MAX); + + uint64_t total_cycles = 0; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + total_cycles += rte_services[i].cycles_spent; + } + + /* print only the specified service */ + if (print_one) { + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + fprintf(f, "Service %s Summary\n", s->spec.name); + uint32_t reset = 0; + rte_service_dump_one(f, s, total_cycles, reset); + return 0; + } + + /* print all services, as UINT32_MAX was passed as id */ + fprintf(f, "Services Summary\n"); + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + uint32_t reset = 0; + rte_service_dump_one(f, &rte_services[i], total_cycles, reset); + } + + fprintf(f, "Service Cores Summary\n"); + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_config[i].core_role != ROLE_SERVICE) + continue; + + uint32_t reset = 0; + service_dump_calls_per_lcore(f, i, reset); + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/Makefile b/src/spdk/dpdk/lib/librte_eal/linuxapp/Makefile new file mode 100644 index 00000000..a0fffa98 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal +DEPDIRS-kni := eal + +CFLAGS += -DALLOW_EXPERIMENTAL_API + +include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/Makefile b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/Makefile new file mode 100644 index 00000000..fd92c75c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/Makefile @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2016 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) + +EXPORT_MAP := ../../rte_eal_version.map +VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) + +LIBABIVER := 8 + +VPATH += $(RTE_SDK)/lib/librte_eal/common + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -ldl +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrt +LDLIBS += -lrte_kvargs +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y) +LDLIBS += -lnuma +endif + +# specific to linuxapp exec-env +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += 
eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hypervisor.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_class.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_bus.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_hypervisor.c +SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c +SRCS-y += rte_cycles.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +CFLAGS_eal.o := -D_GNU_SOURCE +CFLAGS_eal_interrupts.o := -D_GNU_SOURCE +CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE +CFLAGS_eal_timer.o := -D_GNU_SOURCE +CFLAGS_eal_lcore.o := -D_GNU_SOURCE +CFLAGS_eal_memalloc.o := -D_GNU_SOURCE +CFLAGS_eal_thread.o := -D_GNU_SOURCE +CFLAGS_eal_log.o := -D_GNU_SOURCE +CFLAGS_eal_common_log.o := -D_GNU_SOURCE +CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE +CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE +CFLAGS_eal_common_options.o := -D_GNU_SOURCE +CFLAGS_eal_common_thread.o := -D_GNU_SOURCE +CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE +CFLAGS_rte_cycles.o := -D_GNU_SOURCE + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +endif + +INC := rte_kni_common.h + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \ + $(addprefix include/exec-env/,$(INC)) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal.c new file mode 100644 index 00000000..43563551 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal.c @@ -0,0 +1,1172 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation. + * Copyright(c) 2012-2014 6WIND S.A. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(RTE_ARCH_X86) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" +#include "eal_vfio.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10) + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; + +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memsegs), + .l_len = sizeof(early_mem_config.memsegs), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +static const char *default_runtime_dir = "/var/run"; + +int +eal_create_runtime_dir(void) +{ + const char *directory = default_runtime_dir; + const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); + const char *fallback = "/tmp"; + char tmp[PATH_MAX]; + int ret; + + if (getuid() != 0) { + /* try XDG path first, fall back to /tmp */ + if (xdg_runtime_dir != NULL) + directory = xdg_runtime_dir; + else + directory = fallback; + } + /* create DPDK subdirectory under runtime dir */ + ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); + if (ret < 0 || ret == sizeof(tmp)) { + RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); + return -1; + } + + /* create prefix-specific subdirectory under DPDK runtime dir */ + ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", + tmp, internal_config.hugefile_prefix); + if (ret < 0 || ret == sizeof(runtime_dir)) { + RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); + return -1; + } + + /* create the path if it doesn't exist. no "mkdir -p" here, so do it + * step by step. 
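/* A small sketch, assuming EAL initialization has completed: query the IOVA
 * mode that is auto-detected later in this file via rte_bus_get_iommu_class()
 * and report whether physical addresses are in use.  The helper name is
 * illustrative only.
 */
#include <rte_bus.h>   /* enum rte_iova_mode */
#include <rte_eal.h>

static int example_uses_physical_iova(void)
{
	return rte_eal_iova_mode() == RTE_IOVA_PA;
}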
+ */ + ret = mkdir(tmp, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + tmp, strerror(errno)); + return -1; + } + + ret = mkdir(runtime_dir, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + runtime_dir, strerror(errno)); + return -1; + } + + return 0; +} + +const char * +eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return user provided mbuf pool ops name */ +const char * +rte_eal_mbuf_user_pool_ops(void) +{ + return internal_config.user_mbuf_pool_ops_name; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +enum rte_iova_mode +rte_eal_iova_mode(void) +{ + return rte_eal_get_configuration()->iova_mode; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. memzones, to detect if there are running secondary + * processes. */ +static void +rte_eal_config_create(void) +{ + void *rte_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + /* map the config before hugepage address so that we don't waste a page */ + if (internal_config.base_virtaddr != 0) + rte_mem_cfg_addr = (void *) + RTE_ALIGN_FLOOR(internal_config.base_virtaddr - + sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE)); + else + rte_mem_cfg_addr = NULL; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config)); + if (retval < 0){ + close(mem_cfg_fd); + rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname); + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. 
Is another primary " + "process running?\n", pathname); + } + + rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config), + PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0); + + if (rte_mem_cfg_addr == MAP_FAILED){ + rte_panic("Cannot mmap memory for rte_config\n"); + } + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = rte_mem_cfg_addr; + + /* store address of the config in the config itself so that secondary + * processes could later map the config into this exact location */ + rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + +} + +/* attach to an existing shared memory config */ +static void +rte_eal_config_attach(void) +{ + struct rte_mem_config *mem_config; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + /* map it as read-only first */ + mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config), + PROT_READ, MAP_SHARED, mem_cfg_fd, 0); + if (mem_config == MAP_FAILED) + rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + + rte_config.mem_config = mem_config; +} + +/* reattach the shared config at exact memory location primary process has it */ +static void +rte_eal_config_reattach(void) +{ + struct rte_mem_config *mem_config; + void *rte_mem_cfg_addr; + + if (internal_config.no_shconf) + return; + + /* save the address primary process has mapped shared config to */ + rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr; + + /* unmap original config */ + munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); + + /* remap the config at proper address */ + mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, + sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, + mem_cfg_fd, 0); + if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { + if (mem_config != MAP_FAILED) + /* errno is stale, don't use */ + rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]" + " - please use '--base-virtaddr' option\n", + rte_mem_cfg_addr, mem_config); + else + rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + } + close(mem_cfg_fd); + + rte_config.mem_config = mem_config; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if there no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? 
"PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static void +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + rte_eal_config_create(); + break; + case RTE_PROC_SECONDARY: + rte_eal_config_attach(); + rte_eal_mcfg_wait_complete(rte_config.mem_config); + rte_eal_config_reattach(); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + rte_panic("Invalid process type\n"); + } +} + +/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */ +static void +eal_hugedirs_unlock(void) +{ + int i; + + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) + { + /* skip uninitialized */ + if (internal_config.hugepage_info[i].lock_descriptor < 0) + continue; + /* unlock hugepage file */ + flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN); + close(internal_config.hugepage_info[i].lock_descriptor); + /* reset the field */ + internal_config.hugepage_info[i].lock_descriptor = -1; + } +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + printf("EAL Linux options:\n" + " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" + " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n" + " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" + " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" + " --"OPT_BASE_VIRTADDR" Base virtual address\n" + " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" + " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" + " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n" + " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n" + "\n"); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. 
*/ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static int +eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg) +{ + char * arg[RTE_MAX_NUMA_NODES]; + char *end; + int arg_num, i, len; + uint64_t total_mem = 0; + + len = strnlen(strval, SOCKET_MEM_STRLEN); + if (len == SOCKET_MEM_STRLEN) { + RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); + return -1; + } + + /* all other error cases will be caught later */ + if (!isdigit(strval[len-1])) + return -1; + + /* split the optarg into separate socket values */ + arg_num = rte_strsplit(strval, len, + arg, RTE_MAX_NUMA_NODES, ','); + + /* if split failed, or 0 arguments */ + if (arg_num <= 0) + return -1; + + /* parse each defined socket option */ + errno = 0; + for (i = 0; i < arg_num; i++) { + uint64_t val; + end = NULL; + val = strtoull(arg[i], &end, 10); + + /* check for invalid input */ + if ((errno != 0) || + (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + val <<= 20; + total_mem += val; + socket_arg[i] = val; + } + + /* check if we have a positive amount of total memory */ + if (total_mem == 0) + return -1; + + return 0; +} + +static int +eal_parse_base_virtaddr(const char *arg) +{ + char *end; + uint64_t addr; + + errno = 0; + addr = strtoull(arg, &end, 16); + + /* check for errors */ + if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0')) + return -1; + + /* make sure we don't exceed 32-bit boundary on 32-bit target */ +#ifndef RTE_ARCH_64 + if (addr >= UINTPTR_MAX) + return -1; +#endif + + /* align the addr on 16M boundary, 16MB is the minimum huge page + * size on IBM Power architecture. If the addr is aligned to 16MB, + * it can align to 2MB for x86. So this alignment can also be used + * on x86 */ + internal_config.base_virtaddr = + RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M); + + return 0; +} + +static int +eal_parse_vfio_intr(const char *mode) +{ + unsigned i; + static struct { + const char *name; + enum rte_intr_mode value; + } map[] = { + { "legacy", RTE_INTR_MODE_LEGACY }, + { "msi", RTE_INTR_MODE_MSI }, + { "msix", RTE_INTR_MODE_MSIX }, + }; + + for (i = 0; i < RTE_DIM(map); i++) { + if (!strcmp(mode, map[i].name)) { + internal_config.vfio_intr_mode = map[i].value; + return 0; + } + } + return -1; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? 
+ eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* getopt is not happy, stop right now */ + if (opt == '?') { + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + + case OPT_HUGE_DIR_NUM: + internal_config.hugepage_dir = strdup(optarg); + break; + + case OPT_FILE_PREFIX_NUM: + internal_config.hugefile_prefix = strdup(optarg); + break; + + case OPT_SOCKET_MEM_NUM: + if (eal_parse_socket_arg(optarg, + internal_config.socket_mem) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_MEM "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_sockets = 1; + break; + + case OPT_SOCKET_LIMIT_NUM: + if (eal_parse_socket_arg(optarg, + internal_config.socket_limit) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_LIMIT "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_socket_limits = 1; + break; + + case OPT_BASE_VIRTADDR_NUM: + if (eal_parse_base_virtaddr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_BASE_VIRTADDR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_VFIO_INTR_NUM: + if (eal_parse_vfio_intr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_VFIO_INTR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_CREATE_UIO_DEV_NUM: + internal_config.create_uio_dev = 1; + break; + + case OPT_MBUF_POOL_OPS_NAME_NUM: + internal_config.user_mbuf_pool_ops_name = + strdup(optarg); + break; + + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on Linux\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on Linux\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on Linux\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; + + return ret; +} + +static int +check_socket(const struct rte_memseg_list *msl, void 
*arg) +{ + int *socket_id = arg; + + return *socket_id == msl->socket_id; +} + +static void +eal_check_mem_on_local_socket(void) +{ + int socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + if (rte_memseg_list_walk(check_socket, &socket_id) == 0) + RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); +} + +static int +sync_func(__attribute__((unused)) void *arg) +{ + return 0; +} + +inline static void +rte_eal_mcfg_complete(void) +{ + /* ALL shared mem_config related INIT DONE */ + if (rte_config.process_type == RTE_PROC_PRIMARY) + rte_config.mem_config->magic = RTE_MAGIC; + + internal_config.init_complete = 1; +} + +/* + * Request iopl privilege for all RPL, returns 0 on success + * iopl() call is mostly for the i386 architecture. For other architectures, + * return -1 to indicate IO privilege can't be changed in this way. + */ +int +rte_eal_iopl_init(void) +{ +#if defined(RTE_ARCH_X86) + if (iopl(3) != 0) + return -1; +#endif + return 0; +} + +#ifdef VFIO_PRESENT +static int rte_eal_vfio_setup(void) +{ + if (rte_vfio_enable("vfio")) + return -1; + + return 0; +} +#endif + +static void rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + +/* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + char *logid_storage; + const char *logid; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* checks if the machine is adequate */ + if (!rte_cpu_is_supported()) { + rte_eal_init_alert("unsupported cpu type."); + rte_errno = ENOTSUP; + return -1; + } + + if (!rte_atomic32_test_and_set(&run_once)) { + rte_eal_init_alert("already called initialization."); + rte_errno = EALREADY; + return -1; + } + + logid_storage = strrchr(argv[0], '/'); + logid_storage = strdup(logid_storage ? logid_storage + 1 : argv[0]); + logid = logid_storage; + + thread_id = pthread_self(); + + eal_reset_internal_config(&internal_config); + + /* set log level as early as possible */ + eal_log_level_parse(argc, argv); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + fctret = -1; + goto finished; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) { + rte_eal_init_alert("Invalid 'command line' arguments."); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + fctret = -1; + goto finished; + } + + if (eal_plugins_init() < 0) { + rte_eal_init_alert("Cannot init plugins\n"); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + fctret = -1; + goto finished; + } + + if (eal_option_device_parse()) { + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + fctret = -1; + goto finished; + } + + rte_config_init(); + + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + fctret = -1; + goto finished; + } + + /* Put mp channel init before bus scan so that we can init the vdev + * bus through mp channel in the secondary process before the bus scan. 
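/* A minimal sketch of the application-side call sequence that rte_eal_init()
 * implements: initialize EAL, consume the arguments it parsed, then release
 * resources at exit.  Error handling is reduced to rte_exit(); the comment
 * placeholder marks where application setup would go.
 */
#include <stdlib.h>
#include <rte_eal.h>
#include <rte_debug.h>

int main(int argc, char **argv)
{
	int ret = rte_eal_init(argc, argv);

	if (ret < 0)
		rte_exit(EXIT_FAILURE, "EAL initialization failed\n");
	argc -= ret;
	argv += ret;

	/* ... application setup, rte_eal_remote_launch() on worker lcores ... */

	rte_eal_cleanup();   /* experimental at this DPDK version */
	return 0;
}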
+ */ + if (rte_mp_channel_init() < 0) { + rte_eal_init_alert("failed to init mp channel\n"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_errno = EFAULT; + fctret = -1; + goto finished; + } + } + + if (rte_bus_scan()) { + rte_eal_init_alert("Cannot scan the buses for devices\n"); + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + fctret = -1; + goto finished; + } + + /* autodetect the iova mapping mode (default is iova_pa) */ + rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class(); + + /* Workaround for KNI which requires physical address to work */ + if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && + rte_eal_check_module("rte_kni") == 1) { + rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; + RTE_LOG(WARNING, EAL, + "Some devices want IOVA as VA but PA will be used because.. " + "KNI module inserted\n"); + } + + if (internal_config.no_hugetlbfs == 0) { + /* rte_config isn't initialized yet */ + ret = internal_config.process_type == RTE_PROC_PRIMARY ? + eal_hugepage_info_init() : + eal_hugepage_info_read(); + if (ret < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + fctret = -1; + goto finished; + } + } + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + rte_srand(rte_rdtsc()); + + if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { + rte_eal_init_alert("Cannot init logging."); + rte_errno = ENOMEM; + rte_atomic32_clear(&run_once); + fctret = -1; + goto finished; + } + +#ifdef VFIO_PRESENT + if (rte_eal_vfio_setup() < 0) { + rte_eal_init_alert("Cannot init VFIO\n"); + rte_errno = EAGAIN; + rte_atomic32_clear(&run_once); + fctret = -1; + goto finished; + } +#endif + /* in secondary processes, memory init may allocate additional fbarrays + * not present in primary processes, so to avoid any potential issues, + * initialize memzones first. + */ + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone\n"); + rte_errno = ENODEV; + fctret = -1; + goto finished; + } + + if (rte_eal_memory_init() < 0) { + rte_eal_init_alert("Cannot init memory\n"); + rte_errno = ENOMEM; + fctret = -1; + goto finished; + } + + /* the directories are locked during eal_hugepage_info_init */ + eal_hugedirs_unlock(); + + if (rte_eal_malloc_heap_init() < 0) { + rte_eal_init_alert("Cannot init malloc heap\n"); + rte_errno = ENODEV; + fctret = -1; + goto finished; + } + + if (rte_eal_tailqs_init() < 0) { + rte_eal_init_alert("Cannot init tail queues for objects\n"); + rte_errno = EFAULT; + fctret = -1; + goto finished; + } + + if (rte_eal_alarm_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + /* rte_eal_alarm_init sets rte_errno on failure. 
*/ + fctret = -1; + goto finished; + } + + if (rte_eal_timer_init() < 0) { + rte_eal_init_alert("Cannot init HPET or TSC timers\n"); + rte_errno = ENOTSUP; + fctret = -1; + goto finished; + } + + eal_check_mem_on_local_socket(); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n", + rte_config.master_lcore, (int)thread_id, cpuset, + ret == 0 ? "" : "..."); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, sizeof(thread_name), + "lcore-slave-%d", i); + ret = rte_thread_setname(lcore_config[i].thread_id, + thread_name); + if (ret != 0) + RTE_LOG(DEBUG, EAL, + "Cannot set name for lcore thread\n"); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* initialize services so vdevs register service during bus_probe. */ + ret = rte_service_init(); + if (ret) { + rte_eal_init_alert("rte_service_init() failed\n"); + rte_errno = ENOEXEC; + fctret = -1; + goto finished; + } + + /* Probe all the buses and devices/drivers on them */ + if (rte_bus_probe()) { + rte_eal_init_alert("Cannot probe devices\n"); + rte_errno = ENOTSUP; + fctret = -1; + goto finished; + } + +#ifdef VFIO_PRESENT + /* Register mp action after probe() so that we got enough info */ + if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0) { + fctret = -1; + goto finished; + } +#endif + + /* initialize default service/lcore mappings and start running. Ignore + * -ENOTSUP, as it indicates no service coremask passed to EAL. + */ + ret = rte_service_start_with_defaults(); + if (ret < 0 && ret != -ENOTSUP) { + rte_errno = ENOEXEC; + fctret = -1; + goto finished; + } + + rte_eal_mcfg_complete(); + +finished: + free(logid_storage); + return fctret; +} + +static int +mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg __rte_unused) +{ + /* ms is const, so find this memseg */ + struct rte_memseg *found = rte_mem_virt2memseg(ms->addr, msl); + + found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; + + return 0; +} + +int __rte_experimental +rte_eal_cleanup(void) +{ + /* if we're in a primary process, we need to mark hugepages as freeable + * so that finalization can release them back to the system. + */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_memseg_walk(mark_freeable, NULL); + rte_service_finalize(); + return 0; +} + +/* get core role */ +enum rte_lcore_role_t +rte_eal_lcore_role(unsigned lcore_id) +{ + return rte_config.lcore_role[lcore_id]; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_hugepages(void) +{ + return ! 
internal_config.no_hugetlbfs; +} + +int rte_eal_has_pci(void) +{ + return !internal_config.no_pci; +} + +int rte_eal_create_uio_dev(void) +{ + return internal_config.create_uio_dev; +} + +enum rte_intr_mode +rte_eal_vfio_intr_mode(void) +{ + return internal_config.vfio_intr_mode; +} + +int +rte_eal_check_module(const char *module_name) +{ + char sysfs_mod_name[PATH_MAX]; + struct stat st; + int n; + + if (NULL == module_name) + return -1; + + /* Check if there is sysfs mounted */ + if (stat("/sys/module", &st) != 0) { + RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + /* A module might be built-in, therefore try sysfs */ + n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name); + if (n < 0 || n > PATH_MAX) { + RTE_LOG(DEBUG, EAL, "Could not format module path\n"); + return -1; + } + + if (stat(sysfs_mod_name, &st) != 0) { + RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n", + sysfs_mod_name, errno, strerror(errno)); + return 0; + } + + /* Module has been found */ + return 1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_alarm.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_alarm.c new file mode 100644 index 00000000..391d2a65 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_alarm.c @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef TFD_NONBLOCK +#include +#define TFD_NONBLOCK O_NONBLOCK +#endif + +#define NS_PER_US 1000 +#define US_PER_MS 1000 +#define MS_PER_S 1000 +#define US_PER_S (US_PER_MS * MS_PER_S) + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct timeval time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static int handler_registered = 0; +static void eal_alarm_callback(void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + /* create a timerfd file descriptor */ + intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (intr_handle.fd == -1) + goto error; + + return 0; + +error: + rte_errno = errno; + return -1; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + while ((ap = LIST_FIRST(&alarm_list)) !=NULL && + clock_gettime(CLOCK_TYPE_ID, &now) == 0 && + (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec && + (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + free(ap); + } + + if (!LIST_EMPTY(&alarm_list)) { + struct itimerspec atime = { .it_interval = { 0, 0 } }; + + ap = LIST_FIRST(&alarm_list); + atime.it_value.tv_sec = ap->time.tv_sec; + atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US; + /* 
perform borrow for subtraction if necessary */ + if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US)) + atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US; + + atime.it_value.tv_sec -= now.tv_sec; + atime.it_value.tv_nsec -= now.tv_nsec; + timerfd_settime(intr_handle.fd, 0, &atime, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); +} + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct timespec now; + int ret = 0; + struct alarm_entry *ap, *new_alarm; + + /* Check parameters, including that us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = calloc(1, sizeof(*new_alarm)); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S; + new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + if (!handler_registered) { + ret |= rte_intr_callback_register(&intr_handle, + eal_alarm_callback, NULL); + handler_registered = (ret == 0) ? 1 : 0; + } + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (ap->time.tv_sec > new_alarm->time.tv_sec || + (ap->time.tv_sec == new_alarm->time.tv_sec && + ap->time.tv_usec > new_alarm->time.tv_usec)){ + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + if (LIST_FIRST(&alarm_list) == new_alarm) { + struct itimerspec alarm_time = { + .it_interval = {0, 0}, + .it_value = { + .tv_sec = us / US_PER_S, + .tv_nsec = (us % US_PER_S) * NS_PER_US, + }, + }; + ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); + + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while ((ap = LIST_FIRST(&alarm_list)) != NULL && + cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + } else { + /* If calling from other context, mark that alarm is executing + * so loop can spin till it finish. 
Otherwise we are trying to + * cancel our self - mark it by EINPROGRESS */ + if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + return count; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_cpuflags.c new file mode 100644 index 00000000..d38296e1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_cpuflags.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include + +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 16) +#include +#define HAS_AUXV 1 +#endif +#endif + +#include + +#ifndef HAS_AUXV +static unsigned long +getauxval(unsigned long type __rte_unused) +{ + errno = ENOTSUP; + return 0; +} +#endif + +#ifdef RTE_ARCH_64 +typedef Elf64_auxv_t Internal_Elfx_auxv_t; +#else +typedef Elf32_auxv_t Internal_Elfx_auxv_t; +#endif + +/** + * Provides a method for retrieving values from the auxiliary vector and + * possibly running a string comparison. + * + * @return Always returns a result. When the result is 0, check errno + * to see if an error occurred during processing. 
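/* A usage sketch for the auxiliary-vector helpers implemented in this file:
 * read AT_HWCAP, falling back transparently to /proc/self/auxv when
 * getauxval() is not provided by the C library.  The header location is an
 * assumption, not stated in this patch.
 */
#include <elf.h>            /* AT_HWCAP */
#include <rte_cpuflags.h>   /* assumed header exposing rte_cpu_getauxval() */

static unsigned long example_read_hwcap(void)
{
	return rte_cpu_getauxval(AT_HWCAP);
}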
+ */ +static unsigned long +_rte_cpu_getauxval(unsigned long type, const char *str) +{ + unsigned long val; + + errno = 0; + val = getauxval(type); + + if (!val && (errno == ENOTSUP || errno == ENOENT)) { + int auxv_fd = open("/proc/self/auxv", O_RDONLY); + Internal_Elfx_auxv_t auxv; + + if (auxv_fd == -1) + return 0; + + errno = ENOENT; + while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { + if (auxv.a_type == type) { + errno = 0; + val = auxv.a_un.a_val; + if (str) + val = strcmp((const char *)val, str); + break; + } + } + close(auxv_fd); + } + + return val; +} + +unsigned long +rte_cpu_getauxval(unsigned long type) +{ + return _rte_cpu_getauxval(type, NULL); +} + +int +rte_cpu_strcmp_auxval(unsigned long type, const char *str) +{ + return _rte_cpu_getauxval(type, str); +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_debug.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_debug.c new file mode 100644 index 00000000..5d92500b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_debug.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifdef RTE_BACKTRACE +#include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ +#ifdef RTE_BACKTRACE + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +#endif /* RTE_BACKTRACE */ +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) 
+{ + va_list ap; + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + if (rte_eal_cleanup() != 0) + RTE_LOG(CRIT, EAL, + "EAL could not release all resources\n"); + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_dev.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_dev.c new file mode 100644 index 00000000..1cf6aebf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_dev.c @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static bool monitor_started; + +#define EAL_UEV_MSG_LEN 4096 +#define EAL_UEV_MSG_ELEM_LEN 128 + +static void dev_uev_handler(__rte_unused void *param); + +/* identify the system layer which reports this event. */ +enum eal_dev_event_subsystem { + EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */ + EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */ + EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */ + EAL_DEV_EVENT_SUBSYSTEM_MAX +}; + +static int +dev_uev_socket_fd_create(void) +{ + struct sockaddr_nl addr; + int ret; + + intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC | + SOCK_NONBLOCK, + NETLINK_KOBJECT_UEVENT); + if (intr_handle.fd < 0) { + RTE_LOG(ERR, EAL, "create uevent fd failed.\n"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + addr.nl_pid = 0; + addr.nl_groups = 0xffffffff; + + ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr)); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n"); + goto err; + } + + return 0; +err: + close(intr_handle.fd); + intr_handle.fd = -1; + return ret; +} + +static int +dev_uev_parse(const char *buf, struct rte_dev_event *event, int length) +{ + char action[EAL_UEV_MSG_ELEM_LEN]; + char subsystem[EAL_UEV_MSG_ELEM_LEN]; + char pci_slot_name[EAL_UEV_MSG_ELEM_LEN]; + int i = 0; + + memset(action, 0, EAL_UEV_MSG_ELEM_LEN); + memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN); + memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN); + + while (i < length) { + for (; i < length; i++) { + if (*buf) + break; + buf++; + } + /** + * check device uevent from kernel side, no need to check + * uevent from udev. 
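+ *
+ * A kernel-originated payload is a NUL-separated list of KEY=value
+ * strings; a PCI hot-plug add might look roughly like
+ *   "add@/devices/pci0000:00/0000:00:02.0", "ACTION=add",
+ *   "SUBSYSTEM=pci", "PCI_SLOT_NAME=0000:00:02.0", ...
+ * whereas udev re-broadcasts start with the "libudev" magic and are
+ * rejected below.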
+ */ + if (!strncmp(buf, "libudev", 7)) { + buf += 7; + i += 7; + return -1; + } + if (!strncmp(buf, "ACTION=", 7)) { + buf += 7; + i += 7; + strlcpy(action, buf, sizeof(action)); + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { + buf += 10; + i += 10; + strlcpy(subsystem, buf, sizeof(subsystem)); + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { + buf += 14; + i += 14; + strlcpy(pci_slot_name, buf, sizeof(subsystem)); + event->devname = strdup(pci_slot_name); + } + for (; i < length; i++) { + if (*buf == '\0') + break; + buf++; + } + } + + /* parse the subsystem layer */ + if (!strncmp(subsystem, "uio", 3)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO; + else if (!strncmp(subsystem, "pci", 3)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI; + else if (!strncmp(subsystem, "vfio", 4)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO; + else + return -1; + + /* parse the action type */ + if (!strncmp(action, "add", 3)) + event->type = RTE_DEV_EVENT_ADD; + else if (!strncmp(action, "remove", 6)) + event->type = RTE_DEV_EVENT_REMOVE; + else + return -1; + return 0; +} + +static void +dev_delayed_unregister(void *param) +{ + rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param); + close(intr_handle.fd); + intr_handle.fd = -1; +} + +static void +dev_uev_handler(__rte_unused void *param) +{ + struct rte_dev_event uevent; + int ret; + char buf[EAL_UEV_MSG_LEN]; + + memset(&uevent, 0, sizeof(struct rte_dev_event)); + memset(buf, 0, EAL_UEV_MSG_LEN); + + ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT); + if (ret < 0 && errno == EAGAIN) + return; + else if (ret <= 0) { + /* connection is closed or broken, can not up again. */ + RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n"); + rte_eal_alarm_set(1, dev_delayed_unregister, NULL); + return; + } + + ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN); + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "It is not an valid event " + "that need to be handle.\n"); + return; + } + + RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", + uevent.devname, uevent.type, uevent.subsystem); + + if (uevent.devname) + dev_callback_process(uevent.devname, uevent.type); +} + +int __rte_experimental +rte_dev_event_monitor_start(void) +{ + int ret; + + if (monitor_started) + return 0; + + ret = dev_uev_socket_fd_create(); + if (ret) { + RTE_LOG(ERR, EAL, "error create device event fd.\n"); + return -1; + } + + intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT; + ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL); + + if (ret) { + RTE_LOG(ERR, EAL, "fail to register uevent callback.\n"); + return -1; + } + + monitor_started = true; + + return 0; +} + +int __rte_experimental +rte_dev_event_monitor_stop(void) +{ + int ret; + + if (!monitor_started) + return 0; + + ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler, + (void *)-1); + if (ret < 0) { + RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n"); + return ret; + } + + close(intr_handle.fd); + intr_handle.fd = -1; + monitor_started = false; + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c new file mode 100644 index 00000000..3a7d4b22 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -0,0 +1,525 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include + +#include /* for hugetlb-related flags */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "rte_string_fns.h" +#include "eal_internal_cfg.h" +#include "eal_hugepages.h" +#include "eal_filesystem.h" + +static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; +static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node"; + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +map_shared_memory(const char *filename, const size_t mem_size, int flags) +{ + void *retval; + int fd = open(filename, flags, 0666); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +static void * +open_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR); +} + +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); +} + +/* this function is only called from eal_hugepage_info_init which itself + * is only called from a primary process */ +static uint32_t +get_num_hugepages(const char *subdir) +{ + char path[PATH_MAX]; + long unsigned resv_pages, num_pages = 0; + const char *nr_hp_file = "free_hugepages"; + const char *nr_rsvd_file = "resv_hugepages"; + + /* first, check how many reserved pages kernel reports */ + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, nr_rsvd_file); + if (eal_parse_sysfs_value(path, &resv_pages) < 0) + return 0; + + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* adjust num_pages */ + if (num_pages >= resv_pages) + num_pages -= resv_pages; + else if (resv_pages) + num_pages = 0; + + /* we want to return a uint32_t and more than this looks suspicious + * anyway ... */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint32_t +get_num_hugepages_on_node(const char *subdir, unsigned int socket) +{ + char path[PATH_MAX], socketpath[PATH_MAX]; + DIR *socketdir; + unsigned long num_pages = 0; + const char *nr_hp_file = "free_hugepages"; + + snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages", + sys_pages_numa_dir_path, socket); + + socketdir = opendir(socketpath); + if (socketdir) { + /* Keep calm and carry on */ + closedir(socketdir); + } else { + /* Can't find socket dir, so ignore it */ + return 0; + } + + snprintf(path, sizeof(path), "%s/%s/%s", + socketpath, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* + * we want to return a uint32_t and more than this looks suspicious + * anyway ... 
+ */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint64_t +get_default_hp_size(void) +{ + const char proc_meminfo[] = "/proc/meminfo"; + const char str_hugepagesz[] = "Hugepagesize:"; + unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1; + char buffer[256]; + unsigned long long size = 0; + + FILE *fd = fopen(proc_meminfo, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_meminfo); + while(fgets(buffer, sizeof(buffer), fd)){ + if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){ + size = rte_str_to_size(&buffer[hugepagesz_len]); + break; + } + } + fclose(fd); + if (size == 0) + rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo); + return size; +} + +static int +get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) +{ + enum proc_mount_fieldnames { + DEVICE = 0, + MOUNTPT, + FSTYPE, + OPTIONS, + _FIELDNAME_MAX + }; + static uint64_t default_size = 0; + const char proc_mounts[] = "/proc/mounts"; + const char hugetlbfs_str[] = "hugetlbfs"; + const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1; + const char pagesize_opt[] = "pagesize="; + const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1; + const char split_tok = ' '; + char *splitstr[_FIELDNAME_MAX]; + char buf[BUFSIZ]; + int retval = -1; + + FILE *fd = fopen(proc_mounts, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_mounts); + + if (default_size == 0) + default_size = get_default_hp_size(); + + while (fgets(buf, sizeof(buf), fd)){ + if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, + split_tok) != _FIELDNAME_MAX) { + RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); + break; /* return NULL */ + } + + /* we have a specified --huge-dir option, only examine that dir */ + if (internal_config.hugepage_dir != NULL && + strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0) + continue; + + if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ + const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt); + + /* if no explicit page size, the default page size is compared */ + if (pagesz_str == NULL){ + if (hugepage_sz == default_size){ + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; + break; + } + } + /* there is an explicit page size, so check it */ + else { + uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); + if (pagesz == hugepage_sz) { + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; + break; + } + } + } /* end if strncmp hugetlbfs */ + } /* end while fgets */ + + fclose(fd); + return retval; +} + +/* + * Clear the hugepage directory of whatever hugepage files + * there are. Checks if the file is locked (i.e. + * if it's in use by another DPDK process). 
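+ *
+ * The liveness check is lock based: each file matching "*map_*" is
+ * opened and probed with a non-blocking flock(LOCK_EX | LOCK_NB); if
+ * the exclusive lock is granted, no other process can be using that
+ * page, so the file is unlinked, otherwise it is left in place.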
+ */ +static int +clear_hugedir(const char * hugedir) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + const char filter[] = "*map_*"; /* matches hugepage files */ + + /* open directory */ + dir = opendir(hugedir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n", + hugedir); + goto error; + } + dir_fd = dirfd(dir); + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n", + hugedir); + goto error; + } + + while(dirent != NULL){ + /* skip files that don't match the hugepage pattern */ + if (fnmatch(filter, dirent->d_name, 0) > 0) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, remove the file */ + if (lck_result != -1) + unlinkat(dir_fd, dirent->d_name, 0); + close (fd); + dirent = readdir(dir); + } + + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n", + strerror(errno)); + + return -1; +} + +static int +compare_hpi(const void *a, const void *b) +{ + const struct hugepage_info *hpi_a = a; + const struct hugepage_info *hpi_b = b; + + return hpi_b->hugepage_sz - hpi_a->hugepage_sz; +} + +static void +calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent) +{ + uint64_t total_pages = 0; + unsigned int i; + + /* + * first, try to put all hugepages into relevant sockets, but + * if first attempts fails, fall back to collecting all pages + * in one socket and sorting them later + */ + total_pages = 0; + /* we also don't want to do this for legacy init */ + if (!internal_config.legacy_mem) + for (i = 0; i < rte_socket_count(); i++) { + int socket = rte_socket_id_by_idx(i); + unsigned int num_pages = + get_num_hugepages_on_node( + dirent->d_name, socket); + hpi->num_pages[socket] = num_pages; + total_pages += num_pages; + } + /* + * we failed to sort memory from the get go, so fall + * back to old way + */ + if (total_pages == 0) { + hpi->num_pages[0] = get_num_hugepages(dirent->d_name); + +#ifndef RTE_ARCH_64 + /* for 32-bit systems, limit number of hugepages to + * 1GB per page size */ + hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], + RTE_PGSIZE_1G / hpi->hugepage_sz); +#endif + } +} + +static int +hugepage_info_init(void) +{ const char dirent_start_text[] = "hugepages-"; + const size_t dirent_start_len = sizeof(dirent_start_text) - 1; + unsigned int i, num_sizes = 0; + DIR *dir; + struct dirent *dirent; + + dir = opendir(sys_dir_path); + if (dir == NULL) { + RTE_LOG(ERR, EAL, + "Cannot open directory %s to read system hugepage info\n", + sys_dir_path); + return -1; + } + + for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) { + struct hugepage_info *hpi; + + if (strncmp(dirent->d_name, dirent_start_text, + dirent_start_len) != 0) + continue; + + if (num_sizes >= MAX_HUGEPAGE_SIZES) + break; + + hpi = &internal_config.hugepage_info[num_sizes]; + hpi->hugepage_sz = + rte_str_to_size(&dirent->d_name[dirent_start_len]); + + /* first, check if we have a mountpoint */ + if (get_hugepage_dir(hpi->hugepage_sz, + hpi->hugedir, sizeof(hpi->hugedir)) < 0) { + uint32_t num_pages; + + num_pages = get_num_hugepages(dirent->d_name); + if (num_pages > 0) + RTE_LOG(NOTICE, EAL, + "%" PRIu32 " hugepages of size " + "%" PRIu64 " reserved, but no mounted " + 
"hugetlbfs found for that size\n", + num_pages, hpi->hugepage_sz); + /* if we have kernel support for reserving hugepages + * through mmap, and we're in in-memory mode, treat this + * page size as valid. we cannot be in legacy mode at + * this point because we've checked this earlier in the + * init process. + */ +#ifdef MAP_HUGE_SHIFT + if (internal_config.in_memory) { + RTE_LOG(DEBUG, EAL, "In-memory mode enabled, " + "hugepages of size %" PRIu64 " bytes " + "will be allocated anonymously\n", + hpi->hugepage_sz); + calc_num_pages(hpi, dirent); + num_sizes++; + } +#endif + continue; + } + + /* try to obtain a writelock */ + hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY); + + /* if blocking lock failed */ + if (flock(hpi->lock_descriptor, LOCK_EX) == -1) { + RTE_LOG(CRIT, EAL, + "Failed to lock hugepage directory!\n"); + break; + } + /* clear out the hugepages dir from unused pages */ + if (clear_hugedir(hpi->hugedir) == -1) + break; + + calc_num_pages(hpi, dirent); + + num_sizes++; + } + closedir(dir); + + /* something went wrong, and we broke from the for loop above */ + if (dirent != NULL) + return -1; + + internal_config.num_hugepage_sizes = num_sizes; + + /* sort the page directory entries by size, largest to smallest */ + qsort(&internal_config.hugepage_info[0], num_sizes, + sizeof(internal_config.hugepage_info[0]), compare_hpi); + + /* now we have all info, check we have at least one valid size */ + for (i = 0; i < num_sizes; i++) { + /* pages may no longer all be on socket 0, so check all */ + unsigned int j, num_pages = 0; + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + num_pages += hpi->num_pages[j]; + if (num_pages > 0) + return 0; + } + + /* no valid hugepage mounts available, return error */ + return -1; +} + +/* + * when we initialize the hugepage info, everything goes + * to socket 0 by default. it will later get sorted by memory + * initialization procedure. 
+ */ +int +eal_hugepage_info_init(void) +{ + struct hugepage_info *hpi, *tmp_hpi; + unsigned int i; + + if (hugepage_info_init() < 0) + return -1; + + /* for no shared files mode, we're done */ + if (internal_config.no_shconf) + return 0; + + hpi = &internal_config.hugepage_info[0]; + + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + return -1; + } + + memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); + + /* we've copied file descriptors along with everything else, but they + * will be invalid in secondary process, so overwrite them + */ + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + struct hugepage_info *tmp = &tmp_hpi[i]; + tmp->lock_descriptor = -1; + } + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} + +int eal_hugepage_info_read(void) +{ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + + tmp_hpi = open_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); + return -1; + } + + memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_interrupts.c new file mode 100644 index 00000000..4076c6d6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -0,0 +1,1230 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_vfio.h" +#include "eal_thread.h" + +#define EAL_INTR_EPOLL_WAIT_FOREVER (-1) +#define NB_OTHER_INTR 1 + +static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ + +/** + * union for pipe fds. 
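+ *
+ * The anonymous structs overlay pipefd[2], so readfd aliases pipefd[0]
+ * and writefd aliases pipefd[1]. Writing a single byte to writefd is
+ * how callback (un)registration nudges the interrupt thread into
+ * rebuilding its epoll set.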
+ */ +union intr_pipefds{ + struct { + int pipefd[2]; + }; + struct { + int readfd; + int writefd; + }; +}; + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + int uio_intr_count; /* for uio device */ +#ifdef VFIO_PRESENT + uint64_t vfio_intr_count; /* for vfio device */ +#endif + uint64_t timerfd_num; /* for timerfd */ + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* union buffer for pipe read/write */ +static union intr_pipefds intr_pipe; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +/* VFIO interrupts */ +#ifdef VFIO_PRESENT + +#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int)) +/* irq set buffer length for queue interrupts and LSC interrupt */ +#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ + sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1)) + +/* enable legacy (INTx) interrupts */ +static int +vfio_enable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + /* enable INTx */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* unmask INTx after enabling */ + memset(irq_set, 0, len); + len = sizeof(struct vfio_irq_set); + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable legacy (INTx) interrupts */ +static int +vfio_disable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + /* mask interrupts before disabling */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* disable INTx*/ + memset(irq_set, 0, len); + 
irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, + "Error disabling INTx interrupts for fd %d\n", intr_handle->fd); + return -1; + } + return 0; +} + +/* enable MSI interrupts */ +static int +vfio_enable_msi(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable MSI interrupts */ +static int +vfio_disable_msi(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +/* enable MSI-X interrupts */ +static int +vfio_enable_msix(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ + irq_set->count = intr_handle->max_intr ? + (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? 
+ RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + /* INTR vector offset 0 reserve for non-efds mapping */ + fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; + memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, + sizeof(*intr_handle->efds) * intr_handle->nb_efd); + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable MSI-X interrupts */ +static int +vfio_disable_msix(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd); + + return ret; +} +#endif + +static int +uio_intx_intr_disable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* disable interrupts */ + command_high |= 0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intx_intr_enable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* enable interrupts */ + command_high &= ~0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intr_disable(const struct rte_intr_handle *intr_handle) +{ + const int value = 0; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +static int +uio_intr_enable(const struct rte_intr_handle *intr_handle) +{ + const int value = 1; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +int +rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + int ret, wake_thread; + struct rte_intr_source *src; + struct rte_intr_callback *callback; + + wake_thread = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + 
return -EINVAL; + } + + /* allocate a new interrupt callback entity */ + callback = calloc(1, sizeof(*callback)); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + return -ENOMEM; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + + rte_spinlock_lock(&intr_lock); + + /* check if there is at least one callback registered for the fd */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) { + /* we had no interrupts for this */ + if (TAILQ_EMPTY(&src->callbacks)) + wake_thread = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + ret = 0; + break; + } + } + + /* no existing callbacks for this - add new source */ + if (src == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + free(callback); + ret = -ENOMEM; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + wake_thread = 1; + ret = 0; + } + } + + rte_spinlock_unlock(&intr_lock); + + /** + * check if need to notify the pipe fd waited by epoll_wait to + * rebuild the wait list. + */ + if (wake_thread) + if (write(intr_pipe.writefd, "1", 1) < 0) + return -EPIPE; + + return ret; +} + +int +rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + ret = 0; + + /*walk through the callbacks and remove all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. 
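+ * Note that a cb_arg of (void *)-1 acts as a wildcard in the matching
+ * loop above, so one call can drop every callback registered with a
+ * given cb_fn; once the list is empty the source itself is released.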
*/ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } + + rte_spinlock_unlock(&intr_lock); + + /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ + if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { + ret = -EPIPE; + } + + return ret; +} + +int +rte_intr_enable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to enable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_enable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_enable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_enable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_enable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_enable_intx(intr_handle)) + return -1; + break; +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_disable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to disable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_disable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_disable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_disable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_disable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_disable_intx(intr_handle)) + return -1; + break; +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +static int +eal_intr_process_interrupts(struct epoll_event *events, int nfds) +{ + bool call = false; + int n, bytes_read; + struct rte_intr_source *src; + struct rte_intr_callback *cb; + union rte_intr_read_buffer buf; + struct rte_intr_callback active_cb; + + for (n = 0; n < nfds; n++) { + + /** + * if the pipe fd is ready to read, return out to + * rebuild the wait list. + */ + if (events[n].data.fd == intr_pipe.readfd){ + int r = read(intr_pipe.readfd, buf.charbuf, + sizeof(buf.charbuf)); + RTE_SET_USED(r); + return -1; + } + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == + events[n].data.fd) + break; + if (src == NULL){ + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. 
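+ * The active flag is what makes rte_intr_callback_unregister() bail
+ * out with -EAGAIN instead of freeing a callback that is still being
+ * executed here.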
*/ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; + case RTE_INTR_HANDLE_ALARM: + bytes_read = sizeof(buf.timerfd_num); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(events[n].data.fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + events[n].data.fd, + strerror(errno)); + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", events[n].data.fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + + /* we done with that interrupt source, release it. */ + src->active = 0; + rte_spinlock_unlock(&intr_lock); + } + + return 0; +} + +/** + * It handles all the interrupts. + * + * @param pfd + * epoll file descriptor. + * @param totalfds + * The number of file descriptors added in epoll. + * + * @return + * void + */ +static void +eal_intr_handle_interrupts(int pfd, unsigned totalfds) +{ + struct epoll_event events[totalfds]; + int nfds = 0; + + for(;;) { + nfds = epoll_wait(pfd, events, totalfds, + EAL_INTR_EPOLL_WAIT_FOREVER); + /* epoll_wait fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "epoll_wait returns with fail\n"); + return; + } + /* epoll_wait timeout, will never happens here */ + else if (nfds == 0) + continue; + /* epoll_wait has at least one fd ready to read */ + if (eal_intr_process_interrupts(events, nfds) < 0) + return; + } +} + +/** + * It builds/rebuilds up the epoll file descriptor with all the + * file descriptors being waited on. Then handles the interrupts. + * + * @param arg + * pointer. (unused) + * + * @return + * never return; + */ +static __attribute__((noreturn)) void * +eal_intr_thread_main(__rte_unused void *arg) +{ + struct epoll_event ev; + + /* host thread, never break out */ + for (;;) { + /* build up the epoll fd with all descriptors we are to + * wait on then pass it to the handle_interrupts function + */ + static struct epoll_event pipe_event = { + .events = EPOLLIN | EPOLLPRI, + }; + struct rte_intr_source *src; + unsigned numfds = 0; + + /* create epoll fd */ + int pfd = epoll_create(1); + if (pfd < 0) + rte_panic("Cannot create epoll instance\n"); + + pipe_event.data.fd = intr_pipe.readfd; + /** + * add pipe fd into wait list, this pipe is used to + * rebuild the wait list. 
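+ * Whenever a callback is registered or unregistered, one byte is
+ * written to intr_pipe.writefd; eal_intr_process_interrupts() then
+ * returns -1, eal_intr_handle_interrupts() returns, and this outer
+ * loop starts over with a freshly built epoll set.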
+ */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, + &pipe_event) < 0) { + rte_panic("Error adding fd to %d epoll_ctl, %s\n", + intr_pipe.readfd, strerror(errno)); + } + numfds++; + + rte_spinlock_lock(&intr_lock); + + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->callbacks.tqh_first == NULL) + continue; /* skip those with no callbacks */ + ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; + ev.data.fd = src->intr_handle.fd; + + /** + * add all the uio device file descriptor + * into wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, + src->intr_handle.fd, &ev) < 0){ + rte_panic("Error adding fd %d epoll_ctl, %s\n", + src->intr_handle.fd, strerror(errno)); + } + else + numfds++; + } + rte_spinlock_unlock(&intr_lock); + /* serve the interrupt */ + eal_intr_handle_interrupts(pfd, numfds); + + /** + * when we return, we need to rebuild the + * list of fds to monitor. + */ + close(pfd); + } +} + +int +rte_eal_intr_init(void) +{ + int ret = 0; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + /** + * create a pipe which will be waited by epoll and notified to + * rebuild the wait list of epoll. + */ + if (pipe(intr_pipe.pipefd) < 0) { + rte_errno = errno; + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = -ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } + + return ret; +} + +static void +eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle) +{ + union rte_intr_read_buffer buf; + int bytes_read = 0; + int nbytes; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + case RTE_INTR_HANDLE_VDEV: + bytes_read = intr_handle->efd_counter_size; + /* For vdev, number of bytes to read is set by driver */ + break; + case RTE_INTR_HANDLE_EXT: + return; + default: + bytes_read = 1; + RTE_LOG(INFO, EAL, "unexpected intr type\n"); + break; + } + + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. 
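+ * The width of the read depends on the handle type: UIO exposes a
+ * 4-byte interrupt count, VFIO eventfds are 8-byte counters, and vdev
+ * drivers declare their own efd_counter_size.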
+ */ + if (bytes_read == 0) + return; + do { + nbytes = read(fd, &buf, bytes_read); + if (nbytes < 0) { + if (errno == EINTR || errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + RTE_LOG(ERR, EAL, + "Error reading from fd %d: %s\n", + fd, strerror(errno)); + } else if (nbytes == 0) + RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd); + return; + } while (1); +} + +static int +eal_epoll_process_event(struct epoll_event *evs, unsigned int n, + struct rte_epoll_event *events) +{ + unsigned int i, count = 0; + struct rte_epoll_event *rev; + + for (i = 0; i < n; i++) { + rev = evs[i].data.ptr; + if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID, + RTE_EPOLL_EXEC)) + continue; + + events[count].status = RTE_EPOLL_VALID; + events[count].fd = rev->fd; + events[count].epfd = rev->epfd; + events[count].epdata.event = rev->epdata.event; + events[count].epdata.data = rev->epdata.data; + if (rev->epdata.cb_fun) + rev->epdata.cb_fun(rev->fd, + rev->epdata.cb_arg); + + rte_compiler_barrier(); + rev->status = RTE_EPOLL_VALID; + count++; + } + return count; +} + +static inline int +eal_init_tls_epfd(void) +{ + int pfd = epoll_create(255); + + if (pfd < 0) { + RTE_LOG(ERR, EAL, + "Cannot create epoll instance\n"); + return -1; + } + return pfd; +} + +int +rte_intr_tls_epfd(void) +{ + if (RTE_PER_LCORE(_epfd) == -1) + RTE_PER_LCORE(_epfd) = eal_init_tls_epfd(); + + return RTE_PER_LCORE(_epfd); +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + struct epoll_event evs[maxevents]; + int rc; + + if (!events) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + while (1) { + rc = epoll_wait(epfd, evs, maxevents, timeout); + if (likely(rc > 0)) { + /* epoll_wait has at least one fd ready to read */ + rc = eal_epoll_process_event(evs, rc, events); + break; + } else if (rc < 0) { + if (errno == EINTR) + continue; + /* epoll_wait fail */ + RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n", + strerror(errno)); + rc = -1; + break; + } else { + /* rc == 0, epoll_wait timed out */ + break; + } + } + + return rc; +} + +static inline void +eal_epoll_data_safe_free(struct rte_epoll_event *ev) +{ + while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID, + RTE_EPOLL_INVALID)) + while (ev->status != RTE_EPOLL_VALID) + rte_pause(); + memset(&ev->epdata, 0, sizeof(ev->epdata)); + ev->fd = -1; + ev->epfd = -1; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event) +{ + struct epoll_event ev; + + if (!event) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + if (op == EPOLL_CTL_ADD) { + event->status = RTE_EPOLL_VALID; + event->fd = fd; /* ignore fd in event */ + event->epfd = epfd; + ev.data.ptr = (void *)event; + } + + ev.events = event->epdata.event; + if (epoll_ctl(epfd, op, fd, &ev) < 0) { + RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n", + op, fd, strerror(errno)); + if (op == EPOLL_CTL_ADD) + /* rollback status when CTL_ADD fail */ + event->status = RTE_EPOLL_INVALID; + return -1; + } + + if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID) + eal_epoll_data_safe_free(event); + + return 0; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd, + int op, unsigned int vec, void *data) +{ + struct rte_epoll_event *rev; + struct 
rte_epoll_data *epdata; + int epfd_op; + unsigned int efd_idx; + int rc = 0; + + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? + (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + + if (!intr_handle || intr_handle->nb_efd == 0 || + efd_idx >= intr_handle->nb_efd) { + RTE_LOG(ERR, EAL, "Wrong intr vector number.\n"); + return -EPERM; + } + + switch (op) { + case RTE_INTR_EVENT_ADD: + epfd_op = EPOLL_CTL_ADD; + rev = &intr_handle->elist[efd_idx]; + if (rev->status != RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event already been added.\n"); + return -EEXIST; + } + + /* attach to intr vector fd */ + epdata = &rev->epdata; + epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; + epdata->data = data; + epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr; + epdata->cb_arg = (void *)intr_handle; + rc = rte_epoll_ctl(epfd, epfd_op, + intr_handle->efds[efd_idx], rev); + if (!rc) + RTE_LOG(DEBUG, EAL, + "efd %d associated with vec %d added on epfd %d" + "\n", rev->fd, vec, epfd); + else + rc = -EPERM; + break; + case RTE_INTR_EVENT_DEL: + epfd_op = EPOLL_CTL_DEL; + rev = &intr_handle->elist[efd_idx]; + if (rev->status == RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event does not exist.\n"); + return -EPERM; + } + + rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); + if (rc) + rc = -EPERM; + break; + default: + RTE_LOG(ERR, EAL, "event op type mismatch\n"); + rc = -EPERM; + } + + return rc; +} + +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + struct rte_epoll_event *rev; + + for (i = 0; i < intr_handle->nb_efd; i++) { + rev = &intr_handle->elist[i]; + if (rev->status == RTE_EPOLL_INVALID) + continue; + if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) { + /* force free if the entry valid */ + eal_epoll_data_safe_free(rev); + rev->status = RTE_EPOLL_INVALID; + } + } +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + uint32_t i; + int fd; + uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + + assert(nb_efd != 0); + + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { + for (i = 0; i < n; i++) { + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + RTE_LOG(ERR, EAL, + "can't setup eventfd, error %i (%s)\n", + errno, strerror(errno)); + return -errno; + } + intr_handle->efds[i] = fd; + } + intr_handle->nb_efd = n; + intr_handle->max_intr = NB_OTHER_INTR + n; + } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) { + /* only check, initialization would be done in vdev driver.*/ + if (intr_handle->efd_counter_size > + sizeof(union rte_intr_read_buffer)) { + RTE_LOG(ERR, EAL, "the efd_counter_size is oversized"); + return -EINVAL; + } + } else { + intr_handle->efds[0] = intr_handle->fd; + intr_handle->nb_efd = RTE_MIN(nb_efd, 1U); + intr_handle->max_intr = NB_OTHER_INTR; + } + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + + rte_intr_free_epoll_fd(intr_handle); + if (intr_handle->max_intr > intr_handle->nb_efd) { + for (i = 0; i < intr_handle->nb_efd; i++) + close(intr_handle->efds[i]); + } + intr_handle->nb_efd = 0; + intr_handle->max_intr = 0; +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + return !(!intr_handle->nb_efd); +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + if (!rte_intr_dp_is_en(intr_handle)) + return 1; + else + return !!(intr_handle->max_intr - intr_handle->nb_efd); +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + if (intr_handle->type == 
RTE_INTR_HANDLE_VFIO_MSIX) + return 1; + + if (intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 1; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_lcore.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_lcore.c new file mode 100644 index 00000000..bc896584 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_lcore.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_filesystem.h" +#include "eal_thread.h" + +#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u" +#define CORE_ID_FILE "topology/core_id" +#define NUMA_NODE_PATH "/sys/devices/system/node" + +/* Check if a cpu is present by the presence of the cpu information for it */ +int +eal_cpu_detected(unsigned lcore_id) +{ + char path[PATH_MAX]; + int len = snprintf(path, sizeof(path), SYS_CPU_DIR + "/"CORE_ID_FILE, lcore_id); + if (len <= 0 || (unsigned)len >= sizeof(path)) + return 0; + if (access(path, F_OK) != 0) + return 0; + + return 1; +} + +/* + * Get CPU socket id (NUMA node) for a logical core. + * + * This searches each nodeX directories in /sys for the symlink for the given + * lcore_id and returns the numa node where the lcore is found. If lcore is not + * found on any numa node, returns zero. + */ +unsigned +eal_cpu_socket_id(unsigned lcore_id) +{ + unsigned socket; + + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, + socket, lcore_id); + if (access(path, F_OK) == 0) + return socket; + } + return 0; +} + +/* Get the cpu core id value from the /sys/.../cpuX core_id value */ +unsigned +eal_cpu_core_id(unsigned lcore_id) +{ + char path[PATH_MAX]; + unsigned long id; + + int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE); + if (len <= 0 || (unsigned)len >= sizeof(path)) + goto err; + if (eal_parse_sysfs_value(path, &id) != 0) + goto err; + return (unsigned)id; + +err: + RTE_LOG(ERR, EAL, "Error reading core id value from %s " + "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_log.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_log.c new file mode 100644 index 00000000..9d02dddb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_log.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +/* + * default log function + */ +static ssize_t +console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) +{ + ssize_t ret; + + /* write on stdout */ + ret = fwrite(buf, 1, size, stdout); + fflush(stdout); + + /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ + syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf); + + return ret; +} + +static cookie_io_functions_t console_log_func = { + .write = console_log_write, +}; + +/* + * set the log to default function, called during eal init process, + * once memzones are available. 
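+ *
+ * Every message written to the returned stream goes both to stdout
+ * and to syslog; RTE log levels start at 1 while syslog priorities
+ * start at 0, hence the "subtract 1" conversion in console_log_write()
+ * above.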
+ */ +int +rte_eal_log_init(const char *id, int facility) +{ + FILE *log_stream; + + log_stream = fopencookie(NULL, "w+", console_log_func); + if (log_stream == NULL) + return -1; + + openlog(id, LOG_NDELAY | LOG_PID, facility); + + eal_log_set_default(log_stream); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_memalloc.c new file mode 100644 index 00000000..aa95551a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_memalloc.c @@ -0,0 +1,1363 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include +#include +#endif +#include +#include /* for hugetlb-related mmap flags */ + +#include +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "eal_private.h" + +const int anonymous_hugepages_supported = +#ifdef MAP_HUGE_SHIFT + 1; +#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT +#else + 0; +#define RTE_MAP_HUGE_SHIFT 26 +#endif + +/* + * not all kernel version support fallocate on hugetlbfs, so fall back to + * ftruncate and disallow deallocation if fallocate is not supported. + */ +static int fallocate_supported = -1; /* unknown */ + +/* for single-file segments, we need some kind of mechanism to keep track of + * which hugepages can be freed back to the system, and which cannot. we cannot + * use flock() because they don't allow locking parts of a file, and we cannot + * use fcntl() due to issues with their semantics, so we will have to rely on a + * bunch of lockfiles for each page. + * + * we cannot know how many pages a system will have in advance, but we do know + * that they come in lists, and we know lengths of these lists. so, simply store + * a malloc'd array of fd's indexed by list and segment index. + * + * they will be initialized at startup, and filled as we allocate/deallocate + * segments. also, use this to track memseg list proper fd. + */ +static struct { + int *fds; /**< dynamically allocated array of segment lock fd's */ + int memseg_list_fd; /**< memseg list fd */ + int len; /**< total length of the array */ + int count; /**< entries used in an array */ +} lock_fds[RTE_MAX_MEMSEG_LISTS]; + +/** local copy of a memory map, used to synchronize memory hotplug in MP */ +static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; + +static sigjmp_buf huge_jmpenv; + +static void __rte_unused huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. 
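+ * Keeping the sigsetjmp() call in its own trivial wrapper means there
+ * are no such locals to clobber; callers only ever look at the
+ * wrapper's return value.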
+ */ +static int __rte_unused huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void __rte_unused +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void __rte_unused +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +static bool +check_numa(void) +{ + bool ret = true; + /* Check if kernel supports NUMA. */ + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + ret = false; + } + return ret; +} + +static void +prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id) +{ + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + if (get_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. " + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + oldpolicy = MPOL_DEFAULT; + } + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + socket_id); + numa_set_preferred(socket_id); +} + +static void +restore_numa(int *oldpolicy, struct bitmask *oldmask) +{ + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", *oldpolicy); + if (*oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(*oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + numa_free_cpumask(oldmask); +} +#endif + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +get_file_size(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ +static int lock(int fd, int type) +{ + int ret; + + /* flock may be interrupted */ + do { + ret = flock(fd, type | LOCK_NB); + } while (ret && errno == EINTR); + + if (ret && errno == EWOULDBLOCK) { + /* couldn't lock */ + return 0; + } else if (ret) { + RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n", + __func__, strerror(errno)); + return -1; + } + /* lock was successful */ + return 1; +} + +static int get_segment_lock_fd(int list_idx, int seg_idx) +{ + char path[PATH_MAX] = {0}; + int fd; + + if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + return -1; + if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + return -1; + + fd = lock_fds[list_idx].fds[seg_idx]; + /* does this lock already exist? 
*/ + if (fd >= 0) + return fd; + + eal_get_hugefile_lock_path(path, sizeof(path), + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + + fd = open(path, O_CREAT | O_RDWR, 0660); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n", + __func__, path, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) != 1) { + RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n", + __func__, path, strerror(errno)); + close(fd); + return -1; + } + /* store it for future reference */ + lock_fds[list_idx].fds[seg_idx] = fd; + lock_fds[list_idx].count++; + return fd; +} + +static int unlock_segment(int list_idx, int seg_idx) +{ + int fd, ret; + + if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + return -1; + if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + return -1; + + fd = lock_fds[list_idx].fds[seg_idx]; + + /* upgrade lock to exclusive to see if we can remove the lockfile */ + ret = lock(fd, LOCK_EX); + if (ret == 1) { + /* we've succeeded in taking exclusive lock, this lockfile may + * be removed. + */ + char path[PATH_MAX] = {0}; + eal_get_hugefile_lock_path(path, sizeof(path), + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + if (unlink(path)) { + RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n", + __func__, path, strerror(errno)); + } + } + /* we don't want to leak the fd, so even if we fail to lock, close fd + * and remove it from list anyway. + */ + close(fd); + lock_fds[list_idx].fds[seg_idx] = -1; + lock_fds[list_idx].count--; + + if (ret < 0) + return -1; + return 0; +} + +static int +get_seg_fd(char *path, int buflen, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + int fd; + + if (internal_config.single_file_segments) { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); + + fd = lock_fds[list_idx].memseg_list_fd; + + if (fd < 0) { + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock and keep it indefinitely */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + lock_fds[list_idx].memseg_list_fd = fd; + } + } else { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + } + return fd; +} + +static int +resize_hugefile(int fd, char *path, int list_idx, int seg_idx, + uint64_t fa_offset, uint64_t page_sz, bool grow) +{ + bool again = false; + do { + if (fallocate_supported == 0) { + /* we cannot deallocate memory if fallocate() is not + * supported, and hugepage file is already locked at + * creation, so no further synchronization needed. 
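Aside: get_segment_lock_fd() and unlock_segment() above implement a "last user deletes the file" protocol: every user of a page holds a shared flock on a per-page lockfile, and whoever succeeds in upgrading to an exclusive lock knows it is the last user and may unlink the file. A compressed, single-process sketch of that protocol with an illustrative path:

/* Sketch of the per-page lockfile protocol used above: hold LOCK_SH while
 * the page is in use; on release, try to upgrade to LOCK_EX - success means
 * no other process still holds a shared lock, so the lockfile can go away.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/file.h>
#include <unistd.h>

#define LOCKFILE "/tmp/page0.lock"	/* illustrative path */

int
main(void)
{
	int fd = open(LOCKFILE, O_CREAT | O_RDWR, 0660);

	if (fd < 0)
		return 1;

	/* "allocate": mark ourselves as a user of the page */
	if (flock(fd, LOCK_SH | LOCK_NB) != 0)
		return 1;

	/* ... page is in use ... */

	/* "free": if the upgrade succeeds, nobody else uses the page */
	if (flock(fd, LOCK_EX | LOCK_NB) == 0) {
		printf("last user - removing %s\n", LOCKFILE);
		unlink(LOCKFILE);
	} else {
		printf("other users remain - leaving lockfile in place\n");
	}
	close(fd);	/* drops whatever lock we still hold */
	return 0;
}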
+ */ + + if (!grow) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n", + __func__); + return -1; + } + uint64_t new_size = fa_offset + page_sz; + uint64_t cur_size = get_file_size(fd); + + /* fallocate isn't supported, fall back to ftruncate */ + if (new_size > cur_size && + ftruncate(fd, new_size) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + return -1; + } + } else { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret, lock_fd; + + /* if fallocate() is supported, we need to take out a + * read lock on allocate (to prevent other processes + * from deallocating this page), and take out a write + * lock on deallocate (to ensure nobody else is using + * this page). + * + * read locks on page itself are already taken out at + * file creation, in get_seg_fd(). + * + * we cannot rely on simple use of flock() call, because + * we need to be able to lock a section of the file, + * and we cannot use fcntl() locks, because of numerous + * problems with their semantics, so we will use + * deterministically named lock files for each section + * of the file. + * + * if we're shrinking the file, we want to upgrade our + * lock from shared to exclusive. + * + * lock_fd is an fd for a lockfile, not for the segment + * list. + */ + lock_fd = get_segment_lock_fd(list_idx, seg_idx); + + if (!grow) { + /* we are using this lockfile to determine + * whether this particular page is locked, as we + * are in single file segments mode and thus + * cannot use regular flock() to get this info. + * + * we want to try and take out an exclusive lock + * on the lock file to determine if we're the + * last ones using this page, and if not, we + * won't be shrinking it, and will instead exit + * prematurely. + */ + ret = lock(lock_fd, LOCK_EX); + + /* drop the lock on the lockfile, so that even + * if we couldn't shrink the file ourselves, we + * are signalling to other processes that we're + * no longer using this page. + */ + if (unlock_segment(list_idx, seg_idx)) + RTE_LOG(ERR, EAL, "Could not unlock segment\n"); + + /* additionally, if this was the last lock on + * this segment list, we can safely close the + * page file fd, so that one of the processes + * could then delete the file after shrinking. + */ + if (ret < 1 && lock_fds[list_idx].count == 0) { + close(fd); + lock_fds[list_idx].memseg_list_fd = -1; + } + + if (ret < 0) { + RTE_LOG(ERR, EAL, "Could not lock segment\n"); + return -1; + } + if (ret == 0) + /* failed to lock, not an error. */ + return 0; + } + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + if (fallocate_supported == -1 && + errno == ENOTSUP) { + RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n", + __func__); + again = true; + fallocate_supported = 0; + } else { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + } else { + fallocate_supported = 1; + + /* we've grew/shrunk the file, and we hold an + * exclusive lock now. check if there are no + * more segments active in this segment list, + * and remove the file if there aren't. 
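Aside: resize_hugefile() above grows the backing file with a plain fallocate() and shrinks it with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, remembering an ENOTSUP result so it can fall back to ftruncate()-only growth and disable deallocation. A minimal sketch of that probe; the temporary file path and size are illustrative, and hole punching needs kernel/filesystem support:

/* Sketch of the grow/punch-hole logic in resize_hugefile() above: grow with
 * fallocate(), shrink by punching a hole, treat ENOTSUP as "fall back to
 * ftruncate() and never deallocate". Path/size are illustrative.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int
main(void)
{
	const off_t page_sz = 4096;
	int fd = open("/tmp/resize-demo", O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		return 1;

	/* grow: allocate one page worth of space at offset 0 */
	if (fallocate(fd, 0, 0, page_sz) < 0) {
		if (errno == ENOTSUP) {
			/* e.g. older kernels/filesystems: grow via
			 * ftruncate, give up on ever freeing space back */
			if (ftruncate(fd, page_sz) < 0)
				perror("ftruncate");
		} else {
			perror("fallocate(grow)");
		}
	} else if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			0, page_sz) < 0) {
		/* shrink: return the space while keeping the file length */
		perror("fallocate(punch hole)");
	}

	close(fd);
	unlink("/tmp/resize-demo");
	return 0;
}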
+ */ + if (lock_fds[list_idx].count == 0) { + if (unlink(path)) + RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", + __func__, path, + strerror(errno)); + close(fd); + lock_fds[list_idx].memseg_list_fd = -1; + } + } + } + } while (again); + return 0; +} + +static int +alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, + struct hugepage_info *hi, unsigned int list_idx, + unsigned int seg_idx) +{ +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int cur_socket_id = 0; +#endif + uint64_t map_offset; + rte_iova_t iova; + void *va; + char path[PATH_MAX]; + int ret = 0; + int fd; + size_t alloc_sz; + int flags; + void *new_addr; + + alloc_sz = hi->hugepage_sz; + if (!internal_config.single_file_segments && + internal_config.in_memory && + anonymous_hugepages_supported) { + int log2, flags; + + log2 = rte_log2_u32(alloc_sz); + /* as per mmap() manpage, all page sizes are log2 of page size + * shifted by MAP_HUGE_SHIFT + */ + flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED | + MAP_PRIVATE | MAP_ANONYMOUS; + fd = -1; + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0); + + /* single-file segments codepath will never be active because + * in-memory mode is incompatible with it and it's stopped at + * EAL initialization stage, however the compiler doesn't know + * that and complains about map_offset being used uninitialized + * on failure codepaths while having in-memory mode enabled. so, + * assign a value here. + */ + map_offset = 0; + } else { + /* takes out a read lock on segment or segment list */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n"); + return -1; + } + + if (internal_config.single_file_segments) { + map_offset = seg_idx * alloc_sz; + ret = resize_hugefile(fd, path, list_idx, seg_idx, + map_offset, alloc_sz, true); + if (ret < 0) + goto resized; + } else { + map_offset = 0; + if (ftruncate(fd, alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + if (internal_config.hugepage_unlink) { + if (unlink(path)) { + RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + } + } + + /* + * map the segment, and populate page tables, the kernel fills + * this segment with zeros if it's a new page. + */ + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, + map_offset); + } + + if (va == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, + strerror(errno)); + /* mmap failed, but the previous region might have been + * unmapped anyway. try to remap it + */ + goto unmapped; + } + if (va != addr) { + RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__); + munmap(va, alloc_sz); + goto resized; + } + + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n", + (unsigned int)(alloc_sz >> 20)); + goto mapped; + } + + /* we need to trigger a write to the page to enforce page fault and + * ensure that page is accessible to us, but we can't overwrite value + * that is already there, so read the old value, and write itback. 
+ * kernel populates the page with zeroes initially. + */ + *(volatile int *)addr = *(volatile int *)addr; + + iova = rte_mem_virt2iova(addr); + if (iova == RTE_BAD_PHYS_ADDR) { + RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n", + __func__); + goto mapped; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0); + + if (cur_socket_id != socket_id) { + RTE_LOG(DEBUG, EAL, + "%s(): allocation happened on wrong socket (wanted %d, got %d)\n", + __func__, socket_id, cur_socket_id); + goto mapped; + } +#endif + /* for non-single file segments that aren't in-memory, we can close fd + * here */ + if (!internal_config.single_file_segments && !internal_config.in_memory) + close(fd); + + ms->addr = addr; + ms->hugepage_sz = alloc_sz; + ms->len = alloc_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->iova = iova; + ms->socket_id = socket_id; + + return 0; + +mapped: + munmap(addr, alloc_sz); +unmapped: + flags = MAP_FIXED; +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags); + if (new_addr != addr) { + if (new_addr != NULL) + munmap(new_addr, alloc_sz); + /* we're leaving a hole in our virtual address space. if + * somebody else maps this hole now, we could accidentally + * override it in the future. + */ + RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); + } +resized: + /* in-memory mode will never be single-file-segments mode */ + if (internal_config.single_file_segments) { + resize_hugefile(fd, path, list_idx, seg_idx, map_offset, + alloc_sz, false); + /* ignore failure, can't make it any worse */ + } else { + /* only remove file if we can take out a write lock */ + if (internal_config.hugepage_unlink == 0 && + internal_config.in_memory == 0 && + lock(fd, LOCK_EX) == 1) + unlink(path); + close(fd); + } + return -1; +} + +static int +free_seg(struct rte_memseg *ms, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + uint64_t map_offset; + char path[PATH_MAX]; + int fd, ret; + + /* erase page data */ + memset(ms->addr, 0, ms->len); + + if (mmap(ms->addr, ms->len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == + MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "couldn't unmap page\n"); + return -1; + } + + /* if we've already unlinked the page, nothing needs to be done */ + if (internal_config.hugepage_unlink) { + memset(ms, 0, sizeof(*ms)); + return 0; + } + + /* if we are not in single file segments mode, we're going to unmap the + * segment and thus drop the lock on original fd, but hugepage dir is + * now locked so we can take out another one without races. + */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) + return -1; + + if (internal_config.single_file_segments) { + map_offset = seg_idx * ms->len; + if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset, + ms->len, false)) + return -1; + ret = 0; + } else { + /* if we're able to take out a write lock, we're the last one + * holding onto this page. + */ + ret = lock(fd, LOCK_EX); + if (ret >= 0) { + /* no one else is using this page */ + if (ret == 1) + unlink(path); + } + /* closing fd will drop the lock */ + close(fd); + } + + memset(ms, 0, sizeof(*ms)); + + return ret < 0 ? 
-1 : 0; +} + +struct alloc_walk_param { + struct hugepage_info *hi; + struct rte_memseg **ms; + size_t page_sz; + unsigned int segs_allocated; + unsigned int n_segs; + int socket; + bool exact; +}; +static int +alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct alloc_walk_param *wa = arg; + struct rte_memseg_list *cur_msl; + size_t page_sz; + int cur_idx, start_idx, j, dir_fd = -1; + unsigned int msl_idx, need, i; + + if (msl->page_sz != wa->page_sz) + return 0; + if (msl->socket_id != wa->socket) + return 0; + + page_sz = (size_t)msl->page_sz; + + msl_idx = msl - mcfg->memsegs; + cur_msl = &mcfg->memsegs[msl_idx]; + + need = wa->n_segs; + + /* try finding space in memseg list */ + cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need); + if (cur_idx < 0) + return 0; + start_idx = cur_idx; + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + * + * during init, we already hold a write lock, so don't try to take out + * another one. + */ + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + for (i = 0; i < need; i++, cur_idx++) { + struct rte_memseg *cur; + void *map_addr; + + cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx); + map_addr = RTE_PTR_ADD(cur_msl->base_va, + cur_idx * page_sz); + + if (alloc_seg(cur, map_addr, wa->socket, wa->hi, + msl_idx, cur_idx)) { + RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n", + need, i); + + /* if exact number wasn't requested, stop */ + if (!wa->exact) + goto out; + + /* clean up */ + for (j = start_idx; j < cur_idx; j++) { + struct rte_memseg *tmp; + struct rte_fbarray *arr = + &cur_msl->memseg_arr; + + tmp = rte_fbarray_get(arr, j); + rte_fbarray_set_free(arr, j); + + /* free_seg may attempt to create a file, which + * may fail. 
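Aside: alloc_seg_walk() above serializes whole allocate/free operations across processes by taking a blocking exclusive flock() on the hugepage directory itself, because creating and locking individual page files is not atomic. The same pattern in isolation, with an illustrative directory path:

/* Sketch of the "lock the whole hugepage directory" pattern used by
 * alloc_seg_walk()/free_seg_walk() above: a directory can be opened
 * O_RDONLY and flock()ed like any other fd, giving a coarse
 * cross-process mutex. The path is illustrative.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/file.h>
#include <unistd.h>

int
main(void)
{
	int dir_fd = open("/dev/hugepages", O_RDONLY);

	if (dir_fd < 0) {
		perror("open");
		return 1;
	}
	/* blocking write lock: waits until no other process holds it */
	if (flock(dir_fd, LOCK_EX) != 0) {
		perror("flock");
		close(dir_fd);
		return 1;
	}

	/* ... create/remove/lock individual hugepage files here ... */

	close(dir_fd);	/* releases the directory lock */
	return 0;
}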
+ */ + if (free_seg(tmp, wa->hi, msl_idx, j)) + RTE_LOG(DEBUG, EAL, "Cannot free page\n"); + } + /* clear the list */ + if (wa->ms) + memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs); + + if (dir_fd >= 0) + close(dir_fd); + return -1; + } + if (wa->ms) + wa->ms[i] = cur; + + rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx); + } +out: + wa->segs_allocated = i; + if (i > 0) + cur_msl->version++; + if (dir_fd >= 0) + close(dir_fd); + return 1; +} + +struct free_walk_param { + struct hugepage_info *hi; + struct rte_memseg *ms; +}; +static int +free_seg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *found_msl; + struct free_walk_param *wa = arg; + uintptr_t start_addr, end_addr; + int msl_idx, seg_idx, ret, dir_fd = -1; + + start_addr = (uintptr_t) msl->base_va; + end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz; + + if ((uintptr_t)wa->ms->addr < start_addr || + (uintptr_t)wa->ms->addr >= end_addr) + return 0; + + msl_idx = msl - mcfg->memsegs; + seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz; + + /* msl is const */ + found_msl = &mcfg->memsegs[msl_idx]; + + /* do not allow any page allocations during the time we're freeing, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + * + * during init, we already hold a write lock, so don't try to take out + * another one. + */ + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + found_msl->version++; + + rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); + + ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx); + + if (dir_fd >= 0) + close(dir_fd); + + if (ret < 0) + return -1; + + return 1; +} + +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact) +{ + int i, ret = -1; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + bool have_numa = false; + int oldpolicy; + struct bitmask *oldmask; +#endif + struct alloc_walk_param wa; + struct hugepage_info *hi = NULL; + + memset(&wa, 0, sizeof(wa)); + + /* dynamic allocation not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) { + if (page_sz == + internal_config.hugepage_info[i].hugepage_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n", + __func__); + return -1; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (check_numa()) { + oldmask = numa_allocate_nodemask(); + prepare_numa(&oldpolicy, oldmask, socket); + have_numa = true; + } +#endif + + wa.exact = exact; + wa.hi = hi; + wa.ms = ms; + wa.n_segs = n_segs; + wa.page_sz = page_sz; + wa.socket = socket; + wa.segs_allocated = 0; + + /* memalloc is locked, so it's safe to use thread-unsafe version */ + ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); + if (ret == 0) { + RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n", + 
__func__); + ret = -1; + } else if (ret > 0) { + ret = (int)wa.segs_allocated; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (have_numa) + restore_numa(&oldpolicy, oldmask); +#endif + return ret; +} + +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket) +{ + struct rte_memseg *ms; + if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0) + return NULL; + /* return pointer to newly allocated memseg */ + return ms; +} + +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) +{ + int seg, ret = 0; + + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (seg = 0; seg < n_segs; seg++) { + struct rte_memseg *cur = ms[seg]; + struct hugepage_info *hi = NULL; + struct free_walk_param wa; + int i, walk_res; + + /* if this page is marked as unfreeable, fail */ + if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n"); + ret = -1; + continue; + } + + memset(&wa, 0, sizeof(wa)); + + for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info); + i++) { + hi = &internal_config.hugepage_info[i]; + if (cur->hugepage_sz == hi->hugepage_sz) + break; + } + if (i == (int)RTE_DIM(internal_config.hugepage_info)) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + ret = -1; + continue; + } + + wa.ms = cur; + wa.hi = hi; + + /* memalloc is locked, so it's safe to use thread-unsafe version + */ + walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk, + &wa); + if (walk_res == 1) + continue; + if (walk_res == 0) + RTE_LOG(ERR, EAL, "Couldn't find memseg list\n"); + ret = -1; + } + return ret; +} + +int +eal_memalloc_free_seg(struct rte_memseg *ms) +{ + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + return eal_memalloc_free_seg_bulk(&ms, 1); +} + +static int +sync_chunk(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used, int start, int end) +{ + struct rte_fbarray *l_arr, *p_arr; + int i, ret, chunk_len, diff_len; + + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + /* we need to aggregate allocations/deallocations into bigger chunks, + * as we don't want to spam the user with per-page callbacks. + * + * to avoid any potential issues, we also want to trigger + * deallocation callbacks *before* we actually deallocate + * memory, so that the user application could wrap up its use + * before it goes away. + */ + + chunk_len = end - start; + + /* find how many contiguous pages we can map/unmap for this chunk */ + diff_len = used ? 
+ rte_fbarray_find_contig_free(l_arr, start) : + rte_fbarray_find_contig_used(l_arr, start); + + /* has to be at least one page */ + if (diff_len < 1) + return -1; + + diff_len = RTE_MIN(chunk_len, diff_len); + + /* if we are freeing memory, notify the application */ + if (!used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + start_va, len); + } + + for (i = 0; i < diff_len; i++) { + struct rte_memseg *p_ms, *l_ms; + int seg_idx = start + i; + + l_ms = rte_fbarray_get(l_arr, seg_idx); + p_ms = rte_fbarray_get(p_arr, seg_idx); + + if (l_ms == NULL || p_ms == NULL) + return -1; + + if (used) { + ret = alloc_seg(l_ms, p_ms->addr, + p_ms->socket_id, hi, + msl_idx, seg_idx); + if (ret < 0) + return -1; + rte_fbarray_set_used(l_arr, seg_idx); + } else { + ret = free_seg(l_ms, hi, msl_idx, seg_idx); + rte_fbarray_set_free(l_arr, seg_idx); + if (ret < 0) + return -1; + } + } + + /* if we just allocated memory, notify the application */ + if (used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + start_va, len); + } + + /* calculate how much we can advance until next chunk */ + diff_len = used ? + rte_fbarray_find_contig_used(l_arr, start) : + rte_fbarray_find_contig_free(l_arr, start); + ret = RTE_MIN(chunk_len, diff_len); + + return ret; +} + +static int +sync_status(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used) +{ + struct rte_fbarray *l_arr, *p_arr; + int p_idx, l_chunk_len, p_chunk_len, ret; + int start, end; + + /* this is a little bit tricky, but the basic idea is - walk both lists + * and spot any places where there are discrepancies. walking both lists + * and noting discrepancies in a single go is a hard problem, so we do + * it in two passes - first we spot any places where allocated segments + * mismatch (i.e. ensure that everything that's allocated in the primary + * is also allocated in the secondary), and then we do it by looking at + * free segments instead. + * + * we also need to aggregate changes into chunks, as we have to call + * callbacks per allocation, not per page. + */ + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + if (used) + p_idx = rte_fbarray_find_next_used(p_arr, 0); + else + p_idx = rte_fbarray_find_next_free(p_arr, 0); + + while (p_idx >= 0) { + int next_chunk_search_idx; + + if (used) { + p_chunk_len = rte_fbarray_find_contig_used(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_used(l_arr, + p_idx); + } else { + p_chunk_len = rte_fbarray_find_contig_free(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_free(l_arr, + p_idx); + } + /* best case scenario - no differences (or bigger, which will be + * fixed during next iteration), look for next chunk + */ + if (l_chunk_len >= p_chunk_len) { + next_chunk_search_idx = p_idx + p_chunk_len; + goto next_chunk; + } + + /* if both chunks start at the same point, skip parts we know + * are identical, and sync the rest. each call to sync_chunk + * will only sync contiguous segments, so we need to call this + * until we are sure there are no more differences in this + * chunk. 
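Aside: sync_status() above reconciles the secondary's local map with the primary's in contiguous runs rather than page by page, so that alloc/free callbacks fire once per chunk. A toy illustration of that run-based diff over two plain boolean maps; the arrays and the sync_run() stub are purely illustrative and not DPDK APIs:

/* Toy illustration of the chunked reconciliation done by sync_status()
 * above: find runs where the primary map marks pages used but the local
 * map does not, and handle each run with one call instead of one per page.
 */
#include <stdbool.h>
#include <stdio.h>

#define NPAGES 16

static void
sync_run(int start, int len)
{
	/* stands in for "map these pages locally + fire one callback" */
	printf("sync pages [%d, %d)\n", start, start + len);
}

int
main(void)
{
	bool primary[NPAGES] = { false };
	bool local[NPAGES] = { false };
	int i;

	/* primary has pages 2-5 and 9-10 allocated; local only has 3 */
	for (i = 2; i <= 5; i++)
		primary[i] = true;
	primary[9] = primary[10] = true;
	local[3] = true;

	for (i = 0; i < NPAGES; i++) {
		int start;

		if (!primary[i] || local[i])
			continue;
		/* extend the run of "used in primary, missing locally" */
		start = i;
		while (i < NPAGES && primary[i] && !local[i])
			i++;
		sync_run(start, i - start);
	}
	return 0;
}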
+ */ + start = p_idx + l_chunk_len; + end = p_idx + p_chunk_len; + do { + ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, + used, start, end); + start += ret; + } while (start < end && ret >= 0); + /* if ret is negative, something went wrong */ + if (ret < 0) + return -1; + + next_chunk_search_idx = p_idx + p_chunk_len; +next_chunk: + /* skip to end of this chunk */ + if (used) { + p_idx = rte_fbarray_find_next_used(p_arr, + next_chunk_search_idx); + } else { + p_idx = rte_fbarray_find_next_free(p_arr, + next_chunk_search_idx); + } + } + return 0; +} + +static int +sync_existing(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx) +{ + int ret, dir_fd; + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + */ + dir_fd = open(hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + + /* ensure all allocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); + if (ret < 0) + goto fail; + + /* ensure all unallocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); + if (ret < 0) + goto fail; + + /* update version number */ + local_msl->version = primary_msl->version; + + close(dir_fd); + + return 0; +fail: + close(dir_fd); + return -1; +} + +static int +sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + struct hugepage_info *hi = NULL; + unsigned int i; + int msl_idx; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + uint64_t cur_sz = + internal_config.hugepage_info[i].hugepage_sz; + uint64_t msl_sz = primary_msl->page_sz; + if (msl_sz == cur_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + return -1; + } + + /* if versions don't match, synchronize everything */ + if (local_msl->version != primary_msl->version && + sync_existing(primary_msl, local_msl, hi, msl_idx)) + return -1; + return 0; +} + + +int +eal_memalloc_sync_with_primary(void) +{ + /* nothing to be done in primary */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return 0; + + /* memalloc is locked, so it's safe to call thread-unsafe version */ + if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL)) + return -1; + return 0; +} + +static int +secondary_msl_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + char name[PATH_MAX]; + int msl_idx, ret; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + /* create distinct fbarrays for each secondary */ + snprintf(name, 
RTE_FBARRAY_NAME_LEN, "%s_%i", + primary_msl->memseg_arr.name, getpid()); + + ret = rte_fbarray_init(&local_msl->memseg_arr, name, + primary_msl->memseg_arr.len, + primary_msl->memseg_arr.elt_sz); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n"); + return -1; + } + local_msl->base_va = primary_msl->base_va; + + return 0; +} + +static int +secondary_lock_list_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i, len; + int msl_idx; + int *data; + + msl_idx = msl - mcfg->memsegs; + len = msl->memseg_arr.len; + + /* ensure we have space to store lock fd per each possible segment */ + data = malloc(sizeof(int) * len); + if (data == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n"); + return -1; + } + /* set all fd's as invalid */ + for (i = 0; i < len; i++) + data[i] = -1; + + lock_fds[msl_idx].fds = data; + lock_fds[msl_idx].len = len; + lock_fds[msl_idx].count = 0; + lock_fds[msl_idx].memseg_list_fd = -1; + + return 0; +} + +int +eal_memalloc_init(void) +{ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) + return -1; + + /* initialize all of the lock fd lists */ + if (internal_config.single_file_segments) + if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL)) + return -1; + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c new file mode 100644 index 00000000..0bf2aef3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -0,0 +1,2348 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2013 6WIND S.A. + */ + +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_memalloc.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" + +#define PFN_MASK_SIZE 8 + +/** + * @file + * Huge page mapping under linux + * + * To reserve a big contiguous amount of memory, we use the hugepage + * feature of linux. For that, we need to have hugetlbfs mounted. This + * code will create many files in this directory (one per page) and + * map them in virtual memory. For each page, we will retrieve its + * physical address and remap it in order to have a virtual contiguous + * zone as well as a physical contiguous zone. + */ + +static bool phys_addrs_available = true; + +#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" + +static void +test_phys_addrs_available(void) +{ + uint64_t tmp = 0; + phys_addr_t physaddr; + + if (!rte_eal_has_hugepages()) { + RTE_LOG(ERR, EAL, + "Started without hugepages support, physical addresses not available\n"); + phys_addrs_available = false; + return; + } + + physaddr = rte_mem_virt2phy(&tmp); + if (physaddr == RTE_BAD_PHYS_ADDR) { + if (rte_eal_iova_mode() == RTE_IOVA_PA) + RTE_LOG(ERR, EAL, + "Cannot obtain physical addresses: %s. 
" + "Only vfio will function.\n", + strerror(errno)); + phys_addrs_available = false; + } +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + int fd, retval; + uint64_t page, physaddr; + unsigned long virt_pfn; + int page_size; + off_t offset; + + /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */ + if (!phys_addrs_available) + return RTE_BAD_IOVA; + + /* standard page size */ + page_size = getpagesize(); + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_IOVA; + } + + virt_pfn = (unsigned long)virtaddr / page_size; + offset = sizeof(uint64_t) * virt_pfn; + if (lseek(fd, offset, SEEK_SET) == (off_t) -1) { + RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + close(fd); + return RTE_BAD_IOVA; + } + + retval = read(fd, &page, PFN_MASK_SIZE); + close(fd); + if (retval < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_IOVA; + } else if (retval != PFN_MASK_SIZE) { + RTE_LOG(ERR, EAL, "%s(): read %d bytes from /proc/self/pagemap " + "but expected %d:\n", + __func__, retval, PFN_MASK_SIZE); + return RTE_BAD_IOVA; + } + + /* + * the pfn (page frame number) are bits 0-54 (see + * pagemap.txt in linux Documentation) + */ + if ((page & 0x7fffffffffffffULL) == 0) + return RTE_BAD_IOVA; + + physaddr = ((page & 0x7fffffffffffffULL) * page_size) + + ((unsigned long)virtaddr % page_size); + + return physaddr; +} + +rte_iova_t +rte_mem_virt2iova(const void *virtaddr) +{ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + return (uintptr_t)virtaddr; + return rte_mem_virt2phy(virtaddr); +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value. We find + * it by browsing the /proc/self/pagemap special file. + */ +static int +find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va); + if (addr == RTE_BAD_PHYS_ADDR) + return -1; + hugepg_tbl[i].physaddr = addr; + } + return 0; +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value sequentially. + */ +static int +set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + static phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + hugepg_tbl[i].physaddr = addr; + addr += hugepg_tbl[i].size; + } + return 0; +} + +/* + * Check whether address-space layout randomization is enabled in + * the kernel. 
This is important for multi-process as it can prevent + * two processes mapping data to the same virtual address + * Returns: + * 0 - address space randomization disabled + * 1/2 - address space randomization enabled + * negative error code on error + */ +static int +aslr_enabled(void) +{ + char c; + int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY); + if (fd < 0) + return -errno; + retval = read(fd, &c, 1); + close(fd); + if (retval < 0) + return -errno; + if (retval == 0) + return -EIO; + switch (c) { + case '0' : return 0; + case '1' : return 1; + case '2' : return 2; + default: return -EINVAL; + } +} + +static sigjmp_buf huge_jmpenv; + +static void huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +/* Callback for numa library. */ +void numa_error(char *where) +{ + RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno)); +} +#endif + +/* + * Mmap all hugepages of hugepage table: it first open a file in + * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the + * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored + * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to + * map contiguous physical blocks in contiguous virtual blocks. + */ +static unsigned +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, + uint64_t *essential_memory __rte_unused) +{ + int fd; + unsigned i; + void *virtaddr; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int node_id = -1; + int essential_prev = 0; + int oldpolicy; + struct bitmask *oldmask = numa_allocate_nodemask(); + bool have_numa = true; + unsigned long maxnode = 0; + + /* Check if kernel supports NUMA. */ + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + have_numa = false; + } + + if (have_numa) { + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + if (get_mempolicy(&oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. 
" + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + oldpolicy = MPOL_DEFAULT; + } + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + if (internal_config.socket_mem[i]) + maxnode = i + 1; + } +#endif + + for (i = 0; i < hpi->num_pages[0]; i++) { + struct hugepage_file *hf = &hugepg_tbl[i]; + uint64_t hugepage_sz = hpi->hugepage_sz; + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) { + unsigned int j; + + for (j = 0; j < maxnode; j++) + if (essential_memory[j]) + break; + + if (j == maxnode) { + node_id = (node_id + 1) % maxnode; + while (!internal_config.socket_mem[node_id]) { + node_id++; + node_id %= maxnode; + } + essential_prev = 0; + } else { + node_id = j; + essential_prev = essential_memory[j]; + + if (essential_memory[j] < hugepage_sz) + essential_memory[j] = 0; + else + essential_memory[j] -= hugepage_sz; + } + + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + node_id); + numa_set_preferred(node_id); + } +#endif + + hf->file_id = i; + hf->size = hugepage_sz; + eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath), + hpi->hugedir, hf->file_id); + hf->filepath[sizeof(hf->filepath) - 1] = '\0'; + + /* try to create hugepage file */ + fd = open(hf->filepath, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + goto out; + } + + /* map the segment, and populate page tables, + * the kernel fills this segment with zeros. we don't care where + * this gets mapped - we already have contiguous memory areas + * ready for us to map into. + */ + virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (virtaddr == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, + strerror(errno)); + close(fd); + goto out; + } + + hf->orig_va = virtaddr; + + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " + "hugepages of size %u MB\n", + (unsigned int)(hugepage_sz / 0x100000)); + munmap(virtaddr, hugepage_sz); + close(fd); + unlink(hugepg_tbl[i].filepath); +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) + essential_memory[node_id] = + essential_prev; +#endif + goto out; + } + *(int *)virtaddr = 0; + + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", + __func__, strerror(errno)); + close(fd); + goto out; + } + + close(fd); + } + +out: +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) { + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", oldpolicy); + if (oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + } + numa_free_cpumask(oldmask); +#endif + return i; +} + +/* + * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge + * page. 
+ */ +static int +find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + int socket_id; + char *end, *nodestr; + unsigned i, hp_count = 0; + uint64_t virt_addr; + char buf[BUFSIZ]; + char hugedir_str[PATH_MAX]; + FILE *f; + + f = fopen("/proc/self/numa_maps", "r"); + if (f == NULL) { + RTE_LOG(NOTICE, EAL, "NUMA support not available" + " consider that all memory is in socket_id 0\n"); + return 0; + } + + snprintf(hugedir_str, sizeof(hugedir_str), + "%s/%s", hpi->hugedir, internal_config.hugefile_prefix); + + /* parse numa map */ + while (fgets(buf, sizeof(buf), f) != NULL) { + + /* ignore non huge page */ + if (strstr(buf, " huge ") == NULL && + strstr(buf, hugedir_str) == NULL) + continue; + + /* get zone addr */ + virt_addr = strtoull(buf, &end, 16); + if (virt_addr == 0 || end == buf) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* get node id (socket id) */ + nodestr = strstr(buf, " N"); + if (nodestr == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + nodestr += 2; + end = strstr(nodestr, "="); + if (end == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + end[0] = '\0'; + end = NULL; + + socket_id = strtoul(nodestr, &end, 0); + if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* if we find this page in our mappings, set socket_id */ + for (i = 0; i < hpi->num_pages[0]; i++) { + void *va = (void *)(unsigned long)virt_addr; + if (hugepg_tbl[i].orig_va == va) { + hugepg_tbl[i].socket_id = socket_id; + hp_count++; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + RTE_LOG(DEBUG, EAL, + "Hugepage %s is on socket %d\n", + hugepg_tbl[i].filepath, socket_id); +#endif + } + } + } + + if (hp_count < hpi->num_pages[0]) + goto error; + + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +static int +cmp_physaddr(const void *a, const void *b) +{ +#ifndef RTE_ARCH_PPC_64 + const struct hugepage_file *p1 = a; + const struct hugepage_file *p2 = b; +#else + /* PowerPC needs memory sorted in reverse order from x86 */ + const struct hugepage_file *p1 = b; + const struct hugepage_file *p2 = a; +#endif + if (p1->physaddr < p2->physaddr) + return -1; + else if (p1->physaddr > p2->physaddr) + return 1; + else + return 0; +} + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + void *retval; + int fd; + + /* if no shared files mode is used, create anonymous memory instead */ + if (internal_config.no_shconf) { + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (retval == MAP_FAILED) + return NULL; + return retval; + } + + fd = open(filename, O_CREAT | O_RDWR, 0666); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (retval == MAP_FAILED) + return NULL; + return retval; +} + +/* + * this copies *active* hugepages from one hugepage table to another. + * destination is typically the shared memory. 
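Aside: create_shared_memory() above is the classic file-backed shared-mapping recipe: open or create the file, ftruncate() it to the desired size, then mmap() it MAP_SHARED so every process mapping the same file sees the same bytes. The recipe in isolation, with an illustrative path and size:

/* Stand-alone sketch of the create_shared_memory() recipe above: a regular
 * file sized with ftruncate() and mapped MAP_SHARED acts as memory shared by
 * every process that maps it. Path and size are illustrative.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	const size_t mem_size = 4096;
	const char *path = "/tmp/hugepage_map_demo";
	void *area;
	int fd;

	fd = open(path, O_CREAT | O_RDWR, 0666);
	if (fd < 0)
		return 1;
	if (ftruncate(fd, mem_size) < 0) {	/* give the file a size */
		close(fd);
		return 1;
	}
	area = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED,
			fd, 0);
	close(fd);	/* the mapping stays valid after close */
	if (area == MAP_FAILED)
		return 1;

	strcpy((char *)area, "visible to every process mapping this file");
	printf("%s\n", (char *)area);

	munmap(area, mem_size);
	unlink(path);
	return 0;
}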
+ */ +static int +copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, + const struct hugepage_file * src, int src_size) +{ + int src_pos, dst_pos = 0; + + for (src_pos = 0; src_pos < src_size; src_pos++) { + if (src[src_pos].orig_va != NULL) { + /* error on overflow attempt */ + if (dst_pos == dest_size) + return -1; + memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file)); + dst_pos++; + } + } + return 0; +} + +static int +unlink_hugepage_files(struct hugepage_file *hugepg_tbl, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += + internal_config.hugepage_info[size].num_pages[socket]; + + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + if (hp->orig_va != NULL && unlink(hp->filepath)) { + RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + } + } + return 0; +} + +/* + * unmaps hugepages that are not going to be used. since we originally allocate + * ALL hugepages (not just those we need), additional unmapping needs to be done. + */ +static int +unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, + struct hugepage_info *hpi, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += internal_config.hugepage_info[size].num_pages[socket]; + + for (size = 0; size < num_hp_info; size++) { + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + unsigned pages_found = 0; + + /* traverse until we have unmapped all the unused pages */ + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + /* find a page that matches the criteria */ + if ((hp->size == hpi[size].hugepage_sz) && + (hp->socket_id == (int) socket)) { + + /* if we skipped enough pages, unmap the rest */ + if (pages_found == hpi[size].num_pages[socket]) { + uint64_t unmap_len; + + unmap_len = hp->size; + + /* get start addr and len of the remaining segment */ + munmap(hp->orig_va, + (size_t)unmap_len); + + hp->orig_va = NULL; + if (unlink(hp->filepath) == -1) { + RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + return -1; + } + } else { + /* lock the page and skip */ + pages_found++; + } + + } /* match page */ + } /* foreach page */ + } /* foreach socket */ + } /* foreach pagesize */ + + return 0; +} + +static int +remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int cur_page, seg_len; + unsigned int msl_idx; + int ms_idx; + uint64_t page_sz; + size_t memseg_len; + int socket_id; + + page_sz = hugepages[seg_start].size; + socket_id = hugepages[seg_start].socket_id; + seg_len = seg_end - seg_start; + + RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20ULL, socket_id); + + /* find free space in memseg lists */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + bool empty; + msl = &mcfg->memsegs[msl_idx]; + arr = &msl->memseg_arr; + + if (msl->page_sz != page_sz) + continue; + if (msl->socket_id != socket_id) + continue; + + /* leave space for a 
hole if array is not empty */ + empty = arr->count == 0; + ms_idx = rte_fbarray_find_next_n_free(arr, 0, + seg_len + (empty ? 0 : 1)); + + /* memseg list is full? */ + if (ms_idx < 0) + continue; + + /* leave some space between memsegs, they are not IOVA + * contiguous, so they shouldn't be VA contiguous either. + */ + if (!empty) + ms_idx++; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), + RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); + return -1; + } + +#ifdef RTE_ARCH_PPC64 + /* for PPC64 we go through the list backwards */ + for (cur_page = seg_end - 1; cur_page >= seg_start; + cur_page--, ms_idx++) { +#else + for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) { +#endif + struct hugepage_file *hfile = &hugepages[cur_page]; + struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx); + void *addr; + int fd; + + fd = open(hfile->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open '%s': %s\n", + hfile->filepath, strerror(errno)); + return -1; + } + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + memseg_len = (size_t)page_sz; + addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len); + + /* we know this address is already mmapped by memseg list, so + * using MAP_FIXED here is safe + */ + addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + + /* we have a new address, so unmap previous one */ +#ifndef RTE_ARCH_64 + /* in 32-bit legacy mode, we have already unmapped the page */ + if (!internal_config.legacy_mem) + munmap(hfile->orig_va, page_sz); +#else + munmap(hfile->orig_va, page_sz); +#endif + + hfile->orig_va = NULL; + hfile->final_va = addr; + + /* rewrite physical addresses in IOVA as VA mode */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + hfile->physaddr = (uintptr_t)addr; + + /* set up memseg data */ + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->len = memseg_len; + ms->iova = hfile->physaddr; + ms->socket_id = hfile->socket_id; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + + rte_fbarray_set_used(arr, ms_idx); + + close(fd); + } + RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20, socket_id); + return 0; +} + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +static int +free_memseg_list(struct rte_memseg_list *msl) +{ + if (rte_fbarray_destroy(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); + return -1; + } + memset(msl, 0, sizeof(*msl)); + return 0; +} + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char 
name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + + return 0; +} + +/* + * Our VA space is not preallocated yet, so preallocate it here. We need to know + * how many segments there are in order to map all pages into one address space, + * and leave appropriate holes between segments so that rte_malloc does not + * concatenate them into one big segment. + * + * we also need to unmap original pages to free up address space. + */ +static int __rte_unused +prealloc_segments(struct hugepage_file *hugepages, int n_pages) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int cur_page, seg_start_page, end_seg, new_memseg; + unsigned int hpi_idx, socket, i; + int n_contig_segs, n_segs; + int msl_idx; + + /* before we preallocate segments, we need to free up our VA space. + * we're not removing files, and we already have information about + * PA-contiguousness, so it is safe to unmap everything. + */ + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *hpi = &hugepages[cur_page]; + munmap(hpi->orig_va, hpi->size); + hpi->orig_va = NULL; + } + + /* we cannot know how many page sizes and sockets we have discovered, so + * loop over all of them + */ + for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t page_sz = + internal_config.hugepage_info[hpi_idx].hugepage_sz; + + for (i = 0; i < rte_socket_count(); i++) { + struct rte_memseg_list *msl; + + socket = rte_socket_id_by_idx(i); + n_contig_segs = 0; + n_segs = 0; + seg_start_page = -1; + + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + int prev_seg_start_page = -1; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : + &hugepages[cur_page - 1]; + + new_memseg = 0; + end_seg = 0; + + if (cur->size == 0) + end_seg = 1; + else if (cur->socket_id != (int) socket) + end_seg = 1; + else if (cur->size != page_sz) + end_seg = 1; + else if (cur_page == 0) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start + * from higher address to lower address. Here, + * physical addresses are in descending order. 
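Aside: alloc_va_space() above (via eal_get_virtual_area()) reserves a large contiguous address range up front, and remap_segment() later maps hugepages into it with MAP_FIXED. The reservation trick on its own, using a plain PROT_NONE anonymous mapping as the placeholder; sizes are illustrative and this is a simplification of what the EAL helper actually does:

/* Simplified sketch of "reserve VA space first, map into it later" as done
 * by alloc_va_space()/remap_segment() above: a PROT_NONE anonymous mapping
 * pins down an address range, and MAP_FIXED later replaces part of it with
 * real memory. Sizes are illustrative; eal_get_virtual_area adds alignment
 * and other options.
 */
#include <stdio.h>
#include <sys/mman.h>

int
main(void)
{
	const size_t reserve_sz = 16 << 20;	/* 16 MiB of address space */
	const size_t page_sz = 4096;
	void *base, *seg;

	/* reserve: no backing memory is committed for PROT_NONE */
	base = mmap(NULL, reserve_sz, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (base == MAP_FAILED)
		return 1;

	/* later: place one "segment" at a known offset inside the range */
	seg = mmap((char *)base + 4 * page_sz, page_sz,
			PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	if (seg == MAP_FAILED)
		return 1;

	*(int *)seg = 42;	/* usable memory at a predetermined address */
	printf("segment mapped at %p inside reservation %p\n", seg, base);

	munmap(base, reserve_sz);	/* drops reservation and segment */
	return 0;
}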
+ */ + else if ((prev->physaddr - cur->physaddr) != + cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != + cur->size) + new_memseg = 1; +#endif + if (new_memseg) { + /* if we're already inside a segment, + * new segment means end of current one + */ + if (seg_start_page != -1) { + end_seg = 1; + prev_seg_start_page = + seg_start_page; + } + seg_start_page = cur_page; + } + + if (end_seg) { + if (prev_seg_start_page != -1) { + /* we've found a new segment */ + n_contig_segs++; + n_segs += cur_page - + prev_seg_start_page; + } else if (seg_start_page != -1) { + /* we didn't find new segment, + * but did end current one + */ + n_contig_segs++; + n_segs += cur_page - + seg_start_page; + seg_start_page = -1; + continue; + } else { + /* we're skipping this page */ + continue; + } + } + /* segment continues */ + } + /* check if we missed last segment */ + if (seg_start_page != -1) { + n_contig_segs++; + n_segs += cur_page - seg_start_page; + } + + /* if no segments were found, do not preallocate */ + if (n_segs == 0) + continue; + + /* we now have total number of pages that we will + * allocate for this segment list. add separator pages + * to the total count, and preallocate VA space. + */ + n_segs += n_contig_segs - 1; + + /* now, preallocate VA space for these segments */ + + /* first, find suitable memseg list for this */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; + msl_idx++) { + msl = &mcfg->memsegs[msl_idx]; + + if (msl->base_va != NULL) + continue; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + /* now, allocate fbarray itself */ + if (alloc_memseg_list(msl, page_sz, n_segs, socket, + msl_idx) < 0) + return -1; + + /* finally, allocate VA space */ + if (alloc_va_space(msl) < 0) + return -1; + } + } + return 0; +} + +/* + * We cannot reallocate memseg lists on the fly because PPC64 stores pages + * backwards, therefore we have to process the entire memseg first before + * remapping it into memseg list VA space. + */ +static int +remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages) +{ + int cur_page, seg_start_page, new_memseg, ret; + + seg_start_page = 0; + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + + new_memseg = 0; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1]; + + /* if size is zero, no more pages left */ + if (cur->size == 0) + break; + + if (cur_page == 0) + new_memseg = 1; + else if (cur->socket_id != prev->socket_id) + new_memseg = 1; + else if (cur->size != prev->size) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start from higher + * address to lower address. Here, physical addresses are in + * descending order. 
+ */ + else if ((prev->physaddr - cur->physaddr) != cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != cur->size) + new_memseg = 1; +#endif + + if (new_memseg) { + /* if this isn't the first time, remap segment */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + /* remember where we started */ + seg_start_page = cur_page; + } + /* continuation of previous memseg */ + } + /* we were stopped, but we didn't remap the last segment, do it now */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + return 0; +} + +static inline uint64_t +get_socket_mem_size(int socket) +{ + uint64_t size = 0; + unsigned i; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++){ + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + size += hpi->hugepage_sz * hpi->num_pages[socket]; + } + + return size; +} + +/* + * This function is a NUMA-aware equivalent of calc_num_pages. + * It takes in the list of hugepage sizes and the + * number of pages thereof, and calculates the best number of + * pages of each size to fulfill the request for ram + */ +static int +calc_num_pages_per_socket(uint64_t * memory, + struct hugepage_info *hp_info, + struct hugepage_info *hp_used, + unsigned num_hp_info) +{ + unsigned socket, j, i = 0; + unsigned requested, available; + int total_num_pages = 0; + uint64_t remaining_mem, cur_mem; + uint64_t total_mem = internal_config.memory; + + if (num_hp_info == 0) + return -1; + + /* if specific memory amounts per socket weren't requested */ + if (internal_config.force_sockets == 0) { + size_t total_size; +#ifdef RTE_ARCH_64 + int cpu_per_socket[RTE_MAX_NUMA_NODES]; + size_t default_size; + unsigned lcore_id; + + /* Compute number of cores per socket */ + memset(cpu_per_socket, 0, sizeof(cpu_per_socket)); + RTE_LCORE_FOREACH(lcore_id) { + cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; + } + + /* + * Automatically spread requested memory amongst detected sockets according + * to number of cores from cpu mask present on each socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + + /* Set memory amount per socket */ + default_size = (internal_config.memory * cpu_per_socket[socket]) + / rte_lcore_count(); + + /* Limit to maximum available memory on socket */ + default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); + + /* Update sizes */ + memory[socket] = default_size; + total_size -= default_size; + } + + /* + * If some memory is remaining, try to allocate it by getting all + * available memory from sockets, one after the other + */ + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + /* take whatever is available */ + default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], + total_size); + + /* Update sizes */ + memory[socket] += default_size; + total_size -= default_size; + } +#else + /* in 32-bit mode, allocate all of the memory only on master + * lcore socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; + socket++) { + struct rte_config *cfg = rte_eal_get_configuration(); + unsigned int master_lcore_socket; + + master_lcore_socket = + rte_lcore_to_socket_id(cfg->master_lcore); + + if (master_lcore_socket != socket) + continue; + + /* Update sizes */ + memory[socket] = total_size; + break; + } +#endif + } + + 
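For illustration only (not part of the diff): when no --socket-mem is given on 64-bit, calc_num_pages_per_socket() first splits the requested memory across sockets in proportion to how many lcores each socket contributes, capped by what the socket actually has, and then hands any remainder greedily to sockets with spare hugepage memory. A compact sketch of that two-pass split with made-up numbers:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SOCKETS 4

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* Two-pass split used when no --socket-mem was given (64-bit path):
 * pass 1 spreads the request proportionally to lcores per socket,
 * capped by availability; pass 2 distributes whatever is left.
 * All figures below are invented for the example.
 */
int
main(void)
{
	uint64_t request = 4096;			/* MB requested in total */
	unsigned cpus[MAX_SOCKETS]   = { 6, 2, 0, 0 };	/* lcores per socket */
	uint64_t avail[MAX_SOCKETS]  = { 2048, 4096, 0, 0 }; /* MB available */
	uint64_t memory[MAX_SOCKETS] = { 0 };
	unsigned total_lcores = 8;
	uint64_t left = request;

	for (int s = 0; s < MAX_SOCKETS && left != 0; s++) {
		uint64_t share = request * cpus[s] / total_lcores;

		share = min_u64(share, avail[s]);
		memory[s] = share;
		left -= share;
	}
	for (int s = 0; s < MAX_SOCKETS && left != 0; s++) {
		uint64_t extra = min_u64(avail[s] - memory[s], left);

		memory[s] += extra;
		left -= extra;
	}
	for (int s = 0; s < MAX_SOCKETS; s++)
		printf("socket %d: %" PRIu64 " MB\n", s, memory[s]);
	return 0;
}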
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { + /* skips if the memory on specific socket wasn't requested */ + for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ + strlcpy(hp_used[i].hugedir, hp_info[i].hugedir, + sizeof(hp_used[i].hugedir)); + hp_used[i].num_pages[socket] = RTE_MIN( + memory[socket] / hp_info[i].hugepage_sz, + hp_info[i].num_pages[socket]); + + cur_mem = hp_used[i].num_pages[socket] * + hp_used[i].hugepage_sz; + + memory[socket] -= cur_mem; + total_mem -= cur_mem; + + total_num_pages += hp_used[i].num_pages[socket]; + + /* check if we have met all memory requests */ + if (memory[socket] == 0) + break; + + /* check if we have any more pages left at this size, if so + * move on to next size */ + if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket]) + continue; + /* At this point we know that there are more pages available that are + * bigger than the memory we want, so lets see if we can get enough + * from other page sizes. + */ + remaining_mem = 0; + for (j = i+1; j < num_hp_info; j++) + remaining_mem += hp_info[j].hugepage_sz * + hp_info[j].num_pages[socket]; + + /* is there enough other memory, if not allocate another page and quit */ + if (remaining_mem < memory[socket]){ + cur_mem = RTE_MIN(memory[socket], + hp_info[i].hugepage_sz); + memory[socket] -= cur_mem; + total_mem -= cur_mem; + hp_used[i].num_pages[socket]++; + total_num_pages++; + break; /* we are done with this socket*/ + } + } + /* if we didn't satisfy all memory requirements per socket */ + if (memory[socket] > 0 && + internal_config.socket_mem[socket] != 0) { + /* to prevent icc errors */ + requested = (unsigned) (internal_config.socket_mem[socket] / + 0x100000); + available = requested - + ((unsigned) (memory[socket] / 0x100000)); + RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " + "Requested: %uMB, available: %uMB\n", socket, + requested, available); + return -1; + } + } + + /* if we didn't satisfy total memory requirements */ + if (total_mem > 0) { + requested = (unsigned) (internal_config.memory / 0x100000); + available = requested - (unsigned) (total_mem / 0x100000); + RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," + " available: %uMB\n", requested, available); + return -1; + } + return total_num_pages; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +/* + * Prepare physical memory mapping: fill configuration structure with + * these infos, return 0 on success. + * 1. map N huge pages in separate files in hugetlbfs + * 2. find associated physical addr + * 3. find associated NUMA socket ID + * 4. 
sort all huge pages by physical address + * 5. remap these N huge pages in the correct order + * 6. unmap the first mapping + * 7. fill memsegs in configuration with contiguous zones + */ +static int +eal_legacy_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + struct rte_fbarray *arr; + struct rte_memseg *ms; + + uint64_t memory[RTE_MAX_NUMA_NODES]; + + unsigned hp_offset; + int i, j; + int nr_hugefiles, nr_hugepages = 0; + void *addr; + + test_phys_addrs_available(); + + memset(used_hp, 0, sizeof(used_hp)); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + struct rte_memseg_list *msl; + uint64_t page_sz; + int n_segs, cur_seg; + + /* nohuge mode is legacy mode */ + internal_config.legacy_mem = 1; + + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + page_sz = RTE_PGSIZE_4K; + n_segs = internal_config.memory / page_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } + msl->base_va = addr; + msl->page_sz = page_sz; + msl->socket_id = 0; + + /* populate memsegs. each memseg is one page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->socket_id = 0; + ms->len = page_sz; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, (size_t)page_sz); + } + return 0; + } + + /* allocate single hugetlbfs file on the master numa node */ + if (internal_config.single_file_segments) { + struct hugepage_info *hpi = NULL; + struct rte_memseg_list *msl; + size_t vma_len; + int n_segs, cur_seg; + char filepath[PATH_MAX]; + unsigned node_id = 0; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int oldpolicy; + struct bitmask *oldmask = numa_allocate_nodemask(); + bool have_numa = true; + + node_id = rte_lcore_to_socket_id(rte_get_master_lcore()); + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + have_numa = false; + } else { + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + if (get_mempolicy(&oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. 
" + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + oldpolicy = MPOL_DEFAULT; + } + + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + node_id); + numa_set_preferred(node_id); + } +#endif + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) + internal_config.memory = eal_get_hugepage_mem_size(); + + /* choose optimal hugetlbfs for the mapping */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + hpi = &internal_config.hugepage_info[i]; + if (hpi->hugepage_sz > internal_config.memory || + hpi->num_pages[0] * hpi->hugepage_sz < + internal_config.memory) + hpi = NULL; + } + + if (hpi == NULL) { + RTE_LOG(ERR, EAL, + "Cannot find a single hugetlbfs with %"PRIu64" MB free mem.\n", + internal_config.memory); + return -1; + } + + eal_get_hugefile_path(filepath, sizeof(filepath), hpi->hugedir, 0); + filepath[sizeof(filepath) - 1] = '\0'; + + /* try to create hugepage file */ + int fd = open(filepath, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + return -1; + } + + /* length needs to be manually aligned for future munmap */ + vma_len = RTE_ALIGN_CEIL(internal_config.memory, hpi->hugepage_sz); + addr = eal_get_virtual_area(NULL, &vma_len, hpi->hugepage_sz, 0, 0); + if (addr == NULL) { + RTE_LOG(ERR, EAL, + "Cannot reserve virtually-contiguous %"PRIu64" MB.\n", + internal_config.memory); + return -1; + } + + addr = mmap(addr, vma_len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (have_numa) { + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", oldpolicy); + if (oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + } + numa_free_cpumask(oldmask); +#endif + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + n_segs = vma_len / hpi->hugepage_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "singlefileseg", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + msl->base_va = addr; + msl->page_sz = hpi->hugepage_sz; + msl->socket_id = node_id; + + /* populate memsegs. each memseg is one page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = hpi->hugepage_sz; + ms->socket_id = node_id; + ms->len = hpi->hugepage_sz; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, (size_t)hpi->hugepage_sz); + } + + return 0; + } + + /* calculate total number of hugepages available. at this point we haven't + * yet started sorting them so they all are on socket 0 */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ + used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; + + nr_hugepages += internal_config.hugepage_info[i].num_pages[0]; + } + + /* + * allocate a memory area for hugepage table. + * this isn't shared memory yet. 
due to the fact that we need some + * processing done on these pages, shared memory will be created + * at a later stage. + */ + tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file)); + if (tmp_hp == NULL) + goto fail; + + memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file)); + + hp_offset = 0; /* where we start the current page size entries */ + + huge_register_sigbus(); + + /* make a copy of socket_mem, needed for balanced allocation. */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* map all hugepages and sort them */ + for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ + unsigned pages_old, pages_new; + struct hugepage_info *hpi; + + /* + * we don't yet mark hugepages as used at this stage, so + * we just map all hugepages available to the system + * all hugepages are still located on socket 0 + */ + hpi = &internal_config.hugepage_info[i]; + + if (hpi->num_pages[0] == 0) + continue; + + /* map all hugepages available */ + pages_old = hpi->num_pages[0]; + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory); + if (pages_new < pages_old) { + RTE_LOG(DEBUG, EAL, + "%d not %d hugepages of size %u MB allocated\n", + pages_new, pages_old, + (unsigned)(hpi->hugepage_sz / 0x100000)); + + int pages = pages_old - pages_new; + + nr_hugepages -= pages; + hpi->num_pages[0] = pages_new; + if (pages_new == 0) + continue; + } + + if (phys_addrs_available && + rte_eal_iova_mode() != RTE_IOVA_VA) { + /* find physical addresses for each hugepage */ + if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to find phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } else { + /* set physical addresses for each hugepage */ + if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to set phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } + + if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + qsort(&tmp_hp[hp_offset], hpi->num_pages[0], + sizeof(struct hugepage_file), cmp_physaddr); + + /* we have processed a num of hugepages of this size, so inc offset */ + hp_offset += hpi->num_pages[0]; + } + + huge_recover_sigbus(); + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) + internal_config.memory = eal_get_hugepage_mem_size(); + + nr_hugefiles = nr_hugepages; + + + /* clean out the numbers of pages */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + internal_config.hugepage_info[i].num_pages[j] = 0; + + /* get hugepages for each socket */ + for (i = 0; i < nr_hugefiles; i++) { + int socket = tmp_hp[i].socket_id; + + /* find a hugepage info with right size and increment num_pages */ + const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, + (int)internal_config.num_hugepage_sizes); + for (j = 0; j < nb_hpsizes; j++) { + if (tmp_hp[i].size == + internal_config.hugepage_info[j].hugepage_sz) { + internal_config.hugepage_info[j].num_pages[socket]++; + } + } + } + + /* make a copy of socket_mem, needed for number of pages calculation */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* calculate final number of pages */ + nr_hugepages = calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + 
internal_config.num_hugepage_sizes); + + /* error if not enough memory available */ + if (nr_hugepages < 0) + goto fail; + + /* reporting in! */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + if (used_hp[i].num_pages[j] > 0) { + RTE_LOG(DEBUG, EAL, + "Requesting %u pages of size %uMB" + " from socket %i\n", + used_hp[i].num_pages[j], + (unsigned) + (used_hp[i].hugepage_sz / 0x100000), + j); + } + } + } + + /* create shared memory */ + hugepage = create_shared_memory(eal_hugepage_data_path(), + nr_hugefiles * sizeof(struct hugepage_file)); + + if (hugepage == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + goto fail; + } + memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file)); + + /* + * unmap pages that we won't need (looks at used_hp). + * also, sets final_va to NULL on pages that were unmapped. + */ + if (unmap_unneeded_hugepages(tmp_hp, used_hp, + internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n"); + goto fail; + } + + /* + * copy stuff from malloc'd hugepage* to the actual shared memory. + * this procedure only copies those hugepages that have orig_va + * not NULL. has overflow protection. + */ + if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, + tmp_hp, nr_hugefiles) < 0) { + RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n"); + goto fail; + } + +#ifndef RTE_ARCH_64 + /* for legacy 32-bit mode, we did not preallocate VA space, so do it */ + if (internal_config.legacy_mem && + prealloc_segments(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n"); + goto fail; + } +#endif + + /* remap all pages we do need into memseg list VA space, so that those + * pages become first-class citizens in DPDK memory subsystem + */ + if (remap_needed_hugepages(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n"); + goto fail; + } + + /* free the hugepage backing files */ + if (internal_config.hugepage_unlink && + unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); + goto fail; + } + + /* free the temporary hugepage table */ + free(tmp_hp); + tmp_hp = NULL; + + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + + /* we're not going to allocate more pages, so release VA space for + * unused memseg lists + */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + size_t mem_sz; + + /* skip inactive lists */ + if (msl->base_va == NULL) + continue; + /* skip lists where there is at least one page allocated */ + if (msl->memseg_arr.count > 0) + continue; + /* this is an unused list, deallocate it */ + mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len; + munmap(msl->base_va, mem_sz); + msl->base_va = NULL; + + /* destroy backing fbarray */ + rte_fbarray_destroy(&msl->memseg_arr); + } + + return 0; + +fail: + huge_recover_sigbus(); + free(tmp_hp); + if (hugepage != NULL) + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + + return -1; +} + +static int __rte_unused +hugepage_count_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct hugepage_info *hpi = arg; + + if (msl->page_sz != hpi->hugepage_sz) + return 0; + + hpi->num_pages[msl->socket_id] += msl->memseg_arr.len; + return 0; +} + +static int +limits_callback(int socket_id, size_t cur_limit, size_t new_len) +{ + 
RTE_SET_USED(socket_id); + RTE_SET_USED(cur_limit); + RTE_SET_USED(new_len); + return -1; +} + +static int +eal_hugepage_init(void) +{ + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + uint64_t memory[RTE_MAX_NUMA_NODES]; + int hp_sz_idx, socket_id; + + test_phys_addrs_available(); + + memset(used_hp, 0, sizeof(used_hp)); + + for (hp_sz_idx = 0; + hp_sz_idx < (int) internal_config.num_hugepage_sizes; + hp_sz_idx++) { +#ifndef RTE_ARCH_64 + struct hugepage_info dummy; + unsigned int i; +#endif + /* also initialize used_hp hugepage sizes in used_hp */ + struct hugepage_info *hpi; + hpi = &internal_config.hugepage_info[hp_sz_idx]; + used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz; + +#ifndef RTE_ARCH_64 + /* for 32-bit, limit number of pages on socket to whatever we've + * preallocated, as we cannot allocate more. + */ + memset(&dummy, 0, sizeof(dummy)); + dummy.hugepage_sz = hpi->hugepage_sz; + if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0) + return -1; + + for (i = 0; i < RTE_DIM(dummy.num_pages); i++) { + hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i], + dummy.num_pages[i]); + } +#endif + } + + /* make a copy of socket_mem, needed for balanced allocation. */ + for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++) + memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx]; + + /* calculate final number of pages */ + if (calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes) < 0) + return -1; + + for (hp_sz_idx = 0; + hp_sz_idx < (int)internal_config.num_hugepage_sizes; + hp_sz_idx++) { + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; + socket_id++) { + struct rte_memseg **pages; + struct hugepage_info *hpi = &used_hp[hp_sz_idx]; + unsigned int num_pages = hpi->num_pages[socket_id]; + int num_pages_alloc, i; + + if (num_pages == 0) + continue; + + pages = malloc(sizeof(*pages) * num_pages); + + RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n", + num_pages, hpi->hugepage_sz >> 20, socket_id); + + num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages, + num_pages, hpi->hugepage_sz, + socket_id, true); + if (num_pages_alloc < 0) { + free(pages); + return -1; + } + + /* mark preallocated pages as unfreeable */ + for (i = 0; i < num_pages_alloc; i++) { + struct rte_memseg *ms = pages[i]; + ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; + } + free(pages); + } + } + /* if socket limits were specified, set them */ + if (internal_config.force_socket_limits) { + unsigned int i; + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { + uint64_t limit = internal_config.socket_limit[i]; + if (limit == 0) + continue; + if (rte_mem_alloc_validator_register("socket-limit", + limits_callback, i, limit)) + RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n"); + } + } + return 0; +} + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +getFileSize(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* + * This creates the memory mappings in the secondary process to match that of + * the server process. 
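For illustration only (not part of the diff): when --socket-limit is in force, eal_hugepage_init() registers limits_callback() through rte_mem_alloc_validator_register(); the EAL consults such a validator before growing a socket's heap beyond the registered limit, and a negative return denies the growth — which is why the EAL's own callback can return -1 unconditionally. A hedged stand-alone sketch of a validator with the same signature, invoked directly here instead of being registered:

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-in for the "socket-limit" validator above.  A
 * negative return denies the allocation.  In a real application this
 * function would be passed to rte_mem_alloc_validator_register()
 * rather than being called by hand.
 */
static int
socket_limit_cb(int socket_id, size_t cur_limit, size_t new_len)
{
	if (new_len <= cur_limit)
		return 0;	/* still within the configured limit */

	fprintf(stderr,
		"socket %d: growing to %zu bytes exceeds limit %zu, denying\n",
		socket_id, new_len, cur_limit);
	return -1;
}

int
main(void)
{
	/* simulate the allocator asking to grow past a 1 GB limit */
	size_t limit = (size_t)1 << 30;

	printf("within limit: %d\n", socket_limit_cb(0, limit, limit / 2));
	printf("over limit:   %d\n", socket_limit_cb(0, limit, limit + 1));
	return 0;
}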
It goes through each memory segment in the DPDK runtime + * configuration and finds the hugepages which form that segment, mapping them + * in order to form a contiguous block in the virtual memory space + */ +static int +eal_legacy_hugepage_attach(void) +{ + struct hugepage_file *hp = NULL; + unsigned int num_hp = 0; + unsigned int i = 0; + unsigned int cur_seg; + off_t size = 0; + int fd, fd_hugepage = -1; + + if (aslr_enabled() > 0) { + RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " + "(ASLR) is enabled in the kernel.\n"); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory " + "into secondary processes\n"); + } + + test_phys_addrs_available(); + + fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + eal_hugepage_data_path()); + goto error; + } + + size = getFileSize(fd_hugepage); + hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); + if (hp == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not mmap %s\n", + eal_hugepage_data_path()); + goto error; + } + + num_hp = size / sizeof(struct hugepage_file); + RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); + + /* map all segments into memory to make sure we get the addrs. the + * segments themselves are already in memseg list (which is shared and + * has its VA space already preallocated), so we just need to map + * everything into correct addresses. + */ + for (i = 0; i < num_hp; i++) { + struct hugepage_file *hf = &hp[i]; + size_t map_sz = hf->size; + void *map_addr = hf->final_va; + + /* if size is zero, no more pages left */ + if (map_sz == 0) + break; + + fd = open(hf->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s: %s\n", + hf->filepath, strerror(errno)); + goto error; + } + + map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (map_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not map %s: %s\n", + hf->filepath, strerror(errno)); + close(fd); + goto error; + } + + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", + __func__, strerror(errno)); + close(fd); + goto error; + } + + close(fd); + } + /* unmap the hugepage config file, since we are done using it */ + munmap(hp, size); + close(fd_hugepage); + return 0; + +error: + /* map all segments into memory to make sure we get the addrs */ + cur_seg = 0; + for (cur_seg = 0; cur_seg < i; cur_seg++) { + struct hugepage_file *hf = &hp[i]; + size_t map_sz = hf->size; + void *map_addr = hf->final_va; + + munmap(map_addr, map_sz); + } + if (hp != NULL && hp != MAP_FAILED) + munmap(hp, size); + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} + +static int +eal_hugepage_attach(void) +{ + if (eal_memalloc_sync_with_primary()) { + RTE_LOG(ERR, EAL, "Could not map memory from primary process\n"); + if (aslr_enabled() > 0) + RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n"); + return -1; + } + return 0; +} + +int +rte_eal_hugepage_init(void) +{ + return internal_config.legacy_mem ? + eal_legacy_hugepage_init() : + eal_hugepage_init(); +} + +int +rte_eal_hugepage_attach(void) +{ + return internal_config.legacy_mem ? 
+ eal_legacy_hugepage_attach() : + eal_hugepage_attach(); +} + +int +rte_eal_using_phys_addrs(void) +{ + return phys_addrs_available; +} + +static int __rte_unused +memseg_primary_init_32(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int active_sockets, hpi_idx, msl_idx = 0; + unsigned int socket_id, i; + struct rte_memseg_list *msl; + uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; + uint64_t max_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* this is a giant hack, but desperate times call for desperate + * measures. in legacy 32-bit mode, we cannot preallocate VA space, + * because having upwards of 2 gigabytes of VA space already mapped will + * interfere with our ability to map and sort hugepages. + * + * therefore, in legacy 32-bit mode, we will be initializing memseg + * lists much later - in eal_memory.c, right after we unmap all the + * unneeded pages. this will not affect secondary processes, as those + * should be able to mmap the space without (too many) problems. + */ + if (internal_config.legacy_mem) + return 0; + + /* 32-bit mode is a very special case. we cannot know in advance where + * the user will want to allocate their memory, so we have to do some + * heuristics. + */ + active_sockets = 0; + total_requested_mem = 0; + if (internal_config.force_sockets) + for (i = 0; i < rte_socket_count(); i++) { + uint64_t mem; + + socket_id = rte_socket_id_by_idx(i); + mem = internal_config.socket_mem[socket_id]; + + if (mem == 0) + continue; + + active_sockets++; + total_requested_mem += mem; + } + else + total_requested_mem = internal_config.memory; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + if (total_requested_mem > max_mem) { + RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", + (unsigned int)(max_mem >> 20)); + return -1; + } + total_extra_mem = max_mem - total_requested_mem; + extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : + total_extra_mem / active_sockets; + + /* the allocation logic is a little bit convoluted, but here's how it + * works, in a nutshell: + * - if user hasn't specified on which sockets to allocate memory via + * --socket-mem, we allocate all of our memory on master core socket. + * - if user has specified sockets to allocate memory on, there may be + * some "unused" memory left (e.g. if user has specified --socket-mem + * such that not all memory adds up to 2 gigabytes), so add it to all + * sockets that are in use equally. + * + * page sizes are sorted by size in descending order, so we can safely + * assume that we dispense with bigger page sizes first. 
+ */ + + /* create memseg lists */ + for (i = 0; i < rte_socket_count(); i++) { + int hp_sizes = (int) internal_config.num_hugepage_sizes; + uint64_t max_socket_mem, cur_socket_mem; + unsigned int master_lcore_socket; + struct rte_config *cfg = rte_eal_get_configuration(); + bool skip; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (socket_id > 0) + break; +#endif + + /* if we didn't specifically request memory on this socket */ + skip = active_sockets != 0 && + internal_config.socket_mem[socket_id] == 0; + /* ...or if we didn't specifically request memory on *any* + * socket, and this is not master lcore + */ + master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); + skip |= active_sockets == 0 && socket_id != master_lcore_socket; + + if (skip) { + RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", + socket_id); + continue; + } + + /* max amount of memory on this socket */ + max_socket_mem = (active_sockets != 0 ? + internal_config.socket_mem[socket_id] : + internal_config.memory) + + extra_mem_per_socket; + cur_socket_mem = 0; + + for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { + uint64_t max_pagesz_mem, cur_pagesz_mem = 0; + uint64_t hugepage_sz; + struct hugepage_info *hpi; + int type_msl_idx, max_segs, total_segs = 0; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* check if pages are actually available */ + if (hpi->num_pages[socket_id] == 0) + continue; + + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + max_pagesz_mem = max_socket_mem - cur_socket_mem; + + /* make it multiple of page size */ + max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, + hugepage_sz); + + RTE_LOG(DEBUG, EAL, "Attempting to preallocate " + "%" PRIu64 "M on socket %i\n", + max_pagesz_mem >> 20, socket_id); + + type_msl_idx = 0; + while (cur_pagesz_mem < max_pagesz_mem && + total_segs < max_segs) { + uint64_t cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx]; + + cur_mem = get_mem_amount(hugepage_sz, + max_pagesz_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + socket_id, type_msl_idx)) { + /* failing to allocate a memseg list is + * a serious error. + */ + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + if (alloc_va_space(msl)) { + /* if we couldn't allocate VA space, we + * can try with smaller page sizes. 
+ */ + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); + /* deallocate memseg list */ + if (free_memseg_list(msl)) + return -1; + break; + } + + total_segs += msl->memseg_arr.len; + cur_pagesz_mem = total_segs * hugepage_sz; + type_msl_idx++; + msl_idx++; + } + cur_socket_mem += cur_pagesz_mem; + } + if (cur_socket_mem == 0) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", + socket_id); + return -1; + } + } + + return 0; +} + +static int __rte_unused +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, socket_id, hpi_idx, msl_idx = 0; + struct rte_memseg_list *msl; + uint64_t max_mem, total_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + total_mem = 0; + + /* create memseg lists */ + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + for (i = 0; i < (int) rte_socket_count(); i++) { + uint64_t max_type_mem, total_type_mem = 0; + int type_msl_idx, max_segs, total_segs = 0; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (socket_id > 0) + break; +#endif + + if (total_mem >= max_mem) + break; + + max_type_mem = RTE_MIN(max_mem - total_mem, + (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + + type_msl_idx = 0; + while (total_type_mem < max_type_mem && + total_segs < max_segs) { + uint64_t cur_max_mem, cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx++]; + + cur_max_mem = max_type_mem - total_type_mem; + + cur_mem = get_mem_amount(hugepage_sz, + cur_max_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + socket_id, type_msl_idx)) + return -1; + + total_segs += msl->memseg_arr.len; + total_type_mem = total_segs * hugepage_sz; + type_msl_idx++; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + return -1; + } + } + total_mem += total_type_mem; + } + } + return 0; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + return rte_eal_process_type() == RTE_PROC_PRIMARY ? 
+#ifndef RTE_ARCH_64 + memseg_primary_init_32() : +#else + memseg_primary_init() : +#endif + memseg_secondary_init(); +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_thread.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_thread.c new file mode 100644 index 00000000..b496fc71 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. + */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +/* set affinity for current EAL thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__attribute__((noreturn)) void * +eal_thread_loop(__attribute__((unused)) void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n", + lcore_id, (int)thread_id, cpuset, ret == 0 ? 
"" : "..."); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + + /* when a service core returns, it should go directly to WAIT + * state, because the application will not lcore_wait() for it. + */ + if (lcore_config[lcore_id].core_role == ROLE_SERVICE) + lcore_config[lcore_id].state = WAIT; + else + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + return (int)syscall(SYS_gettid); +} + +int rte_thread_setname(pthread_t id, const char *name) +{ + int ret = ENOSYS; +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + ret = pthread_setname_np(id, name); +#endif +#endif + RTE_SET_USED(id); + RTE_SET_USED(name); + return -ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_timer.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_timer.c new file mode 100644 index 00000000..2766bd78 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_timer.c @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2012-2013 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +enum timer_source eal_timer_source = EAL_TIMER_HPET; + +#ifdef RTE_LIBEAL_USE_HPET + +#define DEV_HPET "/dev/hpet" + +/* Maximum number of counters. */ +#define HPET_TIMER_NUM 3 + +/* General capabilities register */ +#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */ +#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */ + +/** + * HPET timer registers. From the Intel IA-PC HPET (High Precision Event + * Timers) Specification. + */ +struct eal_hpet_regs { + /* Memory-mapped, software visible registers */ + uint64_t capabilities; /**< RO General Capabilities Register. */ + uint64_t reserved0; /**< Reserved for future use. */ + uint64_t config; /**< RW General Configuration Register. */ + uint64_t reserved1; /**< Reserved for future use. */ + uint64_t isr; /**< RW Clear General Interrupt Status. */ + uint64_t reserved2[25]; /**< Reserved for future use. */ + union { + uint64_t counter; /**< RW Main Counter Value Register. */ + struct { + uint32_t counter_l; /**< RW Main Counter Low. */ + uint32_t counter_h; /**< RW Main Counter High. */ + }; + }; + uint64_t reserved3; /**< Reserved for future use. */ + struct { + uint64_t config; /**< RW Timer Config and Capability Reg. */ + uint64_t comp; /**< RW Timer Comparator Value Register. */ + uint64_t fsb; /**< RW FSB Interrupt Route Register. */ + uint64_t reserved4; /**< Reserved for future use. */ + } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. 
*/ +}; + +/* Mmap'd hpet registers */ +static volatile struct eal_hpet_regs *eal_hpet = NULL; + +/* Period at which the HPET counter increments in + * femtoseconds (10^-15 seconds). */ +static uint32_t eal_hpet_resolution_fs = 0; + +/* Frequency of the HPET counter in Hz */ +static uint64_t eal_hpet_resolution_hz = 0; + +/* Incremented 4 times during one 32bits hpet full count */ +static uint32_t eal_hpet_msb; + +static pthread_t msb_inc_thread_id; + +/* + * This function runs on a specific thread to update a global variable + * containing used to process MSB of the HPET (unfortunately, we need + * this because hpet is 32 bits by default under linux). + */ +static void +hpet_msb_inc(__attribute__((unused)) void *arg) +{ + uint32_t t; + + while (1) { + t = (eal_hpet->counter_l >> 30); + if (t != (eal_hpet_msb & 3)) + eal_hpet_msb ++; + sleep(10); + } +} + +uint64_t +rte_get_hpet_hz(void) +{ + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + return eal_hpet_resolution_hz; +} + +uint64_t +rte_get_hpet_cycles(void) +{ + uint32_t t, msb; + uint64_t ret; + + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + t = eal_hpet->counter_l; + msb = eal_hpet_msb; + ret = (msb + 2 - (t >> 30)) / 4; + ret <<= 32; + ret += t; + return ret; +} + +#endif + +#ifdef RTE_LIBEAL_USE_HPET +/* + * Open and mmap /dev/hpet (high precision event timer) that will + * provide our time reference. + */ +int +rte_eal_hpet_init(int make_default) +{ + int fd, ret; + + if (internal_config.no_hpet) { + RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); + return -1; + } + + fd = open(DEV_HPET, O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n", + strerror(errno)); + internal_config.no_hpet = 1; + return -1; + } + eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0); + if (eal_hpet == MAP_FAILED) { + RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n" + "Please enable CONFIG_HPET_MMAP in your kernel configuration " + "to allow HPET support.\n" + "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n " + "in your build configuration or use '--no-hpet' EAL flag.\n"); + close(fd); + internal_config.no_hpet = 1; + return -1; + } + close(fd); + + eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities & + CLK_PERIOD_MASK) >> + CLK_PERIOD_SHIFT); + + eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) / + (uint64_t)eal_hpet_resolution_fs; + + RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n", + eal_hpet_resolution_hz/1000); + + eal_hpet_msb = (eal_hpet->counter_l >> 30); + + /* create a thread that will increment a global variable for + * msb (hpet is 32 bits by default under linux) */ + ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, + (void *(*)(void *))hpet_msb_inc, NULL); + if (ret != 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); + internal_config.no_hpet = 1; + return -1; + } + + if (make_default) + eal_timer_source = EAL_TIMER_HPET; + return 0; +} +#endif + +static void +check_tsc_flags(void) +{ + char line[512]; + FILE *stream; + + stream = fopen("/proc/cpuinfo", "r"); + if (!stream) { + RTE_LOG(WARNING, EAL, "WARNING: Unable to open /proc/cpuinfo\n"); + return; + } + + while (fgets(line, sizeof line, stream)) { + char *constant_tsc; + char *nonstop_tsc; + + if (strncmp(line, "flags", 5) != 0) + continue; + + constant_tsc = strstr(line, "constant_tsc"); + nonstop_tsc = strstr(line, "nonstop_tsc"); + if (!constant_tsc || !nonstop_tsc) + 
RTE_LOG(WARNING, EAL, + "WARNING: cpu flags " + "constant_tsc=%s " + "nonstop_tsc=%s " + "-> using unreliable clock cycles !\n", + constant_tsc ? "yes":"no", + nonstop_tsc ? "yes":"no"); + break; + } + + fclose(stream); +} + +uint64_t +get_tsc_freq(void) +{ +#ifdef CLOCK_MONOTONIC_RAW +#define NS_PER_SEC 1E9 + + struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */ + + struct timespec t_start, t_end; + uint64_t tsc_hz; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) { + uint64_t ns, end, start = rte_rdtsc(); + nanosleep(&sleeptime,NULL); + clock_gettime(CLOCK_MONOTONIC_RAW, &t_end); + end = rte_rdtsc(); + ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC); + ns += (t_end.tv_nsec - t_start.tv_nsec); + + double secs = (double)ns/NS_PER_SEC; + tsc_hz = (uint64_t)((end - start)/secs); + return tsc_hz; + } +#endif + return 0; +} + +int +rte_eal_timer_init(void) +{ + + eal_timer_source = EAL_TIMER_TSC; + + set_tsc_freq(); + check_tsc_flags(); + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c new file mode 100644 index 00000000..dcb21018 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c @@ -0,0 +1,1916 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_vfio.h" +#include "eal_private.h" + +#ifdef VFIO_PRESENT + +#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb" + +/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can + * recreate the mappings for DPDK segments, but we cannot do so for memory that + * was registered by the user themselves, so we need to store the user mappings + * somewhere, to recreate them later. 
+ */ +#define VFIO_MAX_USER_MEM_MAPS 256 +struct user_mem_map { + uint64_t addr; + uint64_t iova; + uint64_t len; +}; + +struct user_mem_maps { + rte_spinlock_recursive_t lock; + int n_maps; + struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS]; +}; + +struct vfio_config { + int vfio_enabled; + int vfio_container_fd; + int vfio_active_groups; + const struct vfio_iommu_type *vfio_iommu_type; + struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; + struct user_mem_maps mem_maps; +}; + +/* per-process VFIO config */ +static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS]; +static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0]; + +static int vfio_type1_dma_map(int); +static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_spapr_dma_map(int); +static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_noiommu_dma_map(int); +static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, + uint64_t iova, uint64_t len, int do_map); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { + .type_id = RTE_VFIO_TYPE1, + .name = "Type 1", + .dma_map_func = &vfio_type1_dma_map, + .dma_user_map_func = &vfio_type1_dma_mem_map + }, + /* ppc64 IOMMU, otherwise known as spapr */ + { + .type_id = RTE_VFIO_SPAPR, + .name = "sPAPR", + .dma_map_func = &vfio_spapr_dma_map, + .dma_user_map_func = &vfio_spapr_dma_mem_map + }, + /* IOMMU-less mode */ + { + .type_id = RTE_VFIO_NOIOMMU, + .name = "No-IOMMU", + .dma_map_func = &vfio_noiommu_dma_map, + .dma_user_map_func = &vfio_noiommu_dma_mem_map + }, +}; + +static int +is_null_map(const struct user_mem_map *map) +{ + return map->addr == 0 && map->iova == 0 && map->len == 0; +} + +/* we may need to merge user mem maps together in case of user mapping/unmapping + * chunks of memory, so we'll need a comparator function to sort segments. + */ +static int +user_mem_map_cmp(const void *a, const void *b) +{ + const struct user_mem_map *umm_a = a; + const struct user_mem_map *umm_b = b; + + /* move null entries to end */ + if (is_null_map(umm_a)) + return 1; + if (is_null_map(umm_b)) + return -1; + + /* sort by iova first */ + if (umm_a->iova < umm_b->iova) + return -1; + if (umm_a->iova > umm_b->iova) + return 1; + + if (umm_a->addr < umm_b->addr) + return -1; + if (umm_a->addr > umm_b->addr) + return 1; + + if (umm_a->len < umm_b->len) + return -1; + if (umm_a->len > umm_b->len) + return 1; + + return 0; +} + +/* adjust user map entry. this may result in shortening of existing map, or in + * splitting existing map in two pieces. 
+ */ +static void +adjust_map(struct user_mem_map *src, struct user_mem_map *end, + uint64_t remove_va_start, uint64_t remove_len) +{ + /* if va start is same as start address, we're simply moving start */ + if (remove_va_start == src->addr) { + src->addr += remove_len; + src->iova += remove_len; + src->len -= remove_len; + } else if (remove_va_start + remove_len == src->addr + src->len) { + /* we're shrinking mapping from the end */ + src->len -= remove_len; + } else { + /* we're blowing a hole in the middle */ + struct user_mem_map tmp; + uint64_t total_len = src->len; + + /* adjust source segment length */ + src->len = remove_va_start - src->addr; + + /* create temporary segment in the middle */ + tmp.addr = src->addr + src->len; + tmp.iova = src->iova + src->len; + tmp.len = remove_len; + + /* populate end segment - this one we will be keeping */ + end->addr = tmp.addr + tmp.len; + end->iova = tmp.iova + tmp.len; + end->len = total_len - src->len - tmp.len; + } +} + +/* try merging two maps into one, return 1 if succeeded */ +static int +merge_map(struct user_mem_map *left, struct user_mem_map *right) +{ + if (left->addr + left->len != right->addr) + return 0; + if (left->iova + left->len != right->iova) + return 0; + + left->len += right->len; + + memset(right, 0, sizeof(*right)); + + return 1; +} + +static struct user_mem_map * +find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr, + uint64_t iova, uint64_t len) +{ + uint64_t va_end = addr + len; + uint64_t iova_end = iova + len; + int i; + + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = &user_mem_maps->maps[i]; + uint64_t map_va_end = map->addr + map->len; + uint64_t map_iova_end = map->iova + map->len; + + /* check start VA */ + if (addr < map->addr || addr >= map_va_end) + continue; + /* check if VA end is within boundaries */ + if (va_end <= map->addr || va_end > map_va_end) + continue; + + /* check start IOVA */ + if (iova < map->iova || iova >= map_iova_end) + continue; + /* check if IOVA end is within boundaries */ + if (iova_end <= map->iova || iova_end > map_iova_end) + continue; + + /* we've found our map */ + return map; + } + return NULL; +} + +/* this will sort all user maps, and merge/compact any adjacent maps */ +static void +compact_user_maps(struct user_mem_maps *user_mem_maps) +{ + int i, n_merged, cur_idx; + + qsort(user_mem_maps->maps, user_mem_maps->n_maps, + sizeof(user_mem_maps->maps[0]), user_mem_map_cmp); + + /* we'll go over the list backwards when merging */ + n_merged = 0; + for (i = user_mem_maps->n_maps - 2; i >= 0; i--) { + struct user_mem_map *l, *r; + + l = &user_mem_maps->maps[i]; + r = &user_mem_maps->maps[i + 1]; + + if (is_null_map(l) || is_null_map(r)) + continue; + + if (merge_map(l, r)) + n_merged++; + } + + /* the entries are still sorted, but now they have holes in them, so + * walk through the list and remove the holes + */ + if (n_merged > 0) { + cur_idx = 0; + for (i = 0; i < user_mem_maps->n_maps; i++) { + if (!is_null_map(&user_mem_maps->maps[i])) { + struct user_mem_map *src, *dst; + + src = &user_mem_maps->maps[i]; + dst = &user_mem_maps->maps[cur_idx++]; + + if (src != dst) { + memcpy(dst, src, sizeof(*src)); + memset(src, 0, sizeof(*src)); + } + } + } + user_mem_maps->n_maps = cur_idx; + } +} + +static int +vfio_open_group_fd(int iommu_group_num) +{ + int vfio_group_fd; + char filename[PATH_MAX]; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param 
*p = (struct vfio_mp_param *)mp_req.param; + + /* if primary, try to open the group */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try regular group format */ + snprintf(filename, sizeof(filename), + VFIO_GROUP_FMT, iommu_group_num); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + /* if file not found, it's not an error */ + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + + /* special case: try no-IOMMU path as well */ + snprintf(filename, sizeof(filename), + VFIO_NOIOMMU_GROUP_FMT, + iommu_group_num); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + return 0; + } + /* noiommu group found */ + } + + return vfio_group_fd; + } + /* if we're in a secondary process, request group fd from the primary + * process via mp channel. + */ + p->req = SOCKET_REQ_GROUP; + p->group_num = iommu_group_num; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_group_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_group_fd = mp_rep->fds[0]; + } else if (p->result == SOCKET_NO_FD) { + RTE_LOG(ERR, EAL, " bad VFIO group fd\n"); + vfio_group_fd = 0; + } + free(mp_reply.msgs); + } + + if (vfio_group_fd < 0) + RTE_LOG(ERR, EAL, " cannot request group fd\n"); + return vfio_group_fd; +} + +static struct vfio_config * +get_vfio_cfg_by_group_num(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + if (vfio_cfg->vfio_groups[j].group_num == + iommu_group_num) + return vfio_cfg; + } + } + + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_group_fd(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return vfio_cfg; + } + + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_container_fd(int container_fd) +{ + int i; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == container_fd) + return &vfio_cfgs[i]; + } + + return NULL; +} + +int +rte_vfio_get_group_fd(int iommu_group_num) +{ + int i; + int vfio_group_fd; + struct vfio_group *cur_grp; + struct vfio_config *vfio_cfg; + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? 
vfio_cfg : default_vfio_cfg; + + /* check if we already have the group descriptor open */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) + return vfio_cfg->vfio_groups[i].fd; + + /* Lets see first if there is room for a new group */ + if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + return -1; + } + + /* Now lets get an index for the new group */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == -1) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); + return -1; + } + + vfio_group_fd = vfio_open_group_fd(iommu_group_num); + if (vfio_group_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); + return -1; + } + + cur_grp->group_num = iommu_group_num; + cur_grp->fd = vfio_group_fd; + vfio_cfg->vfio_active_groups++; + + return vfio_group_fd; +} + +static int +get_vfio_group_idx(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return j; + } + + return -1; +} + +static void +vfio_group_device_get(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg->vfio_groups[i].devices++; +} + +static void +vfio_group_device_put(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg->vfio_groups[i].devices--; +} + +static int +vfio_group_device_count(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) { + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + return -1; + } + + return vfio_cfg->vfio_groups[i].devices; +} + +static void +vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, + void *arg __rte_unused) +{ + struct rte_memseg_list *msl; + struct rte_memseg *ms; + size_t cur_len = 0; + + msl = rte_mem_virt2memseg_list(addr); + + /* for IOVA as VA mode, no need to care for IOVA addresses */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) { + uint64_t vfio_va = (uint64_t)(uintptr_t)addr; + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 1); + else + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 0); + return; + } + + /* memsegs are contiguous in memory */ + ms = rte_mem_virt2memseg(addr, msl); + while (cur_len < len) { + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, + ms->iova, ms->len, 1); + else + vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, + 
ms->iova, ms->len, 0); + + cur_len += ms->len; + ++ms; + } +} + +int +rte_vfio_clear_group(int vfio_group_fd) +{ + int i; + struct vfio_config *vfio_cfg; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0) + return -1; + vfio_cfg->vfio_groups[i].group_num = -1; + vfio_cfg->vfio_groups[i].fd = -1; + vfio_cfg->vfio_groups[i].devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int +rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock; + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int vfio_container_fd; + int vfio_group_fd; + int iommu_group_num; + int i, ret; + + /* get group number */ + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); + if (ret == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* if negative, something failed */ + if (ret < 0) + return -1; + + /* get the actual group fd */ + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); + if (vfio_group_fd < 0) + return -1; + + /* if group_fd == 0, that means the device isn't managed by VFIO */ + if (vfio_group_fd == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* + * at this point, we know that this group is viable (meaning, all devices + * are either bound to VFIO or not bound to anything) + */ + + /* check if the group is viable */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get group status, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + vfio_container_fd = vfio_cfg->vfio_container_fd; + user_mem_maps = &vfio_cfg->mem_maps; + + /* check if group does not have a container yet */ + if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + + /* add group to a container */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, + &vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* + * pick an IOMMU type and set up DMA mappings for container + * + * needs to be done only once, only when first group is + * assigned to a container and only in primary process. + * Note this can happen several times with the hotplug + * functionality. 
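/*
 * Editorial sketch, not part of the upstream patch: a hedged usage example of
 * the group lookup helpers above, to keep their return conventions straight.
 * rte_vfio_get_group_num() returns 1 on success, 0 when the device has no
 * IOMMU group (i.e. is not VFIO-managed) and -1 on error, while
 * rte_vfio_get_group_fd() returns an open fd, 0 when the group is not bound
 * to the vfio driver, or -1 on failure. The sysfs base and the assumption
 * that the prototypes come from the public rte_vfio.h header are illustrative.
 */
#include <rte_vfio.h>

static int
example_resolve_group_fd(const char *dev_addr)
{
	int iommu_group_num;
	int ret;

	ret = rte_vfio_get_group_num("/sys/bus/pci/devices", dev_addr,
			&iommu_group_num);
	if (ret < 0)
		return -1;	/* sysfs lookup failed */
	if (ret == 0)
		return 0;	/* no IOMMU group: not VFIO-managed, skip */

	return rte_vfio_get_group_fd(iommu_group_num);
}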
+ */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_cfg->vfio_active_groups == 1 && + vfio_group_device_count(vfio_group_fd) == 0) { + const struct vfio_iommu_type *t; + + /* select an IOMMU type which we will be using */ + t = vfio_set_iommu_type(vfio_container_fd); + if (!t) { + RTE_LOG(ERR, EAL, + " %s failed to select IOMMU type\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + /* lock memory hotplug before mapping and release it + * after registering callback, to prevent races + */ + rte_rwlock_read_lock(mem_lock); + if (vfio_cfg == default_vfio_cfg && + (internal_config.single_file_segments == 0 || + internal_config.legacy_mem == 0)) + ret = t->dma_map_func(vfio_container_fd); + else + ret = 0; + if (ret) { + RTE_LOG(ERR, EAL, + " %s DMA remapping failed, error %i (%s)\n", + dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + rte_rwlock_read_unlock(mem_lock); + return -1; + } + + vfio_cfg->vfio_iommu_type = t; + + /* re-map all user-mapped segments */ + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* this IOMMU type may not support DMA mapping, but + * if we have mappings in the list - that means we have + * previously mapped something successfully, so we can + * be sure that DMA mapping is supported. + */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map; + map = &user_mem_maps->maps[i]; + + ret = t->dma_user_map_func( + vfio_container_fd, + map->addr, map->iova, map->len, + 1); + if (ret) { + RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: " + "va: 0x%" PRIx64 " " + "iova: 0x%" PRIx64 " " + "len: 0x%" PRIu64 "\n", + map->addr, map->iova, + map->len); + rte_spinlock_recursive_unlock( + &user_mem_maps->lock); + rte_rwlock_read_unlock(mem_lock); + return -1; + } + } + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + + /* register callback for mem events */ + if (vfio_cfg == default_vfio_cfg) + ret = rte_mem_event_callback_register( + VFIO_MEM_EVENT_CLB_NAME, + vfio_mem_event_callback, NULL); + else + ret = 0; + /* unlock memory hotplug */ + rte_rwlock_read_unlock(mem_lock); + + if (ret && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n"); + return -1; + } + if (ret) + RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n"); + else + RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n"); + } + } + + /* get a file descriptor for the device */ + *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); + if (*vfio_dev_fd < 0) { + /* if we cannot get a device fd, this implies a problem with + * the VFIO group or the container not having IOMMU configured. 
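/*
 * Editorial sketch, not part of the upstream patch: the shape of a memory
 * event callback such as the one registered above. The callback name, the
 * rte_memory.h include and the printf bodies are illustrative assumptions;
 * when callbacks are unsupported in the current configuration, registration
 * fails with rte_errno set to ENOTSUP, which the code above tolerates.
 */
#include <stdio.h>
#include <rte_common.h>
#include <rte_memory.h>

static void
example_mem_event_cb(enum rte_mem_event type, const void *addr, size_t len,
		void *arg __rte_unused)
{
	if (type == RTE_MEM_EVENT_ALLOC)
		printf("mapping new segment at %p, %zu bytes\n", addr, len);
	else
		printf("unmapping segment at %p, %zu bytes\n", addr, len);
}

/* registered once, e.g.:
 *   rte_mem_event_callback_register("example", example_mem_event_cb, NULL);
 */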
+ */ + + RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* test and setup the device */ + ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device info, " + "error %i (%s)\n", dev_addr, errno, + strerror(errno)); + close(*vfio_dev_fd); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + vfio_group_device_get(vfio_group_fd); + + return 0; +} + +int +rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, + int vfio_dev_fd) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock; + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_config *vfio_cfg; + int vfio_group_fd; + int iommu_group_num; + int ret; + + /* we don't want any DMA mapping messages to come while we're detaching + * VFIO device, because this might be the last device and we might need + * to unregister the callback. + */ + rte_rwlock_read_lock(mem_lock); + + /* get group number */ + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); + if (ret <= 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n", + dev_addr); + /* This is an error at this point. */ + ret = -1; + goto out; + } + + /* get the actual group fd */ + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); + if (vfio_group_fd <= 0) { + RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n", + dev_addr); + ret = -1; + goto out; + } + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + /* At this point we got an active group. Closing it will make the + * container detachment. If this is the last active group, VFIO kernel + * code will unset the container and the IOMMU mappings. + */ + + /* Closing a device */ + if (close(vfio_dev_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n", + dev_addr); + ret = -1; + goto out; + } + + /* An VFIO group can have several devices attached. Just when there is + * no devices remaining should the group be closed. + */ + vfio_group_device_put(vfio_group_fd); + if (!vfio_group_device_count(vfio_group_fd)) { + + if (close(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n", + dev_addr); + ret = -1; + goto out; + } + + if (rte_vfio_clear_group(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", + dev_addr); + ret = -1; + goto out; + } + } + + /* if there are no active device groups, unregister the callback to + * avoid spurious attempts to map/unmap memory from VFIO. 
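/*
 * Editorial sketch, not part of the upstream patch: the usual pairing of
 * rte_vfio_setup_device() and rte_vfio_release_device() as a hypothetical
 * driver might call them. The sysfs base, the device address argument and
 * the rte_vfio.h / linux/vfio.h includes are assumptions for illustration.
 */
#include <rte_vfio.h>
#include <linux/vfio.h>

static int
example_attach_detach(const char *dev_addr)
{
	struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
	int dev_fd;
	int ret;

	ret = rte_vfio_setup_device("/sys/bus/pci/devices", dev_addr,
			&dev_fd, &dev_info);
	if (ret == 1)
		return 0;	/* not VFIO-managed: nothing to do */
	if (ret < 0)
		return -1;	/* group, container or ioctl failure */

	/* ... program the device through dev_fd (regions, interrupts) ... */

	return rte_vfio_release_device("/sys/bus/pci/devices", dev_addr,
			dev_fd);
}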
+ */ + if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0) + rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, + NULL); + + /* success */ + ret = 0; + +out: + rte_rwlock_read_unlock(mem_lock); + return ret; +} + +int +rte_vfio_enable(const char *modname) +{ + /* initialize group list */ + int i, j; + int vfio_available; + + rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfgs[i].vfio_container_fd = -1; + vfio_cfgs[i].vfio_active_groups = 0; + vfio_cfgs[i].vfio_iommu_type = NULL; + vfio_cfgs[i].mem_maps.lock = lock; + + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + vfio_cfgs[i].vfio_groups[j].fd = -1; + vfio_cfgs[i].vfio_groups[j].group_num = -1; + vfio_cfgs[i].vfio_groups[j].devices = 0; + } + } + + /* inform the user that we are probing for VFIO */ + RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); + + /* check if vfio module is loaded */ + vfio_available = rte_eal_check_module(modname); + + /* return error directly */ + if (vfio_available == -1) { + RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); + return -1; + } + + /* return 0 if VFIO modules not loaded */ + if (vfio_available == 0) { + RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " + "skipping VFIO support...\n"); + return 0; + } + + default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd(); + + /* check if we have VFIO driver enabled */ + if (default_vfio_cfg->vfio_container_fd != -1) { + RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); + default_vfio_cfg->vfio_enabled = 1; + } else { + RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); + } + + return 0; +} + +int +rte_vfio_is_enabled(const char *modname) +{ + const int mod_available = rte_eal_check_module(modname) > 0; + return default_vfio_cfg->vfio_enabled && mod_available; +} + +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd) +{ + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; + } + /* not an error, there may be more supported IOMMU types */ + RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " + "error %i (%s)\n", t->type_id, t->name, errno, + strerror(errno)); + } + /* if we didn't find a suitable IOMMU type, fail */ + return NULL; +} + +int +vfio_has_supported_extensions(int vfio_container_fd) +{ + int ret; + unsigned idx, n_extensions = 0; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, + t->type_id); + if (ret < 0) { + RTE_LOG(ERR, EAL, " could not get IOMMU type, " + "error %i (%s)\n", errno, + strerror(errno)); + close(vfio_container_fd); + return -1; + } else if (ret == 1) { + /* we found a supported extension */ + n_extensions++; + } + RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", + t->type_id, t->name, + ret ? 
"supported" : "not supported"); + } + + /* if we didn't find any supported IOMMU types, fail */ + if (!n_extensions) { + close(vfio_container_fd); + return -1; + } + + return 0; +} + +int +rte_vfio_get_container_fd(void) +{ + int ret, vfio_container_fd; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + + /* if we're in a primary process, try to open the container */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot open VFIO container, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* check VFIO API version */ + ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + if (ret < 0) + RTE_LOG(ERR, EAL, " could not get VFIO API version, " + "error %i (%s)\n", errno, strerror(errno)); + else + RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); + close(vfio_container_fd); + return -1; + } + + ret = vfio_has_supported_extensions(vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " no supported IOMMU " + "extensions found!\n"); + return -1; + } + + return vfio_container_fd; + } + /* + * if we're in a secondary process, request container fd from the + * primary process via mp channel + */ + p->req = SOCKET_REQ_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_container_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + free(mp_reply.msgs); + return mp_rep->fds[0]; + } + free(mp_reply.msgs); + } + + RTE_LOG(ERR, EAL, " cannot request container fd\n"); + return -1; +} + +int +rte_vfio_get_group_num(const char *sysfs_base, + const char *dev_addr, int *iommu_group_num) +{ + char linkname[PATH_MAX]; + char filename[PATH_MAX]; + char *tok[16], *group_tok, *end; + int ret; + + memset(linkname, 0, sizeof(linkname)); + memset(filename, 0, sizeof(filename)); + + /* try to find out IOMMU group for this device */ + snprintf(linkname, sizeof(linkname), + "%s/%s/iommu_group", sysfs_base, dev_addr); + + ret = readlink(linkname, filename, sizeof(filename)); + + /* if the link doesn't exist, no VFIO for us */ + if (ret < 0) + return 0; + + ret = rte_strsplit(filename, sizeof(filename), + tok, RTE_DIM(tok), '/'); + + if (ret <= 0) { + RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr); + return -1; + } + + /* IOMMU group is always the last token */ + errno = 0; + group_tok = tok[ret - 1]; + end = group_tok; + *iommu_group_num = strtol(group_tok, &end, 10); + if ((end != group_tok && *end != '\0') || errno != 0) { + RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); + return -1; + } + + return 1; +} + +static int +type1_map(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; + + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +static int +vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + + if (do_map != 0) { + memset(&dma_map, 0, sizeof(dma_map)); + 
dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } else { + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_type1_dma_map(int vfio_container_fd) +{ + return rte_memseg_walk(type1_map, &vfio_container_fd); +} + +static int +vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + + if (do_map != 0) { + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + } else { + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; + + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; + + return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +struct spapr_walk_param { + uint64_t window_size; + uint64_t hugepage_sz; +}; +static int +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct spapr_walk_param *param = arg; + uint64_t max = ms->iova + ms->len; + + if (max > param->window_size) { + param->hugepage_sz = ms->hugepage_sz; + param->window_size = max; + } + + return 0; +} + +static int +vfio_spapr_create_new_dma_window(int vfio_container_fd, + struct vfio_iommu_spapr_tce_create *create) { + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + }; + struct vfio_iommu_spapr_tce_info info = { + .argsz = sizeof(info), + }; + int ret; + + /* query spapr iommu info */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + RTE_LOG(ERR, EAL, " cannot get iommu info, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* remove default DMA of 32 bit window */ + 
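/*
 * Editorial sketch, not part of the upstream patch: how the sPAPR DMA window
 * parameters used below are derived. The window must span the highest IOVA in
 * use and the kernel requires a power-of-two size; the TCE page size follows
 * the hugepage size found while walking the memsegs. Assumes rte_align64pow2()
 * from rte_common.h and the vfio_iommu_spapr_tce_create layout from
 * eal_vfio.h (or linux/vfio.h on kernels that provide it).
 */
#include <stdint.h>
#include <rte_common.h>
#include "eal_vfio.h"

static void
example_spapr_window_params(uint64_t max_iova, uint64_t hugepage_sz,
		struct vfio_iommu_spapr_tce_create *create)
{
	/* span the largest IOVA, rounded up to a power of two */
	create->window_size = rte_align64pow2(max_iova);
	/* TCE page size follows the hugepage size backing the segments */
	create->page_shift = __builtin_ctzll(hugepage_sz);
	create->levels = 1;
}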
remove.start_addr = info.dma32_window_start; + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + RTE_LOG(ERR, EAL, " cannot remove default DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* create new DMA window */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create); + if (ret) { + RTE_LOG(ERR, EAL, " cannot create new DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + if (create->start_addr != 0) { + RTE_LOG(ERR, EAL, " DMA window start address != 0\n"); + return -1; + } + + return 0; +} + +static int +vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct spapr_walk_param param; + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int i, ret = 0; + + vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid container fd!\n"); + return -1; + } + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* check if window size needs to be adjusted */ + memset(¶m, 0, sizeof(param)); + + /* we're inside a callback so use thread-unsafe version */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, + ¶m) < 0) { + RTE_LOG(ERR, EAL, "Could not get window size\n"); + ret = -1; + goto out; + } + + /* also check user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + uint64_t max = user_mem_maps->maps[i].iova + + user_mem_maps->maps[i].len; + create.window_size = RTE_MAX(create.window_size, max); + } + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (do_map) { + void *addr; + /* re-create window and remap the entire memory */ + if (iova > create.window_size) { + if (vfio_spapr_create_new_dma_window(vfio_container_fd, + &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + ret = -1; + goto out; + } + /* we're inside a callback, so use thread-unsafe version + */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk, + &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); + ret = -1; + goto out; + } + /* remap all user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = + &user_mem_maps->maps[i]; + if (vfio_spapr_dma_do_map(vfio_container_fd, + map->addr, map->iova, map->len, + 1)) { + RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n"); + ret = -1; + goto out; + } + } + } + + /* now that we've remapped all of the memory that was present + * before, map the segment that we were requested to map. + * + * however, if we were called by the callback, the memory we + * were called with was already in the memseg list, so previous + * mapping should've mapped that segment already. + * + * virt2memseg_list is a relatively cheap check, so use that. if + * memory is within any memseg list, it's a memseg, so it's + * already mapped. 
+ */ + addr = (void *)(uintptr_t)vaddr; + if (rte_mem_virt2memseg_list(addr) == NULL && + vfio_spapr_dma_do_map(vfio_container_fd, + vaddr, iova, len, 1) < 0) { + RTE_LOG(ERR, EAL, "Could not map segment\n"); + ret = -1; + goto out; + } + } else { + /* for unmap, check if iova within DMA window */ + if (iova > create.window_size) { + RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap"); + ret = -1; + goto out; + } + + vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0); + } +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +vfio_spapr_dma_map(int vfio_container_fd) +{ + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct spapr_walk_param param; + + memset(¶m, 0, sizeof(param)); + + /* create DMA window from 0 to max(phys_addr + len) */ + rte_memseg_walk(vfio_spapr_window_size_walk, ¶m); + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + return -1; + } + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) + return -1; + + return 0; +} + +static int +vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, + uint64_t __rte_unused vaddr, + uint64_t __rte_unused iova, uint64_t __rte_unused len, + int __rte_unused do_map) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; + + if (!t) { + RTE_LOG(ERR, EAL, " VFIO support not initialized\n"); + rte_errno = ENODEV; + return -1; + } + + if (!t->dma_user_map_func) { + RTE_LOG(ERR, EAL, + " VFIO custom DMA region maping not supported by IOMMU %s\n", + t->name); + rte_errno = ENOTSUP; + return -1; + } + + return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, + len, do_map); +} + +static int +container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *new_map; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "No more space for user mem maps\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + /* map the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { + /* technically, this will fail if there are currently no devices + * plugged in, even if a device were added later, this mapping + * might have succeeded. however, since we cannot verify if this + * is a valid mapping without having a device attached, consider + * this to be unsupported, because we can't just store any old + * mapping and pollute list of active mappings willy-nilly. 
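/*
 * Editorial sketch, not part of the upstream patch: using the public wrappers
 * defined just below to DMA-map an externally allocated, IOVA-contiguous
 * buffer into the default container. The buffer, IOVA and length are
 * placeholders, and the memory is assumed to stay resident (e.g. hugepage or
 * otherwise pinned) for the lifetime of the mapping.
 */
#include <stdint.h>
#include <rte_vfio.h>

static int
example_map_external(void *buf, uint64_t iova, uint64_t len)
{
	uint64_t va = (uint64_t)(uintptr_t)buf;

	/* maps [va, va + len) to [iova, iova + len) in the default container */
	if (rte_vfio_dma_map(va, iova, len) != 0)
		return -1;

	/* ... hand iova to the device and run DMA ... */

	return rte_vfio_dma_unmap(va, iova, len);
}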
+ */ + RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n"); + ret = -1; + goto out; + } + /* create new user mem map entry */ + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + new_map->addr = vaddr; + new_map->iova = iova; + new_map->len = len; + + compact_user_maps(user_mem_maps); +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *map, *new_map = NULL; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* find our mapping */ + map = find_user_mem_map(user_mem_maps, vaddr, iova, len); + if (!map) { + RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n"); + rte_errno = EINVAL; + ret = -1; + goto out; + } + if (map->addr != vaddr || map->iova != iova || map->len != len) { + /* we're partially unmapping a previously mapped region, so we + * need to split entry into two. + */ + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + } + + /* unmap the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { + /* there may not be any devices plugged in, so unmapping will + * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't + * stop us from removing the mapping, as the assumption is we + * won't be needing this memory any more and thus will want to + * prevent it from being remapped again on hotplug. so, only + * fail if we indeed failed to unmap (e.g. if the mapping was + * within our mapped range but had invalid alignment). + */ + if (rte_errno != ENODEV && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n"); + ret = -1; + goto out; + } else { + RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n"); + } + } + /* remove map from the list of active mappings */ + if (new_map != NULL) { + adjust_map(map, new_map, vaddr, len); + + /* if we've created a new map by splitting, sort everything */ + if (!is_null_map(new_map)) { + compact_user_maps(user_mem_maps); + } else { + /* we've created a new mapping, but it was unused */ + user_mem_maps->n_maps--; + } + } else { + memset(map, 0, sizeof(*map)); + compact_user_maps(user_mem_maps); + user_mem_maps->n_maps--; + } + +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +int +rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len) +{ + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + return container_dma_map(default_vfio_cfg, vaddr, iova, len); +} + +int +rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len) +{ + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + return container_dma_unmap(default_vfio_cfg, vaddr, iova, len); +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + int fd; + ssize_t cnt; + char c; + + fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); + if (fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n", + errno, strerror(errno)); + return -1; + } + /* + * else the file does not exists + * i.e. 
noiommu is not enabled + */ + return 0; + } + + cnt = read(fd, &c, 1); + close(fd); + if (cnt != 1) { + RTE_LOG(ERR, EAL, " unable to read from vfio noiommu " + "file %i (%s)\n", errno, strerror(errno)); + return -1; + } + + return c == 'Y'; +} + +int +rte_vfio_container_create(void) +{ + int i; + + /* Find an empty slot to store new vfio config */ + for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == -1) + break; + } + + if (i == VFIO_MAX_CONTAINERS) { + RTE_LOG(ERR, EAL, "exceed max vfio container limit\n"); + return -1; + } + + vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); + if (vfio_cfgs[i].vfio_container_fd < 0) { + RTE_LOG(NOTICE, EAL, "fail to create a new container\n"); + return -1; + } + + return vfio_cfgs[i].vfio_container_fd; +} + +int __rte_experimental +rte_vfio_container_destroy(int container_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num != -1) + rte_vfio_container_group_unbind(container_fd, + vfio_cfg->vfio_groups[i].group_num); + + close(container_fd); + vfio_cfg->vfio_container_fd = -1; + vfio_cfg->vfio_active_groups = 0; + vfio_cfg->vfio_iommu_type = NULL; + + return 0; +} + +int +rte_vfio_container_group_bind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + struct vfio_group *cur_grp; + int vfio_group_fd; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + /* Check room for new group */ + if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + return -1; + } + + /* Get an index for the new group */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == -1) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); + return -1; + } + + vfio_group_fd = vfio_open_group_fd(iommu_group_num); + if (vfio_group_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); + return -1; + } + cur_grp->group_num = iommu_group_num; + cur_grp->fd = vfio_group_fd; + cur_grp->devices = 0; + vfio_cfg->vfio_active_groups++; + + return vfio_group_fd; +} + +int +rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + struct vfio_group *cur_grp = NULL; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) { + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { + RTE_LOG(ERR, EAL, "Specified group number not found\n"); + return -1; + } + + if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { + RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for" + " iommu_group_num %d\n", iommu_group_num); + return -1; + } + cur_grp->group_num = -1; + cur_grp->fd = -1; + cur_grp->devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t 
len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_map(vfio_cfg, vaddr, iova, len); +} + +int +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_unmap(vfio_cfg, vaddr, iova, len); +} + +#else + +int +rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_setup_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *vfio_dev_fd, + __rte_unused struct vfio_device_info *device_info) +{ + return -1; +} + +int +rte_vfio_release_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, __rte_unused int fd) +{ + return -1; +} + +int +rte_vfio_enable(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_is_enabled(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + return -1; +} + +int +rte_vfio_clear_group(__rte_unused int vfio_group_fd) +{ + return -1; +} + +int +rte_vfio_get_group_num(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *iommu_group_num) +{ + return -1; +} + +int +rte_vfio_get_container_fd(void) +{ + return -1; +} + +int +rte_vfio_get_group_fd(__rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_create(void) +{ + return -1; +} + +int +rte_vfio_container_destroy(__rte_unused int container_fd) +{ + return -1; +} + +int +rte_vfio_container_group_bind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_group_unbind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_dma_map(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_container_dma_unmap(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +#endif /* VFIO_PRESENT */ diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.h b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.h new file mode 100644 index 00000000..68d4750a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_VFIO_H_ +#define EAL_VFIO_H_ + +/* + * determine if VFIO is present on the system + */ +#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO) +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_PRESENT +#else +#pragma message("VFIO configured but not supported by this kernel, disabling.") +#endif /* kernel version >= 3.6.0 */ +#endif /* RTE_EAL_VFIO */ + +#ifdef VFIO_PRESENT + +#include +#include + +#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU + 
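/*
 * Editorial sketch, not part of the upstream patch: the multi-container flow
 * offered by the rte_vfio_container_*() API implemented at the end of
 * eal_vfio.c above. The group number and addresses are placeholders. Note
 * that rte_vfio_container_dma_map() fails (rte_errno == ENODEV) until the
 * container's IOMMU type has been selected, which happens when a device from
 * the bound group goes through rte_vfio_setup_device().
 */
#include <stdint.h>
#include <rte_vfio.h>

static int
example_container_flow(int iommu_group_num, uint64_t va, uint64_t iova,
		uint64_t len)
{
	int container_fd = rte_vfio_container_create();

	if (container_fd < 0)
		return -1;

	/* bind the device's IOMMU group to this container, not the default */
	if (rte_vfio_container_group_bind(container_fd, iommu_group_num) < 0)
		goto fail;

	/* ... rte_vfio_setup_device() for a device in this group goes here ... */

	if (rte_vfio_container_dma_map(container_fd, va, iova, len) != 0)
		goto fail;

	rte_vfio_container_dma_unmap(container_fd, va, iova, len);
	return rte_vfio_container_destroy(container_fd);

fail:
	rte_vfio_container_destroy(container_fd);
	return -1;
}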
+#ifndef VFIO_SPAPR_TCE_v2_IOMMU +#define RTE_VFIO_SPAPR 7 +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) + +struct vfio_iommu_spapr_register_memory { + uint32_t argsz; + uint32_t flags; + uint64_t vaddr; + uint64_t size; +}; + +struct vfio_iommu_spapr_tce_create { + uint32_t argsz; + uint32_t flags; + /* in */ + uint32_t page_shift; + uint32_t __resv1; + uint64_t window_size; + uint32_t levels; + uint32_t __resv2; + /* out */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_remove { + uint32_t argsz; + uint32_t flags; + /* in */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_ddw_info { + uint64_t pgsizes; + uint32_t max_dynamic_windows_supported; + uint32_t levels; +}; + +/* SPAPR_v2 is not present, but SPAPR might be */ +#ifndef VFIO_SPAPR_TCE_IOMMU +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +struct vfio_iommu_spapr_tce_info { + uint32_t argsz; + uint32_t flags; + uint32_t dma32_window_start; + uint32_t dma32_window_size; + struct vfio_iommu_spapr_tce_ddw_info ddw; +}; +#endif /* VFIO_SPAPR_TCE_IOMMU */ + +#else /* VFIO_SPAPR_TCE_v2_IOMMU */ +#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU +#endif + +#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS +#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS + +/* + * we don't need to store device fd's anywhere since they can be obtained from + * the group fd via an ioctl() call. + */ +struct vfio_group { + int group_num; + int fd; + int devices; +}; + +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. + * */ +typedef int (*vfio_dma_func_t)(int); + +/* Custom memory region DMA mapping function prototype. + * Takes VFIO container fd, virtual address, phisical address, length and + * operation type (0 to unmap 1 for map) as a parameters. + * Returns 0 on success, -1 on error. + **/ +typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_user_func_t dma_user_map_func; + vfio_dma_func_t dma_map_func; +}; + +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd); + +/* check if we have any supported extensions */ +int +vfio_has_supported_extensions(int vfio_container_fd); + +int vfio_mp_sync_setup(void); + +#define EAL_VFIO_MP "eal_vfio_mp_sync" + +#define SOCKET_REQ_CONTAINER 0x100 +#define SOCKET_REQ_GROUP 0x200 +#define SOCKET_OK 0x0 +#define SOCKET_NO_FD 0x1 +#define SOCKET_ERR 0xFF + +struct vfio_mp_param { + int req; + int result; + int group_num; +}; + +#endif /* VFIO_PRESENT */ + +#endif /* EAL_VFIO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c new file mode 100644 index 00000000..680a24aa --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include + +#include "eal_vfio.h" + +/** + * @file + * VFIO socket for communication between primary and secondary processes. 
+ * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". + */ + +#ifdef VFIO_PRESENT + +static int +vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) +{ + int fd = -1; + int ret; + struct rte_mp_msg reply; + struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param; + const struct vfio_mp_param *m = + (const struct vfio_mp_param *)msg->param; + + if (msg->len_param != sizeof(*m)) { + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); + return -1; + } + + memset(&reply, 0, sizeof(reply)); + + switch (m->req) { + case SOCKET_REQ_GROUP: + r->req = SOCKET_REQ_GROUP; + r->group_num = m->group_num; + fd = rte_vfio_get_group_fd(m->group_num); + if (fd < 0) + r->result = SOCKET_ERR; + else if (fd == 0) + /* if VFIO group exists but isn't bound to VFIO driver */ + r->result = SOCKET_NO_FD; + else { + /* if group exists and is bound to VFIO driver */ + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_CONTAINER: + r->req = SOCKET_REQ_CONTAINER; + fd = rte_vfio_get_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + default: + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); + return -1; + } + + strcpy(reply.name, EAL_VFIO_MP); + reply.len_param = sizeof(*r); + + ret = rte_mp_reply(&reply, peer); + if (m->req == SOCKET_REQ_CONTAINER && fd >= 0) + close(fd); + return ret; +} + +int +vfio_mp_sync_setup(void) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary); + + return 0; +} + +#endif diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h new file mode 100644 index 00000000..cfa9448b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */ +/* + * Copyright(c) 2007-2014 Intel Corporation. + */ + +#ifndef _RTE_KNI_COMMON_H_ +#define _RTE_KNI_COMMON_H_ + +#ifdef __KERNEL__ +#include +#define RTE_STD_C11 +#else +#include +#include +#endif + +/** + * KNI name is part of memzone name. + */ +#define RTE_KNI_NAMESIZE 32 + +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/* + * Request id. + */ +enum rte_kni_req_id { + RTE_KNI_REQ_UNKNOWN = 0, + RTE_KNI_REQ_CHANGE_MTU, + RTE_KNI_REQ_CFG_NETWORK_IF, + RTE_KNI_REQ_CHANGE_MAC_ADDR, + RTE_KNI_REQ_CHANGE_PROMISC, + RTE_KNI_REQ_MAX, +}; + +/* + * Structure for KNI request. + */ +struct rte_kni_request { + uint32_t req_id; /**< Request id */ + RTE_STD_C11 + union { + uint32_t new_mtu; /**< New MTU */ + uint8_t if_up; /**< 1: interface up, 0: interface down */ + uint8_t mac_addr[6]; /**< MAC address for interface */ + uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */ + }; + int32_t result; /**< Result for processing request */ +} __attribute__((__packed__)); + +/* + * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO + * Write and read should wrap around. 
Fifo is empty when write == read + * Writing should never overwrite the read position + */ +struct rte_kni_fifo { + volatile unsigned write; /**< Next position to be written*/ + volatile unsigned read; /**< Next position to be read */ + unsigned len; /**< Circular buffer length */ + unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ + void *volatile buffer[]; /**< The buffer contains mbuf pointers */ +}; + +/* + * The kernel image of the rte_mbuf struct, with only the relevant fields. + * Padding is necessary to assure the offsets of these fields + */ +struct rte_kni_mbuf { + void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); + uint64_t buf_physaddr; + uint16_t data_off; /**< Start address of data in segment buffer. */ + char pad1[2]; + uint16_t nb_segs; /**< Number of segments. */ + char pad4[2]; + uint64_t ol_flags; /**< Offload features. */ + char pad2[4]; + uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */ + uint16_t data_len; /**< Amount of data in segment buffer. */ + + /* fields on second cache line */ + char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); + void *pool; + void *next; +}; + +/* + * Struct used to create a KNI device. Passed to the kernel in IOCTL call + */ + +struct rte_kni_device_info { + char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */ + + phys_addr_t tx_phys; + phys_addr_t rx_phys; + phys_addr_t alloc_phys; + phys_addr_t free_phys; + + /* Used by Ethtool */ + phys_addr_t req_phys; + phys_addr_t resp_phys; + phys_addr_t sync_phys; + void * sync_va; + + /* mbuf mempool */ + void * mbuf_va; + phys_addr_t mbuf_phys; + + /* PCI info */ + uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */ + uint16_t device_id; /**< Device ID or PCI_ANY_ID. */ + uint8_t bus; /**< Device bus */ + uint8_t devid; /**< Device ID */ + uint8_t function; /**< Device function. 
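/*
 * Editorial sketch, not part of the upstream patch: one way a single producer
 * could enqueue into the rte_kni_fifo declared above while honouring the
 * documented invariants (the FIFO is empty when write == read, and a writer
 * must never overwrite the read position, so one slot is always left unused).
 * The real kernel- and user-space implementations live elsewhere in the tree
 * and additionally need correct memory ordering between the two sides.
 */
static unsigned
example_kni_fifo_put(struct rte_kni_fifo *fifo, void **mbufs, unsigned num)
{
	unsigned i;
	unsigned write = fifo->write;
	unsigned read = fifo->read;
	unsigned free_slots = (read + fifo->len - write - 1) % fifo->len;
	unsigned n = num < free_slots ? num : free_slots;

	for (i = 0; i < n; i++) {
		fifo->buffer[write] = mbufs[i];
		write = (write + 1) % fifo->len;
	}
	fifo->write = write;	/* publish only after the payload is in place */
	return n;
}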
*/ + + uint16_t group_id; /**< Group ID */ + uint32_t core_id; /**< core ID to bind for kernel thread */ + + __extension__ + uint8_t force_bind : 1; /**< Flag for kernel thread binding */ + + /* mbuf size */ + unsigned mbuf_size; + unsigned int mtu; + char mac_addr[6]; +}; + +#define KNI_DEVICE "kni" + +#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int) +#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info) +#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) + +#endif /* _RTE_KNI_COMMON_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/meson.build b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/meson.build new file mode 100644 index 00000000..6e31c2aa --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/linuxapp/eal/meson.build @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +eal_inc += include_directories('include', '../../../librte_compat') +install_subdir('include/exec-env', install_dir: get_option('includedir')) + +env_objs = [] +env_headers = [] +env_sources = files('eal_alarm.c', + 'eal_cpuflags.c', + 'eal_debug.c', + 'eal_hugepage_info.c', + 'eal_interrupts.c', + 'eal_memalloc.c', + 'eal_lcore.c', + 'eal_log.c', + 'eal_thread.c', + 'eal_timer.c', + 'eal_vfio.c', + 'eal_vfio_mp_sync.c', + 'eal.c', + 'eal_memory.c', + 'eal_dev.c', +) + +deps += ['kvargs'] +if has_libnuma == 1 + dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true) +endif diff --git a/src/spdk/dpdk/lib/librte_eal/meson.build b/src/spdk/dpdk/lib/librte_eal/meson.build new file mode 100644 index 00000000..e1fde15d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/meson.build @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +# Custom EAL processing. EAL is complicated enough that it can't just +# have a straight list of headers and source files. +# Initially pull in common settings +eal_inc = [global_inc] +subdir('common') # defines common_sources, common_objs, etc. + +# Now do OS/exec-env specific settings, including building kernel modules +# The /eal/meson.build file should define env_sources, etc. 
+if host_machine.system() == 'linux' + dpdk_conf.set('RTE_EXEC_ENV_LINUXAPP', 1) + subdir('linuxapp/eal') + +elif host_machine.system() == 'freebsd' + dpdk_conf.set('RTE_EXEC_ENV_BSDAPP', 1) + subdir('bsdapp/eal') + +else + error('unsupported system type "@0@"'.format(host_machine.system())) +endif + +version = 8 # the version of the EAL API +allow_experimental_apis = true +deps += 'compat' +deps += 'kvargs' +cflags += '-D_GNU_SOURCE' +sources = common_sources + env_sources +objs = common_objs + env_objs +headers = common_headers + env_headers +includes = eal_inc diff --git a/src/spdk/dpdk/lib/librte_eal/rte_eal_version.map b/src/spdk/dpdk/lib/librte_eal/rte_eal_version.map new file mode 100644 index 00000000..344a43d3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/rte_eal_version.map @@ -0,0 +1,338 @@ +DPDK_2.0 { + global: + + __rte_panic; + eal_parse_sysfs_value; + eal_timer_source; + lcore_config; + per_lcore__lcore_id; + per_lcore__rte_errno; + rte_calloc; + rte_calloc_socket; + rte_cpu_check_supported; + rte_cpu_get_flag_enabled; + rte_cycles_vmware_tsc_map; + rte_delay_us; + rte_dump_physmem_layout; + rte_dump_registers; + rte_dump_stack; + rte_dump_tailq; + rte_eal_alarm_cancel; + rte_eal_alarm_set; + rte_eal_devargs_add; + rte_eal_devargs_dump; + rte_eal_devargs_type_count; + rte_eal_get_configuration; + rte_eal_get_lcore_state; + rte_eal_get_physmem_size; + rte_eal_has_hugepages; + rte_eal_hpet_init; + rte_eal_init; + rte_eal_iopl_init; + rte_eal_lcore_role; + rte_eal_mp_remote_launch; + rte_eal_mp_wait_lcore; + rte_eal_parse_devargs_str; + rte_eal_process_type; + rte_eal_remote_launch; + rte_eal_tailq_lookup; + rte_eal_tailq_register; + rte_eal_wait_lcore; + rte_exit; + rte_free; + rte_get_hpet_cycles; + rte_get_hpet_hz; + rte_get_tsc_hz; + rte_hexdump; + rte_intr_callback_register; + rte_intr_callback_unregister; + rte_intr_disable; + rte_intr_enable; + rte_log; + rte_log_cur_msg_loglevel; + rte_log_cur_msg_logtype; + rte_logs; + rte_malloc; + rte_malloc_dump_stats; + rte_malloc_get_socket_stats; + rte_malloc_set_limit; + rte_malloc_socket; + rte_malloc_validate; + rte_mem_lock_page; + rte_mem_virt2phy; + rte_memdump; + rte_memory_get_nchannel; + rte_memory_get_nrank; + rte_memzone_dump; + rte_memzone_lookup; + rte_memzone_reserve; + rte_memzone_reserve_aligned; + rte_memzone_reserve_bounded; + rte_memzone_walk; + rte_openlog_stream; + rte_realloc; + rte_set_application_usage_hook; + rte_socket_id; + rte_strerror; + rte_strsplit; + rte_sys_gettid; + rte_thread_get_affinity; + rte_thread_set_affinity; + rte_vlog; + rte_zmalloc; + rte_zmalloc_socket; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_epoll_ctl; + rte_epoll_wait; + rte_intr_allow_others; + rte_intr_dp_is_en; + rte_intr_efd_disable; + rte_intr_efd_enable; + rte_intr_rx_ctl; + rte_intr_tls_epfd; + rte_memzone_free; + +} DPDK_2.0; + +DPDK_2.2 { + global: + + rte_intr_cap_multiple; + rte_keepalive_create; + rte_keepalive_dispatch_pings; + rte_keepalive_mark_alive; + rte_keepalive_register_core; + +} DPDK_2.1; + +DPDK_16.04 { + global: + + rte_cpu_get_flag_name; + rte_eal_primary_proc_alive; + +} DPDK_2.2; + +DPDK_16.07 { + global: + + rte_keepalive_mark_sleep; + rte_keepalive_register_relay_callback; + rte_rtm_supported; + rte_thread_setname; + +} DPDK_16.04; + +DPDK_16.11 { + global: + + rte_delay_us_block; + rte_delay_us_callback_register; + rte_eal_dev_attach; + rte_eal_dev_detach; + +} DPDK_16.07; + +DPDK_17.02 { + global: + + rte_bus_dump; + rte_bus_probe; + rte_bus_register; + rte_bus_scan; + rte_bus_unregister; 
+ +} DPDK_16.11; + +DPDK_17.05 { + global: + + rte_cpu_is_supported; + rte_intr_free_epoll_fd; + rte_log_dump; + rte_log_get_global_level; + rte_log_register; + rte_log_set_global_level; + rte_log_set_level; + rte_log_set_level_regexp; + +} DPDK_17.02; + +DPDK_17.08 { + global: + + rte_bus_find; + rte_bus_find_by_device; + rte_bus_find_by_name; + rte_log_get_level; + +} DPDK_17.05; + +DPDK_17.11 { + global: + + rte_eal_create_uio_dev; + rte_bus_get_iommu_class; + rte_eal_has_pci; + rte_eal_iova_mode; + rte_eal_using_phys_addrs; + rte_eal_vfio_intr_mode; + rte_lcore_has_role; + rte_malloc_virt2iova; + rte_mem_virt2iova; + rte_vfio_enable; + rte_vfio_is_enabled; + rte_vfio_noiommu_is_enabled; + rte_vfio_release_device; + rte_vfio_setup_device; + +} DPDK_17.08; + +DPDK_18.02 { + global: + + rte_hypervisor_get; + rte_hypervisor_get_name; + rte_vfio_clear_group; + rte_reciprocal_value; + rte_reciprocal_value_u64; + +} DPDK_17.11; + +DPDK_18.05 { + global: + + rte_log_set_level_pattern; + rte_service_attr_get; + rte_service_attr_reset_all; + rte_service_component_register; + rte_service_component_runstate_set; + rte_service_component_unregister; + rte_service_dump; + rte_service_finalize; + rte_service_get_by_id; + rte_service_get_by_name; + rte_service_get_count; + rte_service_get_name; + rte_service_lcore_add; + rte_service_lcore_count; + rte_service_lcore_count_services; + rte_service_lcore_del; + rte_service_lcore_list; + rte_service_lcore_reset_all; + rte_service_lcore_start; + rte_service_lcore_stop; + rte_service_map_lcore_get; + rte_service_map_lcore_set; + rte_service_probe_capability; + rte_service_run_iter_on_app_lcore; + rte_service_runstate_get; + rte_service_runstate_set; + rte_service_set_runstate_mapped_check; + rte_service_set_stats_enable; + rte_service_start_with_defaults; + +} DPDK_18.02; + +DPDK_18.08 { + global: + + rte_eal_mbuf_user_pool_ops; + rte_uuid_compare; + rte_uuid_is_null; + rte_uuid_parse; + rte_uuid_unparse; + rte_vfio_container_create; + rte_vfio_container_destroy; + rte_vfio_container_dma_map; + rte_vfio_container_dma_unmap; + rte_vfio_container_group_bind; + rte_vfio_container_group_unbind; + rte_vfio_dma_map; + rte_vfio_dma_unmap; + rte_vfio_get_container_fd; + rte_vfio_get_group_fd; + rte_vfio_get_group_num; + +} DPDK_18.05; + +EXPERIMENTAL { + global: + + rte_class_find; + rte_class_find_by_name; + rte_class_register; + rte_class_unregister; + rte_ctrl_thread_create; + rte_dev_event_callback_register; + rte_dev_event_callback_unregister; + rte_dev_event_monitor_start; + rte_dev_event_monitor_stop; + rte_dev_iterator_init; + rte_dev_iterator_next; + rte_devargs_add; + rte_devargs_dump; + rte_devargs_insert; + rte_devargs_next; + rte_devargs_parse; + rte_devargs_parsef; + rte_devargs_remove; + rte_devargs_type_count; + rte_eal_cleanup; + rte_eal_hotplug_add; + rte_eal_hotplug_remove; + rte_fbarray_attach; + rte_fbarray_destroy; + rte_fbarray_detach; + rte_fbarray_dump_metadata; + rte_fbarray_find_idx; + rte_fbarray_find_next_free; + rte_fbarray_find_next_used; + rte_fbarray_find_next_n_free; + rte_fbarray_find_next_n_used; + rte_fbarray_find_prev_free; + rte_fbarray_find_prev_used; + rte_fbarray_find_prev_n_free; + rte_fbarray_find_prev_n_used; + rte_fbarray_find_contig_free; + rte_fbarray_find_contig_used; + rte_fbarray_find_rev_contig_free; + rte_fbarray_find_rev_contig_used; + rte_fbarray_get; + rte_fbarray_init; + rte_fbarray_is_used; + rte_fbarray_set_free; + rte_fbarray_set_used; + rte_log_register_type_and_pick_level; + rte_malloc_dump_heaps; + 
rte_mem_alloc_validator_register; + rte_mem_alloc_validator_unregister; + rte_mem_event_callback_register; + rte_mem_event_callback_unregister; + rte_mem_iova2virt; + rte_mem_virt2memseg; + rte_mem_virt2memseg_list; + rte_memseg_contig_walk; + rte_memseg_contig_walk_thread_unsafe; + rte_memseg_list_walk; + rte_memseg_list_walk_thread_unsafe; + rte_memseg_walk; + rte_memseg_walk_thread_unsafe; + rte_mp_action_register; + rte_mp_action_unregister; + rte_mp_reply; + rte_mp_request_sync; + rte_mp_request_async; + rte_mp_sendmsg; + rte_service_lcore_attr_get; + rte_service_lcore_attr_reset_all; + rte_service_may_be_active; + rte_socket_count; + rte_socket_id_by_idx; +}; diff --git a/src/spdk/dpdk/lib/librte_efd/Makefile b/src/spdk/dpdk/lib/librte_efd/Makefile new file mode 100644 index 00000000..5f5fcfce --- /dev/null +++ b/src/spdk/dpdk/lib/librte_efd/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2016-2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_efd.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal -lrte_ring -lrte_hash + +EXPORT_MAP := rte_efd_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_EFD) := rte_efd.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_EFD)-include := rte_efd.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_efd/meson.build b/src/spdk/dpdk/lib/librte_efd/meson.build new file mode 100644 index 00000000..07fb1c29 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_efd/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_efd.c') +headers = files('rte_efd.h') +deps += ['ring', 'hash'] diff --git a/src/spdk/dpdk/lib/librte_efd/rte_efd.c b/src/spdk/dpdk/lib/librte_efd/rte_efd.c new file mode 100644 index 00000000..a780e2fe --- /dev/null +++ b/src/spdk/dpdk/lib/librte_efd/rte_efd.c @@ -0,0 +1,1335 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2017 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_efd.h" +#if defined(RTE_ARCH_X86) +#include "rte_efd_x86.h" +#elif defined(RTE_ARCH_ARM64) +#include "rte_efd_arm64.h" +#endif + +#define EFD_KEY(key_idx, table) (table->keys + ((key_idx) * table->key_len)) +/** Hash function used to determine chunk_id and bin_id for a group */ +#define EFD_HASH(key, table) \ + (uint32_t)(rte_jhash(key, table->key_len, 0xbc9f1d34)) +/** Hash function used as constant component of perfect hash search */ +#define EFD_HASHFUNCA(key, table) \ + (uint32_t)(rte_hash_crc(key, table->key_len, 0xbc9f1d35)) +/** Hash function used as multiplicative component of perfect hash search */ +#define EFD_HASHFUNCB(key, table) \ + (uint32_t)(rte_hash_crc(key, table->key_len, 0xbc9f1d36)) + +/************************************************************************* + * Fixed constants + *************************************************************************/ + +/* These parameters are fixed by the efd_bin_to_group balancing table */ +#define EFD_CHUNK_NUM_GROUPS (64) +#define EFD_CHUNK_NUM_BINS (256) +#define EFD_CHUNK_NUM_BIN_TO_GROUP_SETS \ + (EFD_CHUNK_NUM_BINS / EFD_CHUNK_NUM_GROUPS) + +/* + * Target number of rules that each chunk is created to handle. 
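/*
 * Editorial sketch, not part of the upstream patch: the three per-key hash
 * values produced by the EFD_HASH / EFD_HASHFUNCA / EFD_HASHFUNCB macros
 * above, written out as a plain function. rte_jhash() and rte_hash_crc() come
 * from rte_jhash.h and rte_hash_crc.h; key and key_len stand in for the
 * table's stored key length.
 */
#include <stdint.h>
#include <rte_jhash.h>
#include <rte_hash_crc.h>

static void
example_efd_hashes(const void *key, uint32_t key_len,
		uint32_t *chunk_bin_hash, uint32_t *hash_a, uint32_t *hash_b)
{
	/* selects the chunk and bin for the key's group */
	*chunk_bin_hash = rte_jhash(key, key_len, 0xbc9f1d34);
	/* constant component of the perfect-hash search */
	*hash_a = rte_hash_crc(key, key_len, 0xbc9f1d35);
	/* multiplicative component of the perfect-hash search */
	*hash_b = rte_hash_crc(key, key_len, 0xbc9f1d36);
}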
+ * Used when initially allocating the table + */ +#define EFD_TARGET_CHUNK_NUM_RULES \ + (EFD_CHUNK_NUM_GROUPS * EFD_TARGET_GROUP_NUM_RULES) +/* + * Max number of rules that each chunk is created to handle. + * Used when initially allocating the table + */ +#define EFD_TARGET_CHUNK_MAX_NUM_RULES \ + (EFD_CHUNK_NUM_GROUPS * EFD_MAX_GROUP_NUM_RULES) + +/** This is fixed based on the bin_to_group permutation array */ +#define EFD_MAX_GROUP_NUM_BINS (16) + +/** + * The end of the chunks array needs some extra padding to ensure + * that vectorization over-reads on the last online chunk stay within +allocated memory + */ +#define EFD_NUM_CHUNK_PADDING_BYTES (256) + +/* All different internal lookup functions */ +enum efd_lookup_internal_function { + EFD_LOOKUP_SCALAR = 0, + EFD_LOOKUP_AVX2, + EFD_LOOKUP_NEON, + EFD_LOOKUP_NUM +}; + +TAILQ_HEAD(rte_efd_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_efd_tailq = { + .name = "RTE_EFD", +}; +EAL_REGISTER_TAILQ(rte_efd_tailq); + +/** Internal permutation array used to shuffle bins into pseudorandom groups */ +const uint32_t efd_bin_to_group[EFD_CHUNK_NUM_BIN_TO_GROUP_SETS][EFD_CHUNK_NUM_BINS] = { + { + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, + 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, + 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, + 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, + 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, + 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43, + 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, + 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, + 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, + 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, + 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63 + }, + { + 34, 33, 48, 59, 0, 21, 36, 18, 9, 49, 54, 38, 51, 23, 31, 5, + 44, 23, 37, 52, 11, 4, 58, 20, 38, 40, 38, 22, 26, 28, 42, 6, + 46, 16, 31, 28, 46, 14, 60, 0, 35, 53, 16, 58, 16, 29, 39, 7, + 1, 54, 15, 11, 48, 3, 62, 9, 58, 5, 30, 43, 17, 7, 36, 34, + 6, 36, 2, 14, 10, 1, 47, 47, 20, 45, 62, 56, 34, 25, 39, 18, + 51, 41, 61, 25, 56, 40, 41, 37, 52, 35, 30, 57, 11, 42, 37, 27, + 54, 19, 26, 13, 48, 31, 46, 15, 12, 10, 16, 20, 43, 17, 12, 55, + 45, 18, 8, 41, 7, 31, 42, 63, 12, 14, 21, 57, 24, 40, 5, 41, + 13, 44, 23, 59, 25, 57, 52, 50, 62, 1, 2, 49, 32, 57, 26, 43, + 56, 60, 55, 5, 49, 6, 3, 50, 46, 39, 27, 33, 17, 4, 53, 13, + 2, 19, 36, 51, 63, 0, 22, 33, 59, 28, 29, 23, 45, 33, 53, 27, + 22, 21, 40, 56, 4, 18, 44, 47, 28, 17, 4, 50, 21, 62, 8, 39, + 0, 8, 15, 24, 29, 24, 9, 11, 48, 61, 35, 55, 43, 1, 54, 42, + 53, 60, 22, 3, 32, 52, 25, 8, 15, 60, 7, 55, 27, 63, 19, 10, + 63, 24, 61, 19, 12, 38, 6, 29, 13, 37, 10, 3, 45, 32, 32, 30, + 49, 61, 44, 14, 20, 58, 35, 30, 2, 26, 34, 51, 9, 59, 47, 50 + }, + { + 32, 35, 32, 34, 55, 5, 6, 23, 49, 11, 6, 23, 52, 37, 29, 54, + 55, 40, 63, 50, 29, 52, 61, 25, 12, 56, 39, 38, 29, 11, 46, 1, + 40, 11, 19, 56, 7, 28, 51, 16, 15, 48, 21, 51, 60, 31, 14, 22, + 41, 47, 59, 56, 53, 28, 58, 26, 43, 27, 41, 33, 24, 52, 44, 38, + 13, 59, 48, 51, 60, 15, 3, 30, 15, 0, 10, 62, 44, 14, 28, 51, + 38, 2, 41, 
26, 25, 49, 10, 12, 55, 57, 27, 35, 19, 33, 0, 30, + 5, 36, 47, 53, 5, 53, 20, 43, 34, 37, 52, 41, 21, 63, 59, 9, + 24, 1, 45, 24, 39, 44, 45, 16, 9, 17, 7, 50, 57, 22, 18, 28, + 25, 45, 2, 40, 58, 15, 17, 3, 1, 27, 61, 39, 19, 0, 19, 21, + 57, 62, 54, 60, 54, 40, 48, 33, 36, 37, 4, 42, 1, 43, 58, 8, + 13, 42, 10, 56, 35, 22, 48, 61, 63, 10, 49, 9, 24, 9, 25, 57, + 33, 18, 13, 31, 42, 36, 36, 55, 30, 37, 53, 34, 59, 4, 4, 23, + 8, 16, 58, 14, 30, 11, 12, 63, 49, 62, 2, 39, 47, 22, 2, 60, + 18, 8, 46, 31, 6, 20, 32, 29, 46, 42, 20, 31, 32, 61, 34, 4, + 47, 26, 20, 43, 26, 21, 7, 3, 16, 35, 18, 44, 27, 62, 13, 23, + 6, 50, 12, 8, 45, 17, 3, 46, 50, 7, 14, 5, 17, 54, 38, 0 + }, + { + 29, 56, 5, 7, 54, 48, 23, 37, 35, 44, 52, 40, 33, 49, 60, 0, + 59, 51, 28, 12, 41, 26, 2, 23, 34, 5, 59, 40, 3, 19, 6, 26, + 35, 53, 45, 49, 29, 57, 28, 62, 58, 59, 19, 53, 59, 62, 6, 54, + 13, 15, 48, 50, 45, 21, 41, 12, 34, 40, 24, 56, 19, 21, 35, 18, + 55, 45, 9, 61, 47, 61, 19, 15, 16, 39, 17, 31, 3, 51, 21, 50, + 17, 25, 25, 11, 44, 16, 18, 28, 14, 2, 37, 61, 58, 27, 62, 4, + 14, 17, 1, 9, 46, 28, 37, 0, 53, 43, 57, 7, 57, 46, 21, 41, + 39, 14, 52, 60, 44, 53, 49, 60, 49, 63, 13, 11, 29, 1, 55, 47, + 55, 12, 60, 43, 54, 37, 13, 6, 42, 10, 36, 13, 9, 8, 34, 51, + 31, 32, 12, 7, 57, 2, 26, 14, 3, 30, 63, 3, 32, 1, 5, 11, + 27, 24, 26, 44, 31, 23, 56, 38, 62, 0, 40, 30, 6, 23, 38, 2, + 47, 5, 15, 27, 16, 10, 31, 25, 22, 63, 30, 25, 20, 33, 32, 50, + 29, 43, 55, 10, 50, 45, 56, 20, 4, 7, 27, 46, 11, 16, 22, 52, + 35, 20, 41, 54, 46, 33, 42, 18, 63, 8, 22, 58, 36, 4, 51, 42, + 38, 32, 38, 22, 17, 0, 47, 8, 48, 8, 48, 1, 61, 36, 33, 20, + 24, 39, 39, 18, 30, 36, 9, 43, 42, 24, 10, 58, 4, 15, 34, 52 + }, +}; + +/************************************************************************* + * Offline region structures + *************************************************************************/ + +/** Online group containing number of rules, values, keys and their bins + * for EFD_MAX_GROUP_NUM_RULES rules. + */ +struct efd_offline_group_rules { + uint32_t num_rules; + /**< Sum of the number of rules in all bins assigned to this group. */ + + uint32_t key_idx[EFD_MAX_GROUP_NUM_RULES]; + /**< Array with all keys of the group. */ + efd_value_t value[EFD_MAX_GROUP_NUM_RULES]; + /**< Array with all values of the keys of the group. */ + + uint8_t bin_id[EFD_MAX_GROUP_NUM_RULES]; + /**< Stores the bin for each correspending key to + * avoid having to recompute it + */ +}; + +/** Offline chunk record, containing EFD_TARGET_CHUNK_NUM_RULES rules. + * Those rules are split into EFD_CHUNK_NUM_GROUPS groups per chunk. + */ +struct efd_offline_chunk_rules { + uint16_t num_rules; + /**< Number of rules in the entire chunk; + * used to detect unbalanced groups + */ + + struct efd_offline_group_rules group_rules[EFD_CHUNK_NUM_GROUPS]; + /**< Array of all groups in the chunk. */ +}; + +/************************************************************************* + * Online region structures + *************************************************************************/ + +/** Online group containing values for EFD_MAX_GROUP_NUM_RULES rules. */ +struct efd_online_group_entry { + efd_hashfunc_t hash_idx[RTE_EFD_VALUE_NUM_BITS]; + efd_lookuptbl_t lookup_table[RTE_EFD_VALUE_NUM_BITS]; +} __attribute__((__packed__)); + +/** + * A single chunk record, containing EFD_TARGET_CHUNK_NUM_RULES rules. + * Those rules are split into EFD_CHUNK_NUM_GROUPS groups per chunk. 
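 *
 * Rough sizing, assuming the default RTE_EFD_VALUE_NUM_BITS of 8 and the
 * 16-bit efd_hashfunc_t/efd_lookuptbl_t types from rte_efd.h: each packed
 * efd_online_group_entry above is 8 * (2 + 2) = 32 bytes, so one online
 * chunk is 64 bytes of bin_choice_list plus 64 * 32 = 2048 bytes of groups,
 * i.e. 2112 bytes. A table created for one million rules would need 1024
 * such chunks, roughly 2 MiB of online memory per enabled socket.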
+ */ +struct efd_online_chunk { + uint8_t bin_choice_list[(EFD_CHUNK_NUM_BINS * 2 + 7) / 8]; + /**< This is a packed indirection index into the 'groups' array. + * Each byte contains four two-bit values which index into + * the efd_bin_to_group array. + * The efd_bin_to_group array returns the index into the groups array + */ + + struct efd_online_group_entry groups[EFD_CHUNK_NUM_GROUPS]; + /**< Array of all the groups in the chunk. */ +} __attribute__((__packed__)); + +/** + * EFD table structure + */ +struct rte_efd_table { + char name[RTE_EFD_NAMESIZE]; /**< Name of the efd table. */ + + uint32_t key_len; /**< Length of the key stored offline */ + + uint32_t max_num_rules; + /**< Static maximum number of entries the table was constructed to hold. */ + + uint32_t num_rules; + /**< Number of entries currently in the table . */ + + uint32_t num_chunks; + /**< Number of chunks in the table needed to support num_rules. */ + + uint32_t num_chunks_shift; + /**< Bits to shift to get chunk id, instead of dividing by num_chunk. */ + + enum efd_lookup_internal_function lookup_fn; + /**< Indicates which lookup function to use. */ + + struct efd_online_chunk *chunks[RTE_MAX_NUMA_NODES]; + /**< Dynamic array of size num_chunks of chunk records. */ + + struct efd_offline_chunk_rules *offline_chunks; + /**< Dynamic array of size num_chunks of key-value pairs. */ + + struct rte_ring *free_slots; + /**< Ring that stores all indexes of the free slots in the key table */ + + uint8_t *keys; /**< Dynamic array of size max_num_rules of keys */ +}; + +/** + * Computes the chunk ID for a given key hash + * + * @param table + * EFD table to reference + * @param hashed_key + * 32-bit key hash returned by EFD_HASH + * + * @return + * chunk ID containing this key hash + */ +static inline uint32_t +efd_get_chunk_id(const struct rte_efd_table * const table, + const uint32_t hashed_key) +{ + return hashed_key & (table->num_chunks - 1); +} + +/** + * Computes the bin ID for a given key hash + * + * @param table + * EFD table to reference + * @param hashed_key + * 32-bit key hash returned by EFD_HASH + * + * @return bin ID containing this key hash + */ +static inline uint32_t +efd_get_bin_id(const struct rte_efd_table * const table, + const uint32_t hashed_key) +{ + return (hashed_key >> table->num_chunks_shift) & (EFD_CHUNK_NUM_BINS - 1); +} + +/** + * Looks up the current permutation choice for a particular bin in the online table + * + * @param table + * EFD table to reference + * @param socket_id + * Socket ID to use to look up existing values (ideally caller's socket id) + * @param chunk_id + * Chunk ID of bin to look up + * @param bin_id + * Bin ID to look up + * + * @return + * Currently active permutation choice in the online table + */ +static inline uint8_t +efd_get_choice(const struct rte_efd_table * const table, + const unsigned int socket_id, const uint32_t chunk_id, + const uint32_t bin_id) +{ + struct efd_online_chunk *chunk = &table->chunks[socket_id][chunk_id]; + + /* + * Grab the chunk (byte) that contains the choices + * for four neighboring bins. 
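 *
 * For illustration only (made-up values): with num_chunks = 4, so
 * num_chunks_shift = 2, a key hash of 0x00000A37 gives
 *   chunk_id = 0xA37 & (4 - 1)          = 3
 *   bin_id   = (0xA37 >> 2) & (256 - 1) = 141
 * and the two-bit choice for bin 141 sits in bin_choice_list[141 / 4]
 * at bit offset (141 & 0x3) * 2 = 2.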
+ */ + uint8_t choice_chunk = + chunk->bin_choice_list[bin_id / EFD_CHUNK_NUM_BIN_TO_GROUP_SETS]; + + /* + * Compute the offset into the chunk that contains + * the group_id lookup position + */ + int offset = (bin_id & 0x3) * 2; + + /* Extract from the byte just the desired lookup position */ + return (uint8_t) ((choice_chunk >> offset) & 0x3); +} + +/** + * Compute the chunk_id and bin_id for a given key + * + * @param table + * EFD table to reference + * @param key + * Key to hash and find location of + * @param chunk_id + * Computed chunk ID + * @param bin_id + * Computed bin ID + * + */ +static inline void +efd_compute_ids(const struct rte_efd_table * const table, + const void *key, uint32_t * const chunk_id, uint32_t * const bin_id) +{ + /* Compute the position of the entry in the hash table */ + uint32_t h = EFD_HASH(key, table); + + /* Compute the chunk_id where that entry can be found */ + *chunk_id = efd_get_chunk_id(table, h); + + /* + * Compute the bin within that chunk where the entry + * can be found (0 - 255) + */ + *bin_id = efd_get_bin_id(table, h); +} + +/** + * Search for a hash function for a group that satisfies all group results + */ +static inline int +efd_search_hash(struct rte_efd_table * const table, + const struct efd_offline_group_rules * const off_group, + struct efd_online_group_entry * const on_group) +{ + efd_hashfunc_t hash_idx; + efd_hashfunc_t start_hash_idx[RTE_EFD_VALUE_NUM_BITS]; + efd_lookuptbl_t start_lookup_table[RTE_EFD_VALUE_NUM_BITS]; + + uint32_t i, j, rule_id; + uint32_t hash_val_a[EFD_MAX_GROUP_NUM_RULES]; + uint32_t hash_val_b[EFD_MAX_GROUP_NUM_RULES]; + uint32_t hash_val[EFD_MAX_GROUP_NUM_RULES]; + + + rte_prefetch0(off_group->value); + + /* + * Prepopulate the hash_val tables by running the two hash functions + * for each provided rule + */ + for (i = 0; i < off_group->num_rules; i++) { + void *key_stored = EFD_KEY(off_group->key_idx[i], table); + hash_val_b[i] = EFD_HASHFUNCB(key_stored, table); + hash_val_a[i] = EFD_HASHFUNCA(key_stored, table); + } + + for (i = 0; i < RTE_EFD_VALUE_NUM_BITS; i++) { + hash_idx = on_group->hash_idx[i]; + start_hash_idx[i] = hash_idx; + start_lookup_table[i] = on_group->lookup_table[i]; + + do { + efd_lookuptbl_t lookup_table = 0; + efd_lookuptbl_t lookup_table_complement = 0; + + for (rule_id = 0; rule_id < off_group->num_rules; rule_id++) + hash_val[rule_id] = hash_val_a[rule_id] + (hash_idx * + hash_val_b[rule_id]); + + /* + * The goal here is to find a hash function for this + * particular bit entry that meets the following criteria: + * The most significant bits of the hash result define a + * shift into the lookup table where the bit will be stored + */ + + /* Iterate over each provided rule */ + for (rule_id = 0; rule_id < off_group->num_rules; + rule_id++) { + /* + * Use the few most significant bits (number based on + * EFD_LOOKUPTBL_SIZE) to see what position the + * expected bit should be set in the lookup_table + */ + uint32_t bucket_idx = hash_val[rule_id] >> + EFD_LOOKUPTBL_SHIFT; + + /* + * Get the current bit of interest. + * This only find an appropriate hash function + * for one bit at a time of the rule + */ + efd_lookuptbl_t expected = + (off_group->value[rule_id] >> i) & 0x1; + + /* + * Add the expected bit (if set) to a map + * (lookup_table). 
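 * For illustration only (made-up numbers): with EFD_LOOKUPTBL_SHIFT = 28,
 * a hash_val of 0xC0001234 gives bucket_idx = 0xC0001234 >> 28 = 12, so
 * bit 12 of the 16-bit lookup_table records this rule's value bit, and
 * efd_lookup_internal_scalar() later recovers it with the same shift.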
Also set its complement + * in lookup_table_complement + */ + lookup_table |= expected << bucket_idx; + lookup_table_complement |= (1 - expected) + << bucket_idx; + + /* + * If ever the hash function of two different + * elements result in different values at the + * same location in the lookup_table, + * the current hash_idx is not valid. + */ + if (lookup_table & lookup_table_complement) + break; + } + + /* + * Check if the previous loop completed without + * breaking early + */ + if (rule_id == off_group->num_rules) { + /* + * Current hash function worked, store it + * for the current group + */ + on_group->hash_idx[i] = hash_idx; + on_group->lookup_table[i] = lookup_table; + + /* + * Make sure that the hash function has changed + * from the starting value + */ + hash_idx = start_hash_idx[i] + 1; + break; + } + hash_idx++; + + } while (hash_idx != start_hash_idx[i]); + + /* Failed to find perfect hash for this group */ + if (hash_idx == start_hash_idx[i]) { + /* + * Restore previous hash_idx and lookup_table + * for all value bits + */ + for (j = 0; j < i; j++) { + on_group->hash_idx[j] = start_hash_idx[j]; + on_group->lookup_table[j] = start_lookup_table[j]; + } + return 1; + } + } + + return 0; +} + +struct rte_efd_table * +rte_efd_create(const char *name, uint32_t max_num_rules, uint32_t key_len, + uint8_t online_cpu_socket_bitmask, uint8_t offline_cpu_socket) +{ + struct rte_efd_table *table = NULL; + uint8_t *key_array = NULL; + uint32_t num_chunks, num_chunks_shift; + uint8_t socket_id; + struct rte_efd_list *efd_list = NULL; + struct rte_tailq_entry *te; + uint64_t offline_table_size; + char ring_name[RTE_RING_NAMESIZE]; + struct rte_ring *r = NULL; + unsigned int i; + + efd_list = RTE_TAILQ_CAST(rte_efd_tailq.head, rte_efd_list); + + if (online_cpu_socket_bitmask == 0) { + RTE_LOG(ERR, EFD, "At least one CPU socket must be enabled " + "in the bitmask\n"); + return NULL; + } + + if (max_num_rules == 0) { + RTE_LOG(ERR, EFD, "Max num rules must be higher than 0\n"); + return NULL; + } + + /* + * Compute the minimum number of chunks (smallest power of 2) + * that can hold all of the rules + */ + if (max_num_rules % EFD_TARGET_CHUNK_NUM_RULES == 0) + num_chunks = rte_align32pow2(max_num_rules / + EFD_TARGET_CHUNK_NUM_RULES); + else + num_chunks = rte_align32pow2((max_num_rules / + EFD_TARGET_CHUNK_NUM_RULES) + 1); + + num_chunks_shift = rte_bsf32(num_chunks); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* + * Guarantee there's no existing: this is normally already checked + * by ring creation above + */ + TAILQ_FOREACH(te, efd_list, next) + { + table = (struct rte_efd_table *) te->data; + if (strncmp(name, table->name, RTE_EFD_NAMESIZE) == 0) + break; + } + + table = NULL; + if (te != NULL) { + rte_errno = EEXIST; + te = NULL; + goto error_unlock_exit; + } + + te = rte_zmalloc("EFD_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, EFD, "tailq entry allocation failed\n"); + goto error_unlock_exit; + } + + /* Create a new EFD table management structure */ + table = rte_zmalloc_socket(NULL, + sizeof(struct rte_efd_table), + RTE_CACHE_LINE_SIZE, + offline_cpu_socket); + if (table == NULL) { + RTE_LOG(ERR, EFD, "Allocating EFD table management structure" + " on socket %u failed\n", + offline_cpu_socket); + goto error_unlock_exit; + } + + + RTE_LOG(DEBUG, EFD, "Allocated EFD table management structure " + "on socket %u\n", offline_cpu_socket); + + table->max_num_rules = num_chunks * EFD_TARGET_CHUNK_MAX_NUM_RULES; + table->num_rules = 0; + table->num_chunks = 
num_chunks; + table->num_chunks_shift = num_chunks_shift; + table->key_len = key_len; + + /* key_array */ + key_array = rte_zmalloc_socket(NULL, + table->max_num_rules * table->key_len, + RTE_CACHE_LINE_SIZE, + offline_cpu_socket); + if (key_array == NULL) { + RTE_LOG(ERR, EFD, "Allocating key array" + " on socket %u failed\n", + offline_cpu_socket); + goto error_unlock_exit; + } + table->keys = key_array; + snprintf(table->name, sizeof(table->name), "%s", name); + + RTE_LOG(DEBUG, EFD, "Creating an EFD table with %u chunks," + " which potentially supports %u entries\n", + num_chunks, table->max_num_rules); + + /* Make sure all the allocatable table pointers are NULL initially */ + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; socket_id++) + table->chunks[socket_id] = NULL; + table->offline_chunks = NULL; + + /* + * Allocate one online table per socket specified + * in the user-supplied bitmask + */ + uint64_t online_table_size = num_chunks * sizeof(struct efd_online_chunk) + + EFD_NUM_CHUNK_PADDING_BYTES; + + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; socket_id++) { + if ((online_cpu_socket_bitmask >> socket_id) & 0x01) { + /* + * Allocate all of the EFD table chunks (the online portion) + * as a continuous block + */ + table->chunks[socket_id] = + rte_zmalloc_socket( + NULL, + online_table_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (table->chunks[socket_id] == NULL) { + RTE_LOG(ERR, EFD, + "Allocating EFD online table on " + "socket %u failed\n", + socket_id); + goto error_unlock_exit; + } + RTE_LOG(DEBUG, EFD, + "Allocated EFD online table of size " + "%"PRIu64" bytes (%.2f MB) on socket %u\n", + online_table_size, + (float) online_table_size / + (1024.0F * 1024.0F), + socket_id); + } + } + +#if defined(RTE_ARCH_X86) + /* + * For less than 4 bits, scalar function performs better + * than vectorised version + */ + if (RTE_EFD_VALUE_NUM_BITS > 3 && rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) + table->lookup_fn = EFD_LOOKUP_AVX2; + else +#endif +#if defined(RTE_ARCH_ARM64) + /* + * For less than or equal to 16 bits, scalar function performs better + * than vectorised version + */ + if (RTE_EFD_VALUE_NUM_BITS > 16 && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) + table->lookup_fn = EFD_LOOKUP_NEON; + else +#endif + table->lookup_fn = EFD_LOOKUP_SCALAR; + + /* + * Allocate the EFD table offline portion (with the actual rules + * mapping keys to values) as a continuous block. + * This could be several gigabytes of memory. + */ + offline_table_size = num_chunks * sizeof(struct efd_offline_chunk_rules); + table->offline_chunks = + rte_zmalloc_socket(NULL, + offline_table_size, + RTE_CACHE_LINE_SIZE, + offline_cpu_socket); + if (table->offline_chunks == NULL) { + RTE_LOG(ERR, EFD, "Allocating EFD offline table on socket %u " + "failed\n", offline_cpu_socket); + goto error_unlock_exit; + } + + RTE_LOG(DEBUG, EFD, + "Allocated EFD offline table of size %"PRIu64" bytes " + " (%.2f MB) on socket %u\n", offline_table_size, + (float) offline_table_size / (1024.0F * 1024.0F), + offline_cpu_socket); + + te->data = (void *) table; + TAILQ_INSERT_TAIL(efd_list, te, next); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + snprintf(ring_name, sizeof(ring_name), "HT_%s", table->name); + /* Create ring (Dummy slot index is not enqueued) */ + r = rte_ring_create(ring_name, rte_align32pow2(table->max_num_rules), + offline_cpu_socket, 0); + if (r == NULL) { + RTE_LOG(ERR, EFD, "memory allocation failed\n"); + goto error_unlock_exit; + } + + /* Populate free slots ring. 
Entry zero is reserved for key misses. */ + for (i = 0; i < table->max_num_rules; i++) + rte_ring_sp_enqueue(r, (void *) ((uintptr_t) i)); + + table->free_slots = r; + return table; + +error_unlock_exit: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + rte_efd_free(table); + + return NULL; +} + +struct rte_efd_table * +rte_efd_find_existing(const char *name) +{ + struct rte_efd_table *table = NULL; + struct rte_tailq_entry *te; + struct rte_efd_list *efd_list; + + efd_list = RTE_TAILQ_CAST(rte_efd_tailq.head, rte_efd_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + + TAILQ_FOREACH(te, efd_list, next) + { + table = (struct rte_efd_table *) te->data; + if (strncmp(name, table->name, RTE_EFD_NAMESIZE) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + return table; +} + +void +rte_efd_free(struct rte_efd_table *table) +{ + uint8_t socket_id; + + if (table == NULL) + return; + + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; socket_id++) + rte_free(table->chunks[socket_id]); + + rte_ring_free(table->free_slots); + rte_free(table->offline_chunks); + rte_free(table->keys); + rte_free(table); +} + +/** + * Applies a previously computed table entry to the specified table for all + * socket-local copies of the online table. + * Intended to apply an update for only a single change + * to a key/value pair at a time + * + * @param table + * EFD table to reference + * @param socket_id + * Socket ID to use to lookup existing values (ideally caller's socket id) + * @param chunk_id + * Chunk index to update + * @param group_id + * Group index to update + * @param bin_id + * Bin within the group that this update affects + * @param new_bin_choice + * Newly chosen permutation which this bin should use - only lower 2 bits + * @param new_group_entry + * Previously computed updated chunk/group entry + */ +static inline void +efd_apply_update(struct rte_efd_table * const table, const unsigned int socket_id, + const uint32_t chunk_id, const uint32_t group_id, + const uint32_t bin_id, const uint8_t new_bin_choice, + const struct efd_online_group_entry * const new_group_entry) +{ + int i; + struct efd_online_chunk *chunk = &table->chunks[socket_id][chunk_id]; + uint8_t bin_index = bin_id / EFD_CHUNK_NUM_BIN_TO_GROUP_SETS; + + /* + * Grab the current byte that contains the choices + * for four neighboring bins + */ + uint8_t choice_chunk = + chunk->bin_choice_list[bin_index]; + + + /* Compute the offset into the chunk that needs to be updated */ + int offset = (bin_id & 0x3) * 2; + + /* Zero the two bits of interest and set them to new_bin_choice */ + choice_chunk = (choice_chunk & (~(0x03 << offset))) + | ((new_bin_choice & 0x03) << offset); + + /* Update the online table with the new data across all sockets */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { + if (table->chunks[i] != NULL) { + memcpy(&(table->chunks[i][chunk_id].groups[group_id]), + new_group_entry, + sizeof(struct efd_online_group_entry)); + table->chunks[i][chunk_id].bin_choice_list[bin_index] = + choice_chunk; + } + } +} + +/* + * Move the bin from prev group to the new group + */ +static inline void +move_groups(uint32_t bin_id, uint8_t bin_size, + struct efd_offline_group_rules *new_group, + struct efd_offline_group_rules * const current_group) +{ + + uint8_t empty_idx = 0; + unsigned int i; + + if (new_group == current_group) + return; + + for (i = 0; i < current_group->num_rules; i++) { + /* + * Move keys that belong to the same bin + * to the new 
group + */ + if (current_group->bin_id[i] == bin_id) { + new_group->key_idx[new_group->num_rules] = + current_group->key_idx[i]; + new_group->value[new_group->num_rules] = + current_group->value[i]; + new_group->bin_id[new_group->num_rules] = + current_group->bin_id[i]; + new_group->num_rules++; + } else { + if (i != empty_idx) { + /* + * Need to move this key towards + * the top of the array + */ + current_group->key_idx[empty_idx] = + current_group->key_idx[i]; + current_group->value[empty_idx] = + current_group->value[i]; + current_group->bin_id[empty_idx] = + current_group->bin_id[i]; + } + empty_idx++; + } + + } + current_group->num_rules -= bin_size; +} + +/* + * Revert group/s to their previous state before + * trying to insert/add a new key + */ +static inline void +revert_groups(struct efd_offline_group_rules *previous_group, + struct efd_offline_group_rules *current_group, uint8_t bin_size) +{ + unsigned int i; + + if (current_group == previous_group) + return; + + /* Move keys back to previous group */ + for (i = current_group->num_rules - bin_size; + i < current_group->num_rules; i++) { + previous_group->key_idx[previous_group->num_rules] = + current_group->key_idx[i]; + previous_group->value[previous_group->num_rules] = + current_group->value[i]; + previous_group->bin_id[previous_group->num_rules] = + current_group->bin_id[i]; + previous_group->num_rules++; + } + + /* + * Decrease number of rules after the move + * in the new group + */ + current_group->num_rules -= bin_size; +} + +/** + * Computes an updated table entry where the supplied key points to a new host. + * If no entry exists, one is inserted. + * + * This function does NOT modify the online table(s) + * This function DOES modify the offline table + * + * @param table + * EFD table to reference + * @param socket_id + * Socket ID to use to lookup existing values (ideally caller's socket id) + * @param key + * Key to insert + * @param value + * Value to associate with key + * @param chunk_id + * Chunk ID of the chunk that was modified + * @param group_id + * Group ID of the group that was modified + * @param bin_id + * Bin ID that was modified + * @param new_bin_choice + * Newly chosen permutation which this bin will use + * @param entry + * Newly computed online entry to apply later with efd_apply_update + * + * @return + * RTE_EFD_UPDATE_WARN_GROUP_FULL + * Operation is insert, and the last available space in the + * key's group was just used. Future inserts may fail as groups fill up. 
+ * This operation was still successful, and entry contains a valid update + * RTE_EFD_UPDATE_FAILED + * Either the EFD failed to find a suitable perfect hash or the group was full + * This is a fatal error, and the table is now in an indeterminate state + * RTE_EFD_UPDATE_NO_CHANGE + * Operation resulted in no change to the table (same value already exists) + * 0 + * Insert or update was successful, and the new efd_online_group_entry + * is stored in *entry + * + * @warning + * Note that entry will be UNCHANGED if the update has no effect, and thus any + * subsequent use of the entry content will likely be invalid + */ +static inline int +efd_compute_update(struct rte_efd_table * const table, + const unsigned int socket_id, const void *key, + const efd_value_t value, uint32_t * const chunk_id, + uint32_t * const group_id, uint32_t * const bin_id, + uint8_t * const new_bin_choice, + struct efd_online_group_entry * const entry) +{ + unsigned int i; + int ret; + uint32_t new_idx; + void *new_k, *slot_id = NULL; + int status = EXIT_SUCCESS; + unsigned int found = 0; + + efd_compute_ids(table, key, chunk_id, bin_id); + + struct efd_offline_chunk_rules * const chunk = + &table->offline_chunks[*chunk_id]; + struct efd_offline_group_rules *new_group; + + uint8_t current_choice = efd_get_choice(table, socket_id, + *chunk_id, *bin_id); + uint32_t current_group_id = efd_bin_to_group[current_choice][*bin_id]; + struct efd_offline_group_rules * const current_group = + &chunk->group_rules[current_group_id]; + uint8_t bin_size = 0; + uint8_t key_changed_index = 0; + efd_value_t key_changed_previous_value = 0; + uint32_t key_idx_previous = 0; + + /* Scan the current group and see if the key is already present */ + for (i = 0; i < current_group->num_rules; i++) { + if (current_group->bin_id[i] == *bin_id) + bin_size++; + else + continue; + + void *key_stored = EFD_KEY(current_group->key_idx[i], table); + if (found == 0 && unlikely(memcmp(key_stored, key, + table->key_len) == 0)) { + /* Key is already present */ + + /* + * If previous value is same as new value, + * no additional work is required + */ + if (current_group->value[i] == value) + return RTE_EFD_UPDATE_NO_CHANGE; + + key_idx_previous = current_group->key_idx[i]; + key_changed_previous_value = current_group->value[i]; + key_changed_index = i; + current_group->value[i] = value; + found = 1; + } + } + + if (found == 0) { + /* Key does not exist. 
Insert the rule into the bin/group */ + if (unlikely(current_group->num_rules >= EFD_MAX_GROUP_NUM_RULES)) { + RTE_LOG(ERR, EFD, + "Fatal: No room remaining for insert into " + "chunk %u group %u bin %u\n", + *chunk_id, + current_group_id, *bin_id); + return RTE_EFD_UPDATE_FAILED; + } + + if (unlikely(current_group->num_rules == + (EFD_MAX_GROUP_NUM_RULES - 1))) { + RTE_LOG(INFO, EFD, "Warn: Insert into last " + "available slot in chunk %u " + "group %u bin %u\n", *chunk_id, + current_group_id, *bin_id); + status = RTE_EFD_UPDATE_WARN_GROUP_FULL; + } + + if (rte_ring_sc_dequeue(table->free_slots, &slot_id) != 0) + return RTE_EFD_UPDATE_FAILED; + + new_k = RTE_PTR_ADD(table->keys, (uintptr_t) slot_id * + table->key_len); + rte_prefetch0(new_k); + new_idx = (uint32_t) ((uintptr_t) slot_id); + + rte_memcpy(EFD_KEY(new_idx, table), key, table->key_len); + current_group->key_idx[current_group->num_rules] = new_idx; + current_group->value[current_group->num_rules] = value; + current_group->bin_id[current_group->num_rules] = *bin_id; + current_group->num_rules++; + table->num_rules++; + bin_size++; + } else { + uint32_t last = current_group->num_rules - 1; + /* Swap the key with the last key inserted*/ + current_group->key_idx[key_changed_index] = + current_group->key_idx[last]; + current_group->value[key_changed_index] = + current_group->value[last]; + current_group->bin_id[key_changed_index] = + current_group->bin_id[last]; + + /* + * Key to be updated will always be available + * at the end of the group + */ + current_group->key_idx[last] = key_idx_previous; + current_group->value[last] = value; + current_group->bin_id[last] = *bin_id; + } + + *new_bin_choice = current_choice; + *group_id = current_group_id; + new_group = current_group; + + /* Group need to be rebalanced when it starts to get loaded */ + if (current_group->num_rules > EFD_MIN_BALANCED_NUM_RULES) { + + /* + * Subtract the number of entries in the bin from + * the original group + */ + current_group->num_rules -= bin_size; + + /* + * Figure out which of the available groups that this bin + * can map to is the smallest (using the current group + * as baseline) + */ + uint8_t smallest_choice = current_choice; + uint8_t smallest_size = current_group->num_rules; + uint32_t smallest_group_id = current_group_id; + unsigned char choice; + + for (choice = 0; choice < EFD_CHUNK_NUM_BIN_TO_GROUP_SETS; + choice++) { + uint32_t test_group_id = + efd_bin_to_group[choice][*bin_id]; + uint32_t num_rules = + chunk->group_rules[test_group_id].num_rules; + if (num_rules < smallest_size) { + smallest_choice = choice; + smallest_size = num_rules; + smallest_group_id = test_group_id; + } + } + + *new_bin_choice = smallest_choice; + *group_id = smallest_group_id; + new_group = &chunk->group_rules[smallest_group_id]; + current_group->num_rules += bin_size; + + } + + uint8_t choice = 0; + for (;;) { + if (current_group != new_group && + new_group->num_rules + bin_size > + EFD_MAX_GROUP_NUM_RULES) { + RTE_LOG(DEBUG, EFD, + "Unable to move_groups to dest group " + "containing %u entries." + "bin_size:%u choice:%02x\n", + new_group->num_rules, bin_size, + choice - 1); + goto next_choice; + } + move_groups(*bin_id, bin_size, new_group, current_group); + /* + * Recompute the hash function for the modified group, + * and return it to the caller + */ + ret = efd_search_hash(table, new_group, entry); + + if (!ret) + return status; + + RTE_LOG(DEBUG, EFD, + "Failed to find perfect hash for group " + "containing %u entries. 
bin_size:%u choice:%02x\n", + new_group->num_rules, bin_size, choice - 1); + /* Restore groups modified to their previous state */ + revert_groups(current_group, new_group, bin_size); + +next_choice: + if (choice == EFD_CHUNK_NUM_BIN_TO_GROUP_SETS) + break; + *new_bin_choice = choice; + *group_id = efd_bin_to_group[choice][*bin_id]; + new_group = &chunk->group_rules[*group_id]; + choice++; + } + + if (!found) { + current_group->num_rules--; + table->num_rules--; + } else + current_group->value[current_group->num_rules - 1] = + key_changed_previous_value; + return RTE_EFD_UPDATE_FAILED; +} + +int +rte_efd_update(struct rte_efd_table * const table, const unsigned int socket_id, + const void *key, const efd_value_t value) +{ + uint32_t chunk_id = 0, group_id = 0, bin_id = 0; + uint8_t new_bin_choice = 0; + struct efd_online_group_entry entry; + + int status = efd_compute_update(table, socket_id, key, value, + &chunk_id, &group_id, &bin_id, + &new_bin_choice, &entry); + + if (status == RTE_EFD_UPDATE_NO_CHANGE) + return EXIT_SUCCESS; + + if (status == RTE_EFD_UPDATE_FAILED) + return status; + + efd_apply_update(table, socket_id, chunk_id, group_id, bin_id, + new_bin_choice, &entry); + return status; +} + +int +rte_efd_delete(struct rte_efd_table * const table, const unsigned int socket_id, + const void *key, efd_value_t * const prev_value) +{ + unsigned int i; + uint32_t chunk_id, bin_id; + uint8_t not_found = 1; + + efd_compute_ids(table, key, &chunk_id, &bin_id); + + struct efd_offline_chunk_rules * const chunk = + &table->offline_chunks[chunk_id]; + + uint8_t current_choice = efd_get_choice(table, socket_id, + chunk_id, bin_id); + uint32_t current_group_id = efd_bin_to_group[current_choice][bin_id]; + struct efd_offline_group_rules * const current_group = + &chunk->group_rules[current_group_id]; + + /* + * Search the current group for the specified key. 
+ * If it exists, remove it and re-pack the other values + */ + for (i = 0; i < current_group->num_rules; i++) { + if (not_found) { + /* Found key that needs to be removed */ + if (memcmp(EFD_KEY(current_group->key_idx[i], table), + key, table->key_len) == 0) { + /* Store previous value if requested by caller */ + if (prev_value != NULL) + *prev_value = current_group->value[i]; + + not_found = 0; + rte_ring_sp_enqueue(table->free_slots, + (void *)((uintptr_t)current_group->key_idx[i])); + } + } else { + /* + * If the desired key has been found, + * need to shift other values up one + */ + + /* Need to shift this entry back up one index */ + current_group->key_idx[i - 1] = current_group->key_idx[i]; + current_group->value[i - 1] = current_group->value[i]; + current_group->bin_id[i - 1] = current_group->bin_id[i]; + } + } + + if (not_found == 0) { + table->num_rules--; + current_group->num_rules--; + } + + return not_found; +} + +static inline efd_value_t +efd_lookup_internal_scalar(const efd_hashfunc_t *group_hash_idx, + const efd_lookuptbl_t *group_lookup_table, + const uint32_t hash_val_a, const uint32_t hash_val_b) +{ + efd_value_t value = 0; + uint32_t i; + + for (i = 0; i < RTE_EFD_VALUE_NUM_BITS; i++) { + value <<= 1; + uint32_t h = hash_val_a + (hash_val_b * + group_hash_idx[RTE_EFD_VALUE_NUM_BITS - i - 1]); + uint16_t bucket_idx = h >> EFD_LOOKUPTBL_SHIFT; + value |= (group_lookup_table[ + RTE_EFD_VALUE_NUM_BITS - i - 1] >> + bucket_idx) & 0x1; + } + + return value; +} + + +static inline efd_value_t +efd_lookup_internal(const struct efd_online_group_entry * const group, + const uint32_t hash_val_a, const uint32_t hash_val_b, + enum efd_lookup_internal_function lookup_fn) +{ + efd_value_t value = 0; + + switch (lookup_fn) { + +#if defined(RTE_ARCH_X86) && defined(CC_SUPPORT_AVX2) + case EFD_LOOKUP_AVX2: + return efd_lookup_internal_avx2(group->hash_idx, + group->lookup_table, + hash_val_a, + hash_val_b); + break; +#endif +#if defined(RTE_ARCH_ARM64) + case EFD_LOOKUP_NEON: + return efd_lookup_internal_neon(group->hash_idx, + group->lookup_table, + hash_val_a, + hash_val_b); + break; +#endif + case EFD_LOOKUP_SCALAR: + /* Fall-through */ + default: + return efd_lookup_internal_scalar(group->hash_idx, + group->lookup_table, + hash_val_a, + hash_val_b); + } + + return value; +} + +efd_value_t +rte_efd_lookup(const struct rte_efd_table * const table, + const unsigned int socket_id, const void *key) +{ + uint32_t chunk_id, group_id, bin_id; + uint8_t bin_choice; + const struct efd_online_group_entry *group; + const struct efd_online_chunk * const chunks = table->chunks[socket_id]; + + /* Determine the chunk and group location for the given key */ + efd_compute_ids(table, key, &chunk_id, &bin_id); + bin_choice = efd_get_choice(table, socket_id, chunk_id, bin_id); + group_id = efd_bin_to_group[bin_choice][bin_id]; + group = &chunks[chunk_id].groups[group_id]; + + return efd_lookup_internal(group, + EFD_HASHFUNCA(key, table), + EFD_HASHFUNCB(key, table), + table->lookup_fn); +} + +void rte_efd_lookup_bulk(const struct rte_efd_table * const table, + const unsigned int socket_id, const int num_keys, + const void **key_list, efd_value_t * const value_list) +{ + int i; + uint32_t chunk_id_list[RTE_EFD_BURST_MAX]; + uint32_t bin_id_list[RTE_EFD_BURST_MAX]; + uint8_t bin_choice_list[RTE_EFD_BURST_MAX]; + uint32_t group_id_list[RTE_EFD_BURST_MAX]; + struct efd_online_group_entry *group; + + struct efd_online_chunk *chunks = table->chunks[socket_id]; + + for (i = 0; i < num_keys; i++) { + 
efd_compute_ids(table, key_list[i], &chunk_id_list[i], + &bin_id_list[i]); + rte_prefetch0(&chunks[chunk_id_list[i]].bin_choice_list); + } + + for (i = 0; i < num_keys; i++) { + bin_choice_list[i] = efd_get_choice(table, socket_id, + chunk_id_list[i], bin_id_list[i]); + group_id_list[i] = + efd_bin_to_group[bin_choice_list[i]][bin_id_list[i]]; + group = &chunks[chunk_id_list[i]].groups[group_id_list[i]]; + rte_prefetch0(group); + } + + for (i = 0; i < num_keys; i++) { + group = &chunks[chunk_id_list[i]].groups[group_id_list[i]]; + value_list[i] = efd_lookup_internal(group, + EFD_HASHFUNCA(key_list[i], table), + EFD_HASHFUNCB(key_list[i], table), + table->lookup_fn); + } +} diff --git a/src/spdk/dpdk/lib/librte_efd/rte_efd.h b/src/spdk/dpdk/lib/librte_efd/rte_efd.h new file mode 100644 index 00000000..2ace008e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_efd/rte_efd.h @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2017 Intel Corporation + */ + +#ifndef _RTE_EFD_H_ +#define _RTE_EFD_H_ + +/** + * @file + * + * RTE EFD Table + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/************************************************************************* + * User selectable constants + *************************************************************************/ + +/* + * If possible, best lookup performance will be achieved by ensuring that + * the entire table fits in the L3 cache. + * + * Some formulas for calculating various sizes are listed below: + * + * # of chunks = + * 2 ^ (ceiling(log2((requested # of rules) / + * (EFD_CHUNK_NUM_GROUPS * EFD_TARGET_GROUP_NUM_RULES)))) + * + * Target # of rules = (# of chunks) * EFD_CHUNK_NUM_GROUPS * + * EFD_TARGET_GROUP_NUM_RULES + * + * Group Size (in bytes) = 4 (per value bit) + * + * Table size (in bytes) = RTE_EFD_VALUE_NUM_BITS * (# of chunks) * + * EFD_CHUNK_NUM_GROUPS * (group size) + */ + +/** + * !!! This parameter should be adjusted for your application !!! + * + * This parameter adjusts the number of bits of value that can be + * stored in the table. + * For example, setting the number of bits to 3 will allow storing 8 values + * in the table (between 0 and 7). + * + * This number directly affects the performance of both lookups and insertion. + * In general, performance decreases as more bits are stored in the table. + * + * This number is directly proportional to the size of the online region + * used for lookups. + * + * Note that due to the way the CPU operates on memory, best lookup performance + * will be achieved when RTE_EFD_VALUE_NUM_BITS is a multiple of 8. + * These values align the hash indexes on 16-byte boundaries. + * The greatest performance drop is moving from 8->9 bits, 16->17 bits, etc. + * + * This value must be between 1 and 32 + */ +#ifndef RTE_EFD_VALUE_NUM_BITS +#define RTE_EFD_VALUE_NUM_BITS (8) +#endif + +/* + * EFD_TARGET_GROUP_NUM_RULES: + * Adjusts how many groups/chunks are allocated at table creation time + * to support the requested number of rules. Higher values pack entries + * more tightly in memory, resulting in a smaller memory footprint + * for the online table. + * This comes at the cost of lower insert/update performance. + * + * EFD_MAX_GROUP_NUM_RULES: + * This adjusts the amount of offline memory allocated to store key/value + * pairs for the table. The recommended numbers are upper-bounds for + * this parameter + * - any higher and it becomes very unlikely that a perfect hash function + * can be found for that group size. 
This value should be at + * least 40% larger than EFD_TARGET_GROUP_NUM_RULES + * + * Recommended values for various lookuptable and hashfunc sizes are: + * + * HASH_FUNC_SIZE = 16, LOOKUPTBL_SIZE = 16: + * EFD_TARGET_GROUP_NUM_RULES = 22 + * EFD_MAX_GROUP_NUM_RULES = 28 + */ +#define EFD_TARGET_GROUP_NUM_RULES (22) +#define EFD_MAX_GROUP_NUM_RULES (28LU) + +#define EFD_MIN_BALANCED_NUM_RULES 5 + +/** + * Maximum number of keys that can be looked up in one call to efd_lookup_bulk + */ +#ifndef RTE_EFD_BURST_MAX +#define RTE_EFD_BURST_MAX (32) +#endif + +/** Maximum number of characters in efd name.*/ +#define RTE_EFD_NAMESIZE 32 + +#if (RTE_EFD_VALUE_NUM_BITS > 0 && RTE_EFD_VALUE_NUM_BITS <= 8) +typedef uint8_t efd_value_t; +#elif (RTE_EFD_VALUE_NUM_BITS > 8 && RTE_EFD_VALUE_NUM_BITS <= 16) +typedef uint16_t efd_value_t; +#elif (RTE_EFD_VALUE_NUM_BITS > 16 && RTE_EFD_VALUE_NUM_BITS <= 32) +typedef uint32_t efd_value_t; +#else +#error("RTE_EFD_VALUE_NUM_BITS must be in the range [1:32]") +#endif + +#define EFD_LOOKUPTBL_SHIFT (32 - 4) +typedef uint16_t efd_lookuptbl_t; +typedef uint16_t efd_hashfunc_t; + +/** + * Creates an EFD table with a single offline region and multiple per-socket + * internally-managed copies of the online table used for lookups + * + * @param name + * EFD table name + * @param max_num_rules + * Minimum number of rules the table should be sized to hold. + * Will be rounded up to the next smallest valid table size + * @param key_len + * Length of the key + * @param online_cpu_socket_bitmask + * Bitmask specifying which sockets should get a copy of the online table. + * LSB = socket 0, etc. + * @param offline_cpu_socket + * Identifies the socket where the offline table will be allocated + * (and most efficiently accessed in the case of updates/insertions) + * + * @return + * EFD table, or NULL if table allocation failed or the bitmask is invalid + */ +struct rte_efd_table * +rte_efd_create(const char *name, uint32_t max_num_rules, uint32_t key_len, + uint8_t online_cpu_socket_bitmask, uint8_t offline_cpu_socket); + +/** + * Releases the resources from an EFD table + * + * @param table + * Table to free + */ +void +rte_efd_free(struct rte_efd_table *table); + +/** + * Find an existing EFD table object and return a pointer to it. + * + * @param name + * Name of the EFD table as passed to rte_efd_create() + * @return + * Pointer to EFD table or NULL if object not found + * with rte_errno set appropriately. Possible rte_errno values include: + * - ENOENT - value not available for return + */ +struct rte_efd_table* +rte_efd_find_existing(const char *name); + +#define RTE_EFD_UPDATE_WARN_GROUP_FULL (1) +#define RTE_EFD_UPDATE_NO_CHANGE (2) +#define RTE_EFD_UPDATE_FAILED (3) + +/** + * Computes an updated table entry for the supplied key/value pair. + * The update is then immediately applied to the provided table and + * all socket-local copies of the chunks are updated. + * This operation is not multi-thread safe + * and should only be called one from thread. 
+ * + * @param table + * EFD table to reference + * @param socket_id + * Socket ID to use to lookup existing value (ideally caller's socket id) + * @param key + * EFD table key to modify + * @param value + * Value to associate with the key + * + * @return + * RTE_EFD_UPDATE_WARN_GROUP_FULL + * Operation is insert, and the last available space in the + * key's group was just used + * Future inserts may fail as groups fill up + * This operation was still successful, and entry contains a valid update + * RTE_EFD_UPDATE_FAILED + * Either the EFD failed to find a suitable perfect hash or the group was full + * This is a fatal error, and the table is now in an indeterminite state + * RTE_EFD_UPDATE_NO_CHANGE + * Operation resulted in no change to the table (same value already exists) + * 0 - success + */ +int +rte_efd_update(struct rte_efd_table *table, unsigned int socket_id, + const void *key, efd_value_t value); + +/** + * Removes any value currently associated with the specified key from the table + * This operation is not multi-thread safe + * and should only be called from one thread. + * + * @param table + * EFD table to reference + * @param socket_id + * Socket ID to use to lookup existing value (ideally caller's socket id) + * @param key + * EFD table key to delete + * @param prev_value + * If not NULL, will store the previous value here before deleting it + * + * @return + * 0 - successfully found and deleted the key + * nonzero otherwise + */ +int +rte_efd_delete(struct rte_efd_table *table, unsigned int socket_id, + const void *key, efd_value_t *prev_value); + +/** + * Looks up the value associated with a key + * This operation is multi-thread safe. + * + * NOTE: Lookups will *always* succeed - this is a property of + * using a perfect hash table. + * If the specified key was never inserted, a pseudorandom answer will be returned. + * There is no way to know based on the lookup if the key was ever inserted + * originally, so this must be tracked elsewhere. + * + * @param table + * EFD table to reference + * @param socket_id + * Socket ID to use to lookup existing value (ideally caller's socket id) + * @param key + * EFD table key to look up + * + * @return + * Value associated with the key, or random junk if they key was never inserted + */ +efd_value_t +rte_efd_lookup(const struct rte_efd_table *table, unsigned int socket_id, + const void *key); + +/** + * Looks up the value associated with several keys. + * This operation is multi-thread safe. + * + * NOTE: Lookups will *always* succeed - this is a property of + * using a perfect hash table. + * If the specified key was never inserted, a pseudorandom answer will be returned. + * There is no way to know based on the lookup if the key was ever inserted + * originally, so this must be tracked elsewhere. 
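 *
 * A minimal usage sketch of the EFD API declared in this header
 * (illustrative only: EAL initialization and error handling are omitted,
 * and NUMA socket 0 is assumed for both the online and offline regions):
 *
 * @code
 *	uint32_t key = 0x12345678;                  // 4-byte example key
 *	efd_value_t val;
 *
 *	struct rte_efd_table *t = rte_efd_create("demo", 1 << 16,
 *			sizeof(key), 0x1, 0);
 *	if (t != NULL &&
 *	    rte_efd_update(t, 0, &key, 7) != RTE_EFD_UPDATE_FAILED)
 *		val = rte_efd_lookup(t, 0, &key);   // returns 7
 *	rte_efd_free(t);
 * @endcode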
+ * + * @param table + * EFD table to reference + * @param socket_id + * Socket ID to use to lookup existing value (ideally caller's socket id) + * @param num_keys + * Number of keys in the key_list array, must be less than RTE_EFD_BURST_MAX + * @param key_list + * Array of num_keys pointers which point to keys to look up + * @param value_list + * Array of size num_keys where lookup values will be stored + */ +void +rte_efd_lookup_bulk(const struct rte_efd_table *table, unsigned int socket_id, + int num_keys, const void **key_list, + efd_value_t *value_list); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EFD_H_ */ diff --git a/src/spdk/dpdk/lib/librte_efd/rte_efd_arm64.h b/src/spdk/dpdk/lib/librte_efd/rte_efd_arm64.h new file mode 100644 index 00000000..e5678fc9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_efd/rte_efd_arm64.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +/* + * rte_efd_arm64.h + * This file holds all arm64 specific EFD functions + */ + +#ifndef __RTE_EFD_ARM64_H__ +#define __RTE_EFD_ARM64_H__ + +#include + +static inline efd_value_t +efd_lookup_internal_neon(const efd_hashfunc_t *group_hash_idx, + const efd_lookuptbl_t *group_lookup_table, + const uint32_t hash_val_a, const uint32_t hash_val_b) +{ + efd_value_t value = 0; + uint32_t i = 0; + uint32x4_t vhash_val_a = vmovq_n_u32(hash_val_a); + uint32x4_t vhash_val_b = vmovq_n_u32(hash_val_b); + int32x4_t vshift = {0, 1, 2, 3}; + uint32x4_t vmask = vdupq_n_u32(0x1); + int32x4_t vincr = vdupq_n_s32(4); + + for (; i < RTE_EFD_VALUE_NUM_BITS; i += 4) { + uint32x4_t vhash_idx = vshll_n_u16( + vld1_u16((uint16_t const *)&group_hash_idx[i]), 0); + uint32x4_t vlookup_table = vshll_n_u16( + vld1_u16((uint16_t const *)&group_lookup_table[i]), 0); + uint32x4_t vhash = vaddq_u32(vhash_val_a, + vmulq_u32(vhash_idx, vhash_val_b)); + int32x4_t vbucket_idx = vnegq_s32(vreinterpretq_s32_u32( + vshrq_n_u32(vhash, EFD_LOOKUPTBL_SHIFT))); + uint32x4_t vresult = vshlq_u32(vlookup_table, vbucket_idx); + + vresult = vandq_u32(vresult, vmask); + vresult = vshlq_u32(vresult, vshift); + value |= vaddvq_u32(vresult); + vshift = vaddq_s32(vshift, vincr); + } + + return value; +} + +#endif /* __RTE_EFD_ARM64_H__ */ diff --git a/src/spdk/dpdk/lib/librte_efd/rte_efd_version.map b/src/spdk/dpdk/lib/librte_efd/rte_efd_version.map new file mode 100644 index 00000000..ae60a641 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_efd/rte_efd_version.map @@ -0,0 +1,13 @@ +DPDK_17.02 { + global: + + rte_efd_create; + rte_efd_delete; + rte_efd_find_existing; + rte_efd_free; + rte_efd_lookup; + rte_efd_lookup_bulk; + rte_efd_update; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_efd/rte_efd_x86.h b/src/spdk/dpdk/lib/librte_efd/rte_efd_x86.h new file mode 100644 index 00000000..6c207e87 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_efd/rte_efd_x86.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2017 Intel Corporation + */ + +/* rte_efd_x86.h + * This file holds all x86 specific EFD functions + */ +#include + +#if (RTE_EFD_VALUE_NUM_BITS == 8 || RTE_EFD_VALUE_NUM_BITS == 16 || \ + RTE_EFD_VALUE_NUM_BITS == 24 || RTE_EFD_VALUE_NUM_BITS == 32) +#define EFD_LOAD_SI128(val) _mm_load_si128(val) +#else +#define EFD_LOAD_SI128(val) _mm_lddqu_si128(val) +#endif + +static inline efd_value_t +efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx, + const efd_lookuptbl_t *group_lookup_table, + const uint32_t hash_val_a, const uint32_t hash_val_b) +{ +#ifdef 
RTE_MACHINE_CPUFLAG_AVX2 + efd_value_t value = 0; + uint32_t i = 0; + __m256i vhash_val_a = _mm256_set1_epi32(hash_val_a); + __m256i vhash_val_b = _mm256_set1_epi32(hash_val_b); + + for (; i < RTE_EFD_VALUE_NUM_BITS; i += 8) { + __m256i vhash_idx = + _mm256_cvtepu16_epi32(EFD_LOAD_SI128( + (__m128i const *) &group_hash_idx[i])); + __m256i vlookup_table = _mm256_cvtepu16_epi32( + EFD_LOAD_SI128((__m128i const *) + &group_lookup_table[i])); + __m256i vhash = _mm256_add_epi32(vhash_val_a, + _mm256_mullo_epi32(vhash_idx, vhash_val_b)); + __m256i vbucket_idx = _mm256_srli_epi32(vhash, + EFD_LOOKUPTBL_SHIFT); + __m256i vresult = _mm256_srlv_epi32(vlookup_table, + vbucket_idx); + + value |= (_mm256_movemask_ps( + (__m256) _mm256_slli_epi32(vresult, 31)) + & ((1 << (RTE_EFD_VALUE_NUM_BITS - i)) - 1)) << i; + } + + return value; +#else + RTE_SET_USED(group_hash_idx); + RTE_SET_USED(group_lookup_table); + RTE_SET_USED(hash_val_a); + RTE_SET_USED(hash_val_b); + /* Return dummy value, only to avoid compilation breakage */ + return 0; +#endif + +} diff --git a/src/spdk/dpdk/lib/librte_ethdev/Makefile b/src/spdk/dpdk/lib/librte_ethdev/Makefile new file mode 100644 index 00000000..0935a275 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/Makefile @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_ethdev.a + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +LDLIBS += -lrte_net -lrte_eal -lrte_mempool -lrte_ring +LDLIBS += -lrte_mbuf + +EXPORT_MAP := rte_ethdev_version.map + +LIBABIVER := 10 + +SRCS-y += rte_ethdev.c +SRCS-y += rte_flow.c +SRCS-y += rte_tm.c +SRCS-y += rte_mtr.c +SRCS-y += ethdev_profile.c + +# +# Export include files +# +SYMLINK-y-include += rte_ethdev.h +SYMLINK-y-include += rte_ethdev_driver.h +SYMLINK-y-include += rte_ethdev_core.h +SYMLINK-y-include += rte_ethdev_pci.h +SYMLINK-y-include += rte_ethdev_vdev.h +SYMLINK-y-include += rte_eth_ctrl.h +SYMLINK-y-include += rte_dev_info.h +SYMLINK-y-include += rte_flow.h +SYMLINK-y-include += rte_flow_driver.h +SYMLINK-y-include += rte_tm.h +SYMLINK-y-include += rte_tm_driver.h +SYMLINK-y-include += rte_mtr.h +SYMLINK-y-include += rte_mtr_driver.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.c b/src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.c new file mode 100644 index 00000000..0d1dcda3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.c @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include "ethdev_profile.h" + +/** + * This conditional block enables RX queues profiling by tracking wasted + * iterations, i.e. iterations which yielded no RX packets. Profiling is + * performed using the Instrumentation and Tracing Technology (ITT) API, + * employed by the Intel (R) VTune (TM) Amplifier. + */ +#ifdef RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS + +#include + +#define ITT_MAX_NAME_LEN (100) + +/** + * Auxiliary ITT structure belonging to Ethernet device and using to: + * - track RX queue state to determine whether it is wasting loop iterations + * - begin or end ITT task using task domain and task name (handle) + */ +struct itt_profile_rx_data { + /** + * ITT domains for each queue. + */ + __itt_domain *domains[RTE_MAX_QUEUES_PER_PORT]; + /** + * ITT task names for each queue. 
+ */ + __itt_string_handle *handles[RTE_MAX_QUEUES_PER_PORT]; + /** + * Flags indicating the queues state. Possible values: + * 1 - queue is wasting iterations, + * 0 - otherwise. + */ + uint8_t queue_state[RTE_MAX_QUEUES_PER_PORT]; +}; + +/** + * The pool of *itt_profile_rx_data* structures. + */ +struct itt_profile_rx_data itt_rx_data[RTE_MAX_ETHPORTS]; + + +/** + * This callback function manages ITT tasks collection on given port and queue. + * It must be registered with rte_eth_add_rx_callback() to be called from + * rte_eth_rx_burst(). To find more comments see rte_rx_callback_fn function + * type declaration. + */ +static uint16_t +collect_itt_rx_burst_cb(uint16_t port_id, uint16_t queue_id, + __rte_unused struct rte_mbuf *pkts[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, __rte_unused void *user_param) +{ + if (unlikely(nb_pkts == 0)) { + if (!itt_rx_data[port_id].queue_state[queue_id]) { + __itt_task_begin( + itt_rx_data[port_id].domains[queue_id], + __itt_null, __itt_null, + itt_rx_data[port_id].handles[queue_id]); + itt_rx_data[port_id].queue_state[queue_id] = 1; + } + } else { + if (unlikely(itt_rx_data[port_id].queue_state[queue_id])) { + __itt_task_end( + itt_rx_data[port_id].domains[queue_id]); + itt_rx_data[port_id].queue_state[queue_id] = 0; + } + } + return nb_pkts; +} + +/** + * Initialization of itt_profile_rx_data for a given Ethernet device. + * This function must be invoked when ethernet device is being configured. + * Result will be stored in the global array *itt_rx_data*. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param port_name + * The name of the Ethernet device. + * @param rx_queue_num + * The number of RX queues on specified port. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +static inline int +itt_profile_rx_init(uint16_t port_id, char *port_name, uint8_t rx_queue_num) +{ + uint16_t q_id; + + for (q_id = 0; q_id < rx_queue_num; ++q_id) { + char domain_name[ITT_MAX_NAME_LEN]; + + snprintf(domain_name, sizeof(domain_name), + "RXBurst.WastedIterations.Port_%s.Queue_%d", + port_name, q_id); + itt_rx_data[port_id].domains[q_id] + = __itt_domain_create(domain_name); + + char task_name[ITT_MAX_NAME_LEN]; + + snprintf(task_name, sizeof(task_name), + "port id: %d; queue id: %d", + port_id, q_id); + itt_rx_data[port_id].handles[q_id] + = __itt_string_handle_create(task_name); + + itt_rx_data[port_id].queue_state[q_id] = 0; + + if (!rte_eth_add_rx_callback( + port_id, q_id, collect_itt_rx_burst_cb, NULL)) { + return -rte_errno; + } + } + + return 0; +} +#endif /* RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS */ + +int +__rte_eth_profile_rx_init(__rte_unused uint16_t port_id, + __rte_unused struct rte_eth_dev *dev) +{ +#ifdef RTE_ETHDEV_PROFILE_ITT_WASTED_RX_ITERATIONS + return itt_profile_rx_init( + port_id, dev->data->name, dev->data->nb_rx_queues); +#endif + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.h b/src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.h new file mode 100644 index 00000000..e5ea3682 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/ethdev_profile.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _RTE_ETHDEV_PROFILE_H_ +#define _RTE_ETHDEV_PROFILE_H_ + +#include "rte_ethdev.h" + +/** + * Initialization of profiling RX queues for the Ethernet device. + * Implementation of this function depends on chosen profiling method, + * defined in configs. 
+ * + * @param port_id + * The port identifier of the Ethernet device. + * @param dev + * Pointer to struct rte_eth_dev corresponding to given port_id. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +__rte_eth_profile_rx_init(uint16_t port_id, struct rte_eth_dev *dev); + +#endif diff --git a/src/spdk/dpdk/lib/librte_ethdev/meson.build b/src/spdk/dpdk/lib/librte_ethdev/meson.build new file mode 100644 index 00000000..596cd0f3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/meson.build @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +name = 'ethdev' +version = 10 +allow_experimental_apis = true +sources = files('ethdev_profile.c', + 'rte_ethdev.c', + 'rte_flow.c', + 'rte_mtr.c', + 'rte_tm.c') + +headers = files('rte_ethdev.h', + 'rte_ethdev_driver.h', + 'rte_ethdev_core.h', + 'rte_ethdev_pci.h', + 'rte_ethdev_vdev.h', + 'rte_eth_ctrl.h', + 'rte_dev_info.h', + 'rte_flow.h', + 'rte_flow_driver.h', + 'rte_mtr.h', + 'rte_mtr_driver.h', + 'rte_tm.h', + 'rte_tm_driver.h') + +deps += ['net', 'kvargs'] diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_dev_info.h b/src/spdk/dpdk/lib/librte_ethdev/rte_dev_info.h new file mode 100644 index 00000000..fea5da88 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_dev_info.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Intel Corporation + */ + +#ifndef _RTE_DEV_INFO_H_ +#define _RTE_DEV_INFO_H_ + +#include + +/* + * Placeholder for accessing device registers + */ +struct rte_dev_reg_info { + void *data; /**< Buffer for return registers */ + uint32_t offset; /**< Start register table location for access */ + uint32_t length; /**< Number of registers to fetch */ + uint32_t width; /**< Size of device register */ + uint32_t version; /**< Device version */ +}; + +/* + * Placeholder for accessing device eeprom + */ +struct rte_dev_eeprom_info { + void *data; /**< Buffer for return eeprom */ + uint32_t offset; /**< Start eeprom address for access*/ + uint32_t length; /**< Length of eeprom region to access */ + uint32_t magic; /**< Device-specific key, such as device-id */ +}; + +/** + * Placeholder for accessing plugin module eeprom + */ +struct rte_eth_dev_module_info { + uint32_t type; /**< Type of plugin module eeprom */ + uint32_t eeprom_len; /**< Length of plugin module eeprom */ +}; + +/* EEPROM Standards for plug in modules */ +#define RTE_ETH_MODULE_SFF_8079 0x1 +#define RTE_ETH_MODULE_SFF_8079_LEN 256 +#define RTE_ETH_MODULE_SFF_8472 0x2 +#define RTE_ETH_MODULE_SFF_8472_LEN 512 +#define RTE_ETH_MODULE_SFF_8636 0x3 +#define RTE_ETH_MODULE_SFF_8636_LEN 256 +#define RTE_ETH_MODULE_SFF_8436 0x4 +#define RTE_ETH_MODULE_SFF_8436_LEN 256 + +#endif /* _RTE_DEV_INFO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_eth_ctrl.h b/src/spdk/dpdk/lib/librte_ethdev/rte_eth_ctrl.h new file mode 100644 index 00000000..5ea8ae24 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_eth_ctrl.h @@ -0,0 +1,828 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef _RTE_ETH_CTRL_H_ +#define _RTE_ETH_CTRL_H_ + +#include +#include +#include "rte_ether.h" + +/** + * @file + * + * Ethernet device features and related data structures used + * by control APIs should be defined in this file. + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * A packet can be identified by hardware as different flow types. Different + * NIC hardwares may support different flow types. 
+ * Basically, the NIC hardware identifies the flow type as deep protocol as + * possible, and exclusively. For example, if a packet is identified as + * 'RTE_ETH_FLOW_NONFRAG_IPV4_TCP', it will not be any of other flow types, + * though it is an actual IPV4 packet. + * Note that the flow types are used to define RSS offload types in + * rte_ethdev.h. + */ +#define RTE_ETH_FLOW_UNKNOWN 0 +#define RTE_ETH_FLOW_RAW 1 +#define RTE_ETH_FLOW_IPV4 2 +#define RTE_ETH_FLOW_FRAG_IPV4 3 +#define RTE_ETH_FLOW_NONFRAG_IPV4_TCP 4 +#define RTE_ETH_FLOW_NONFRAG_IPV4_UDP 5 +#define RTE_ETH_FLOW_NONFRAG_IPV4_SCTP 6 +#define RTE_ETH_FLOW_NONFRAG_IPV4_OTHER 7 +#define RTE_ETH_FLOW_IPV6 8 +#define RTE_ETH_FLOW_FRAG_IPV6 9 +#define RTE_ETH_FLOW_NONFRAG_IPV6_TCP 10 +#define RTE_ETH_FLOW_NONFRAG_IPV6_UDP 11 +#define RTE_ETH_FLOW_NONFRAG_IPV6_SCTP 12 +#define RTE_ETH_FLOW_NONFRAG_IPV6_OTHER 13 +#define RTE_ETH_FLOW_L2_PAYLOAD 14 +#define RTE_ETH_FLOW_IPV6_EX 15 +#define RTE_ETH_FLOW_IPV6_TCP_EX 16 +#define RTE_ETH_FLOW_IPV6_UDP_EX 17 +#define RTE_ETH_FLOW_PORT 18 + /**< Consider device port number as a flow differentiator */ +#define RTE_ETH_FLOW_VXLAN 19 /**< VXLAN protocol based flow */ +#define RTE_ETH_FLOW_GENEVE 20 /**< GENEVE protocol based flow */ +#define RTE_ETH_FLOW_NVGRE 21 /**< NVGRE protocol based flow */ +#define RTE_ETH_FLOW_VXLAN_GPE 22 /**< VXLAN-GPE protocol based flow */ +#define RTE_ETH_FLOW_MAX 23 + +/** + * Feature filter types + */ +enum rte_filter_type { + RTE_ETH_FILTER_NONE = 0, + RTE_ETH_FILTER_MACVLAN, + RTE_ETH_FILTER_ETHERTYPE, + RTE_ETH_FILTER_FLEXIBLE, + RTE_ETH_FILTER_SYN, + RTE_ETH_FILTER_NTUPLE, + RTE_ETH_FILTER_TUNNEL, + RTE_ETH_FILTER_FDIR, + RTE_ETH_FILTER_HASH, + RTE_ETH_FILTER_L2_TUNNEL, + RTE_ETH_FILTER_GENERIC, + RTE_ETH_FILTER_MAX +}; + +/** + * Generic operations on filters + */ +enum rte_filter_op { + /** used to check whether the type filter is supported */ + RTE_ETH_FILTER_NOP = 0, + RTE_ETH_FILTER_ADD, /**< add filter entry */ + RTE_ETH_FILTER_UPDATE, /**< update filter entry */ + RTE_ETH_FILTER_DELETE, /**< delete filter entry */ + RTE_ETH_FILTER_FLUSH, /**< flush all entries */ + RTE_ETH_FILTER_GET, /**< get filter entry */ + RTE_ETH_FILTER_SET, /**< configurations */ + RTE_ETH_FILTER_INFO, /**< retrieve information */ + RTE_ETH_FILTER_STATS, /**< retrieve statistics */ + RTE_ETH_FILTER_OP_MAX +}; + +/** + * MAC filter type + */ +enum rte_mac_filter_type { + RTE_MAC_PERFECT_MATCH = 1, /**< exact match of MAC addr. */ + RTE_MACVLAN_PERFECT_MATCH, /**< exact match of MAC addr and VLAN ID. */ + RTE_MAC_HASH_MATCH, /**< hash match of MAC addr. */ + /** hash match of MAC addr and exact match of VLAN ID. */ + RTE_MACVLAN_HASH_MATCH, +}; + +/** + * MAC filter info + */ +struct rte_eth_mac_filter { + uint8_t is_vf; /**< 1 for VF, 0 for port dev */ + uint16_t dst_id; /**< VF ID, available when is_vf is 1*/ + enum rte_mac_filter_type filter_type; /**< MAC filter type */ + struct ether_addr mac_addr; +}; + +/** + * Define all structures for Ethertype Filter type. + */ + +#define RTE_ETHTYPE_FLAGS_MAC 0x0001 /**< If set, compare mac */ +#define RTE_ETHTYPE_FLAGS_DROP 0x0002 /**< If set, drop packet when match */ + +/** + * A structure used to define the ethertype filter entry + * to support RTE_ETH_FILTER_ETHERTYPE with RTE_ETH_FILTER_ADD, + * RTE_ETH_FILTER_DELETE and RTE_ETH_FILTER_GET operations. + */ +struct rte_eth_ethertype_filter { + struct ether_addr mac_addr; /**< Mac address to match. 
*/ + uint16_t ether_type; /**< Ether type to match */ + uint16_t flags; /**< Flags from RTE_ETHTYPE_FLAGS_* */ + uint16_t queue; /**< Queue assigned to when match*/ +}; + +#define RTE_FLEX_FILTER_MAXLEN 128 /**< bytes to use in flex filter. */ +#define RTE_FLEX_FILTER_MASK_SIZE \ + (RTE_ALIGN(RTE_FLEX_FILTER_MAXLEN, CHAR_BIT) / CHAR_BIT) + /**< mask bytes in flex filter. */ + +/** + * A structure used to define the flex filter entry + * to support RTE_ETH_FILTER_FLEXIBLE with RTE_ETH_FILTER_ADD, + * RTE_ETH_FILTER_DELETE and RTE_ETH_FILTER_GET operations. + */ +struct rte_eth_flex_filter { + uint16_t len; + uint8_t bytes[RTE_FLEX_FILTER_MAXLEN]; /**< flex bytes in big endian.*/ + uint8_t mask[RTE_FLEX_FILTER_MASK_SIZE]; /**< if mask bit is 1b, do + not compare corresponding byte. */ + uint8_t priority; + uint16_t queue; /**< Queue assigned to when match. */ +}; + +/** + * A structure used to define the TCP syn filter entry + * to support RTE_ETH_FILTER_SYN with RTE_ETH_FILTER_ADD, + * RTE_ETH_FILTER_DELETE and RTE_ETH_FILTER_GET operations. + */ +struct rte_eth_syn_filter { + uint8_t hig_pri; /**< 1 - higher priority than other filters, + 0 - lower priority. */ + uint16_t queue; /**< Queue assigned to when match */ +}; + +/** + * Define all structures for ntuple Filter type. + */ + +#define RTE_NTUPLE_FLAGS_DST_IP 0x0001 /**< If set, dst_ip is part of ntuple */ +#define RTE_NTUPLE_FLAGS_SRC_IP 0x0002 /**< If set, src_ip is part of ntuple */ +#define RTE_NTUPLE_FLAGS_DST_PORT 0x0004 /**< If set, dst_port is part of ntuple */ +#define RTE_NTUPLE_FLAGS_SRC_PORT 0x0008 /**< If set, src_port is part of ntuple */ +#define RTE_NTUPLE_FLAGS_PROTO 0x0010 /**< If set, protocol is part of ntuple */ +#define RTE_NTUPLE_FLAGS_TCP_FLAG 0x0020 /**< If set, tcp flag is involved */ + +#define RTE_5TUPLE_FLAGS ( \ + RTE_NTUPLE_FLAGS_DST_IP | \ + RTE_NTUPLE_FLAGS_SRC_IP | \ + RTE_NTUPLE_FLAGS_DST_PORT | \ + RTE_NTUPLE_FLAGS_SRC_PORT | \ + RTE_NTUPLE_FLAGS_PROTO) + +#define RTE_2TUPLE_FLAGS ( \ + RTE_NTUPLE_FLAGS_DST_PORT | \ + RTE_NTUPLE_FLAGS_PROTO) + +#define TCP_URG_FLAG 0x20 +#define TCP_ACK_FLAG 0x10 +#define TCP_PSH_FLAG 0x08 +#define TCP_RST_FLAG 0x04 +#define TCP_SYN_FLAG 0x02 +#define TCP_FIN_FLAG 0x01 +#define TCP_FLAG_ALL 0x3F + +/** + * A structure used to define the ntuple filter entry + * to support RTE_ETH_FILTER_NTUPLE with RTE_ETH_FILTER_ADD, + * RTE_ETH_FILTER_DELETE and RTE_ETH_FILTER_GET operations. + */ +struct rte_eth_ntuple_filter { + uint16_t flags; /**< Flags from RTE_NTUPLE_FLAGS_* */ + uint32_t dst_ip; /**< Destination IP address in big endian. */ + uint32_t dst_ip_mask; /**< Mask of destination IP address. */ + uint32_t src_ip; /**< Source IP address in big endian. */ + uint32_t src_ip_mask; /**< Mask of destination IP address. */ + uint16_t dst_port; /**< Destination port in big endian. */ + uint16_t dst_port_mask; /**< Mask of destination port. */ + uint16_t src_port; /**< Source Port in big endian. */ + uint16_t src_port_mask; /**< Mask of source port. */ + uint8_t proto; /**< L4 protocol. */ + uint8_t proto_mask; /**< Mask of L4 protocol. */ + /** tcp_flags only meaningful when the proto is TCP. + The packet matched above ntuple fields and contain + any set bit in tcp_flags will hit this filter. */ + uint8_t tcp_flags; + uint16_t priority; /**< seven levels (001b-111b), 111b is highest, + used when more than one filter matches. */ + uint16_t queue; /**< Queue assigned to when match*/ +}; + +/** + * Tunneled type. 
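As a hedged usage sketch for the ntuple filter structure defined above (not part of this header): a 5-tuple rule steering matching TCP traffic to a queue is typically installed through the legacy rte_eth_dev_filter_ctrl() API. The helper name, address and port values below are placeholders; <string.h>, <netinet/in.h> and <rte_ethdev.h> are assumed to be included, and PMD support varies.

/* Illustrative only: steer TCP traffic to 192.168.0.1:80 to a given queue. */
static int
add_5tuple_to_queue_sketch(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_ntuple_filter f;
	int ret;

	/* First check that the PMD supports ntuple filters at all. */
	ret = rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_NTUPLE);
	if (ret < 0)
		return ret;

	memset(&f, 0, sizeof(f));
	f.flags = RTE_5TUPLE_FLAGS;
	f.dst_ip = rte_cpu_to_be_32(0xc0a80001); /* 192.168.0.1, placeholder */
	f.dst_ip_mask = UINT32_MAX;              /* compare all 32 bits */
	f.src_ip_mask = 0;                       /* ignore source address */
	f.dst_port = rte_cpu_to_be_16(80);       /* placeholder */
	f.dst_port_mask = UINT16_MAX;
	f.src_port_mask = 0;
	f.proto = IPPROTO_TCP;
	f.proto_mask = UINT8_MAX;
	f.priority = 1;
	f.queue = queue_id;

	return rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_NTUPLE,
			RTE_ETH_FILTER_ADD, &f);
}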
+ */ +enum rte_eth_tunnel_type { + RTE_TUNNEL_TYPE_NONE = 0, + RTE_TUNNEL_TYPE_VXLAN, + RTE_TUNNEL_TYPE_GENEVE, + RTE_TUNNEL_TYPE_TEREDO, + RTE_TUNNEL_TYPE_NVGRE, + RTE_TUNNEL_TYPE_IP_IN_GRE, + RTE_L2_TUNNEL_TYPE_E_TAG, + RTE_TUNNEL_TYPE_MAX, +}; + +/** + * filter type of tunneling packet + */ +#define ETH_TUNNEL_FILTER_OMAC 0x01 /**< filter by outer MAC addr */ +#define ETH_TUNNEL_FILTER_OIP 0x02 /**< filter by outer IP Addr */ +#define ETH_TUNNEL_FILTER_TENID 0x04 /**< filter by tenant ID */ +#define ETH_TUNNEL_FILTER_IMAC 0x08 /**< filter by inner MAC addr */ +#define ETH_TUNNEL_FILTER_IVLAN 0x10 /**< filter by inner VLAN ID */ +#define ETH_TUNNEL_FILTER_IIP 0x20 /**< filter by inner IP addr */ + +#define RTE_TUNNEL_FILTER_IMAC_IVLAN (ETH_TUNNEL_FILTER_IMAC | \ + ETH_TUNNEL_FILTER_IVLAN) +#define RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID (ETH_TUNNEL_FILTER_IMAC | \ + ETH_TUNNEL_FILTER_IVLAN | \ + ETH_TUNNEL_FILTER_TENID) +#define RTE_TUNNEL_FILTER_IMAC_TENID (ETH_TUNNEL_FILTER_IMAC | \ + ETH_TUNNEL_FILTER_TENID) +#define RTE_TUNNEL_FILTER_OMAC_TENID_IMAC (ETH_TUNNEL_FILTER_OMAC | \ + ETH_TUNNEL_FILTER_TENID | \ + ETH_TUNNEL_FILTER_IMAC) + +/** + * Select IPv4 or IPv6 for tunnel filters. + */ +enum rte_tunnel_iptype { + RTE_TUNNEL_IPTYPE_IPV4 = 0, /**< IPv4. */ + RTE_TUNNEL_IPTYPE_IPV6, /**< IPv6. */ +}; + +/** + * Tunneling Packet filter configuration. + */ +struct rte_eth_tunnel_filter_conf { + struct ether_addr outer_mac; /**< Outer MAC address to match. */ + struct ether_addr inner_mac; /**< Inner MAC address to match. */ + uint16_t inner_vlan; /**< Inner VLAN to match. */ + enum rte_tunnel_iptype ip_type; /**< IP address type. */ + /** Outer destination IP address to match if ETH_TUNNEL_FILTER_OIP + is set in filter_type, or inner destination IP address to match + if ETH_TUNNEL_FILTER_IIP is set in filter_type . */ + union { + uint32_t ipv4_addr; /**< IPv4 address in big endian. */ + uint32_t ipv6_addr[4]; /**< IPv6 address in big endian. */ + } ip_addr; + /** Flags from ETH_TUNNEL_FILTER_XX - see above. */ + uint16_t filter_type; + enum rte_eth_tunnel_type tunnel_type; /**< Tunnel Type. */ + uint32_t tenant_id; /**< Tenant ID to match. VNI, GRE key... */ + uint16_t queue_id; /**< Queue assigned to if match. */ +}; + +/** + * Global eth device configuration type. + */ +enum rte_eth_global_cfg_type { + RTE_ETH_GLOBAL_CFG_TYPE_UNKNOWN = 0, + RTE_ETH_GLOBAL_CFG_TYPE_GRE_KEY_LEN, + RTE_ETH_GLOBAL_CFG_TYPE_MAX, +}; + +/** + * Global eth device configuration. + */ +struct rte_eth_global_cfg { + enum rte_eth_global_cfg_type cfg_type; /**< Global config type. */ + union { + uint8_t gre_key_len; /**< Valid GRE key length in byte. */ + uint64_t reserved; /**< Reserve space for future use. */ + } cfg; +}; + +#define RTE_ETH_FDIR_MAX_FLEXLEN 16 /**< Max length of flexbytes. */ +#define RTE_ETH_INSET_SIZE_MAX 128 /**< Max length of input set. 
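For the tunneling packet filter configuration above, a hedged sketch of installing a VXLAN inner-MAC/VNI rule via the same legacy filter_ctrl API; the helper name and parameter values are placeholders, not taken from this file, and support depends on the PMD.

/* Illustrative only: steer a VXLAN tunnel (inner MAC + VNI) to a queue. */
static int
add_vxlan_filter_sketch(uint16_t port_id, uint16_t queue_id,
		const struct ether_addr *inner_mac, uint32_t vni)
{
	struct rte_eth_tunnel_filter_conf conf;

	memset(&conf, 0, sizeof(conf));
	ether_addr_copy(inner_mac, &conf.inner_mac);
	conf.filter_type = RTE_TUNNEL_FILTER_IMAC_TENID;
	conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;
	conf.tenant_id = vni;		/* VNI to match */
	conf.queue_id = queue_id;

	return rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_TUNNEL,
			RTE_ETH_FILTER_ADD, &conf);
}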
*/ + +/** + * Input set fields for Flow Director and Hash filters + */ +enum rte_eth_input_set_field { + RTE_ETH_INPUT_SET_UNKNOWN = 0, + + /* L2 */ + RTE_ETH_INPUT_SET_L2_SRC_MAC = 1, + RTE_ETH_INPUT_SET_L2_DST_MAC, + RTE_ETH_INPUT_SET_L2_OUTER_VLAN, + RTE_ETH_INPUT_SET_L2_INNER_VLAN, + RTE_ETH_INPUT_SET_L2_ETHERTYPE, + + /* L3 */ + RTE_ETH_INPUT_SET_L3_SRC_IP4 = 129, + RTE_ETH_INPUT_SET_L3_DST_IP4, + RTE_ETH_INPUT_SET_L3_SRC_IP6, + RTE_ETH_INPUT_SET_L3_DST_IP6, + RTE_ETH_INPUT_SET_L3_IP4_TOS, + RTE_ETH_INPUT_SET_L3_IP4_PROTO, + RTE_ETH_INPUT_SET_L3_IP6_TC, + RTE_ETH_INPUT_SET_L3_IP6_NEXT_HEADER, + RTE_ETH_INPUT_SET_L3_IP4_TTL, + RTE_ETH_INPUT_SET_L3_IP6_HOP_LIMITS, + + /* L4 */ + RTE_ETH_INPUT_SET_L4_UDP_SRC_PORT = 257, + RTE_ETH_INPUT_SET_L4_UDP_DST_PORT, + RTE_ETH_INPUT_SET_L4_TCP_SRC_PORT, + RTE_ETH_INPUT_SET_L4_TCP_DST_PORT, + RTE_ETH_INPUT_SET_L4_SCTP_SRC_PORT, + RTE_ETH_INPUT_SET_L4_SCTP_DST_PORT, + RTE_ETH_INPUT_SET_L4_SCTP_VERIFICATION_TAG, + + /* Tunnel */ + RTE_ETH_INPUT_SET_TUNNEL_L2_INNER_DST_MAC = 385, + RTE_ETH_INPUT_SET_TUNNEL_L2_INNER_SRC_MAC, + RTE_ETH_INPUT_SET_TUNNEL_L2_INNER_VLAN, + RTE_ETH_INPUT_SET_TUNNEL_L4_UDP_KEY, + RTE_ETH_INPUT_SET_TUNNEL_GRE_KEY, + + /* Flexible Payload */ + RTE_ETH_INPUT_SET_FLEX_PAYLOAD_1ST_WORD = 641, + RTE_ETH_INPUT_SET_FLEX_PAYLOAD_2ND_WORD, + RTE_ETH_INPUT_SET_FLEX_PAYLOAD_3RD_WORD, + RTE_ETH_INPUT_SET_FLEX_PAYLOAD_4TH_WORD, + RTE_ETH_INPUT_SET_FLEX_PAYLOAD_5TH_WORD, + RTE_ETH_INPUT_SET_FLEX_PAYLOAD_6TH_WORD, + RTE_ETH_INPUT_SET_FLEX_PAYLOAD_7TH_WORD, + RTE_ETH_INPUT_SET_FLEX_PAYLOAD_8TH_WORD, + + RTE_ETH_INPUT_SET_DEFAULT = 65533, + RTE_ETH_INPUT_SET_NONE = 65534, + RTE_ETH_INPUT_SET_MAX = 65535, +}; + +/** + * Filters input set operations + */ +enum rte_filter_input_set_op { + RTE_ETH_INPUT_SET_OP_UNKNOWN, + RTE_ETH_INPUT_SET_SELECT, /**< select input set */ + RTE_ETH_INPUT_SET_ADD, /**< add input set entry */ + RTE_ETH_INPUT_SET_OP_MAX +}; + + +/** + * A structure used to define the input set configuration for + * flow director and hash filters + */ +struct rte_eth_input_set_conf { + uint16_t flow_type; + uint16_t inset_size; + enum rte_eth_input_set_field field[RTE_ETH_INSET_SIZE_MAX]; + enum rte_filter_input_set_op op; +}; + +/** + * A structure used to define the input for L2 flow + */ +struct rte_eth_l2_flow { + uint16_t ether_type; /**< Ether type in big endian */ +}; + +/** + * A structure used to define the input for IPV4 flow + */ +struct rte_eth_ipv4_flow { + uint32_t src_ip; /**< IPv4 source address in big endian. */ + uint32_t dst_ip; /**< IPv4 destination address in big endian. */ + uint8_t tos; /**< Type of service to match. */ + uint8_t ttl; /**< Time to live to match. */ + uint8_t proto; /**< Protocol, next header in big endian. */ +}; + +/** + * A structure used to define the input for IPV4 UDP flow + */ +struct rte_eth_udpv4_flow { + struct rte_eth_ipv4_flow ip; /**< IPv4 fields to match. */ + uint16_t src_port; /**< UDP source port in big endian. */ + uint16_t dst_port; /**< UDP destination port in big endian. */ +}; + +/** + * A structure used to define the input for IPV4 TCP flow + */ +struct rte_eth_tcpv4_flow { + struct rte_eth_ipv4_flow ip; /**< IPv4 fields to match. */ + uint16_t src_port; /**< TCP source port in big endian. */ + uint16_t dst_port; /**< TCP destination port in big endian. */ +}; + +/** + * A structure used to define the input for IPV4 SCTP flow + */ +struct rte_eth_sctpv4_flow { + struct rte_eth_ipv4_flow ip; /**< IPv4 fields to match. 
*/ + uint16_t src_port; /**< SCTP source port in big endian. */ + uint16_t dst_port; /**< SCTP destination port in big endian. */ + uint32_t verify_tag; /**< Verify tag in big endian */ +}; + +/** + * A structure used to define the input for IPV6 flow + */ +struct rte_eth_ipv6_flow { + uint32_t src_ip[4]; /**< IPv6 source address in big endian. */ + uint32_t dst_ip[4]; /**< IPv6 destination address in big endian. */ + uint8_t tc; /**< Traffic class to match. */ + uint8_t proto; /**< Protocol, next header to match. */ + uint8_t hop_limits; /**< Hop limits to match. */ +}; + +/** + * A structure used to define the input for IPV6 UDP flow + */ +struct rte_eth_udpv6_flow { + struct rte_eth_ipv6_flow ip; /**< IPv6 fields to match. */ + uint16_t src_port; /**< UDP source port in big endian. */ + uint16_t dst_port; /**< UDP destination port in big endian. */ +}; + +/** + * A structure used to define the input for IPV6 TCP flow + */ +struct rte_eth_tcpv6_flow { + struct rte_eth_ipv6_flow ip; /**< IPv6 fields to match. */ + uint16_t src_port; /**< TCP source port to in big endian. */ + uint16_t dst_port; /**< TCP destination port in big endian. */ +}; + +/** + * A structure used to define the input for IPV6 SCTP flow + */ +struct rte_eth_sctpv6_flow { + struct rte_eth_ipv6_flow ip; /**< IPv6 fields to match. */ + uint16_t src_port; /**< SCTP source port in big endian. */ + uint16_t dst_port; /**< SCTP destination port in big endian. */ + uint32_t verify_tag; /**< Verify tag in big endian. */ +}; + +/** + * A structure used to define the input for MAC VLAN flow + */ +struct rte_eth_mac_vlan_flow { + struct ether_addr mac_addr; /**< Mac address to match. */ +}; + +/** + * Tunnel type for flow director. + */ +enum rte_eth_fdir_tunnel_type { + RTE_FDIR_TUNNEL_TYPE_UNKNOWN = 0, + RTE_FDIR_TUNNEL_TYPE_NVGRE, + RTE_FDIR_TUNNEL_TYPE_VXLAN, +}; + +/** + * A structure used to define the input for tunnel flow, now it's VxLAN or + * NVGRE + */ +struct rte_eth_tunnel_flow { + enum rte_eth_fdir_tunnel_type tunnel_type; /**< Tunnel type to match. */ + /** Tunnel ID to match. TNI, VNI... in big endian. */ + uint32_t tunnel_id; + struct ether_addr mac_addr; /**< Mac address to match. */ +}; + +/** + * An union contains the inputs for all types of flow + * Items in flows need to be in big endian + */ +union rte_eth_fdir_flow { + struct rte_eth_l2_flow l2_flow; + struct rte_eth_udpv4_flow udp4_flow; + struct rte_eth_tcpv4_flow tcp4_flow; + struct rte_eth_sctpv4_flow sctp4_flow; + struct rte_eth_ipv4_flow ip4_flow; + struct rte_eth_udpv6_flow udp6_flow; + struct rte_eth_tcpv6_flow tcp6_flow; + struct rte_eth_sctpv6_flow sctp6_flow; + struct rte_eth_ipv6_flow ipv6_flow; + struct rte_eth_mac_vlan_flow mac_vlan_flow; + struct rte_eth_tunnel_flow tunnel_flow; +}; + +/** + * A structure used to contain extend input of flow + */ +struct rte_eth_fdir_flow_ext { + uint16_t vlan_tci; + uint8_t flexbytes[RTE_ETH_FDIR_MAX_FLEXLEN]; + /**< It is filled by the flexible payload to match. 
*/ + uint8_t is_vf; /**< 1 for VF, 0 for port dev */ + uint16_t dst_id; /**< VF ID, available when is_vf is 1*/ +}; + +/** + * A structure used to define the input for a flow director filter entry + */ +struct rte_eth_fdir_input { + uint16_t flow_type; + union rte_eth_fdir_flow flow; + /**< Flow fields to match, dependent on flow_type */ + struct rte_eth_fdir_flow_ext flow_ext; + /**< Additional fields to match */ +}; + +/** + * Behavior will be taken if FDIR match + */ +enum rte_eth_fdir_behavior { + RTE_ETH_FDIR_ACCEPT = 0, + RTE_ETH_FDIR_REJECT, + RTE_ETH_FDIR_PASSTHRU, +}; + +/** + * Flow director report status + * It defines what will be reported if FDIR entry is matched. + */ +enum rte_eth_fdir_status { + RTE_ETH_FDIR_NO_REPORT_STATUS = 0, /**< Report nothing. */ + RTE_ETH_FDIR_REPORT_ID, /**< Only report FD ID. */ + RTE_ETH_FDIR_REPORT_ID_FLEX_4, /**< Report FD ID and 4 flex bytes. */ + RTE_ETH_FDIR_REPORT_FLEX_8, /**< Report 8 flex bytes. */ +}; + +/** + * A structure used to define an action when match FDIR packet filter. + */ +struct rte_eth_fdir_action { + uint16_t rx_queue; /**< Queue assigned to if FDIR match. */ + enum rte_eth_fdir_behavior behavior; /**< Behavior will be taken */ + enum rte_eth_fdir_status report_status; /**< Status report option */ + uint8_t flex_off; + /**< If report_status is RTE_ETH_FDIR_REPORT_ID_FLEX_4 or + RTE_ETH_FDIR_REPORT_FLEX_8, flex_off specifies where the reported + flex bytes start from in flexible payload. */ +}; + +/** + * A structure used to define the flow director filter entry by filter_ctrl API + * It supports RTE_ETH_FILTER_FDIR with RTE_ETH_FILTER_ADD and + * RTE_ETH_FILTER_DELETE operations. + */ +struct rte_eth_fdir_filter { + uint32_t soft_id; + /**< ID, an unique value is required when deal with FDIR entry */ + struct rte_eth_fdir_input input; /**< Input set */ + struct rte_eth_fdir_action action; /**< Action taken when match */ +}; + +/** + * A structure used to configure FDIR masks that are used by the device + * to match the various fields of RX packet headers. + */ +struct rte_eth_fdir_masks { + uint16_t vlan_tci_mask; /**< Bit mask for vlan_tci in big endian */ + /** Bit mask for ipv4 flow in big endian. */ + struct rte_eth_ipv4_flow ipv4_mask; + /** Bit maks for ipv6 flow in big endian. */ + struct rte_eth_ipv6_flow ipv6_mask; + /** Bit mask for L4 source port in big endian. */ + uint16_t src_port_mask; + /** Bit mask for L4 destination port in big endian. */ + uint16_t dst_port_mask; + /** 6 bit mask for proper 6 bytes of Mac address, bit 0 matches the + first byte on the wire */ + uint8_t mac_addr_byte_mask; + /** Bit mask for tunnel ID in big endian. */ + uint32_t tunnel_id_mask; + uint8_t tunnel_type_mask; /**< 1 - Match tunnel type, + 0 - Ignore tunnel type. */ +}; + +/** + * Payload type + */ +enum rte_eth_payload_type { + RTE_ETH_PAYLOAD_UNKNOWN = 0, + RTE_ETH_RAW_PAYLOAD, + RTE_ETH_L2_PAYLOAD, + RTE_ETH_L3_PAYLOAD, + RTE_ETH_L4_PAYLOAD, + RTE_ETH_PAYLOAD_MAX = 8, +}; + +/** + * A structure used to select bytes extracted from the protocol layers to + * flexible payload for filter + */ +struct rte_eth_flex_payload_cfg { + enum rte_eth_payload_type type; /**< Payload type */ + uint16_t src_offset[RTE_ETH_FDIR_MAX_FLEXLEN]; + /**< Offset in bytes from the beginning of packet's payload + src_offset[i] indicates the flexbyte i's offset in original + packet payload. 
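As a hedged sketch of how the flow director structures above fit together (not part of this header): a perfect-mode rule matching the IPv4/UDP destination address and port, steering hits to one RX queue. The helper name is invented; addresses and ports are expected in big endian, as the field comments state.

/* Illustrative only: add a flow director rule for IPv4/UDP traffic. */
static int
add_fdir_udp4_rule_sketch(uint16_t port_id, uint16_t rx_queue,
		uint32_t dst_ip_be, uint16_t dst_port_be, uint32_t soft_id)
{
	struct rte_eth_fdir_filter rule;

	memset(&rule, 0, sizeof(rule));
	rule.soft_id = soft_id;
	rule.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
	rule.input.flow.udp4_flow.ip.dst_ip = dst_ip_be;	/* big endian */
	rule.input.flow.udp4_flow.dst_port = dst_port_be;	/* big endian */
	rule.action.rx_queue = rx_queue;
	rule.action.behavior = RTE_ETH_FDIR_ACCEPT;
	rule.action.report_status = RTE_ETH_FDIR_REPORT_ID;

	return rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_FDIR,
			RTE_ETH_FILTER_ADD, &rule);
}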
This value should be less than + flex_payload_limit in struct rte_eth_fdir_info.*/ +}; + +/** + * A structure used to define FDIR masks for flexible payload + * for each flow type + */ +struct rte_eth_fdir_flex_mask { + uint16_t flow_type; + uint8_t mask[RTE_ETH_FDIR_MAX_FLEXLEN]; + /**< Mask for the whole flexible payload */ +}; + +/** + * A structure used to define all flexible payload related setting + * include flex payload and flex mask + */ +struct rte_eth_fdir_flex_conf { + uint16_t nb_payloads; /**< The number of following payload cfg */ + uint16_t nb_flexmasks; /**< The number of following mask */ + struct rte_eth_flex_payload_cfg flex_set[RTE_ETH_PAYLOAD_MAX]; + /**< Flex payload configuration for each payload type */ + struct rte_eth_fdir_flex_mask flex_mask[RTE_ETH_FLOW_MAX]; + /**< Flex mask configuration for each flow type */ +}; + +/** + * Flow Director setting modes: none, signature or perfect. + */ +enum rte_fdir_mode { + RTE_FDIR_MODE_NONE = 0, /**< Disable FDIR support. */ + RTE_FDIR_MODE_SIGNATURE, /**< Enable FDIR signature filter mode. */ + RTE_FDIR_MODE_PERFECT, /**< Enable FDIR perfect filter mode. */ + RTE_FDIR_MODE_PERFECT_MAC_VLAN, /**< Enable FDIR filter mode - MAC VLAN. */ + RTE_FDIR_MODE_PERFECT_TUNNEL, /**< Enable FDIR filter mode - tunnel. */ +}; + +#define UINT64_BIT (CHAR_BIT * sizeof(uint64_t)) +#define RTE_FLOW_MASK_ARRAY_SIZE \ + (RTE_ALIGN(RTE_ETH_FLOW_MAX, UINT64_BIT)/UINT64_BIT) + +/** + * A structure used to get the information of flow director filter. + * It supports RTE_ETH_FILTER_FDIR with RTE_ETH_FILTER_INFO operation. + * It includes the mode, flexible payload configuration information, + * capabilities and supported flow types, flexible payload characters. + * It can be gotten to help taking specific configurations per device. + */ +struct rte_eth_fdir_info { + enum rte_fdir_mode mode; /**< Flow director mode */ + struct rte_eth_fdir_masks mask; + /** Flex payload configuration information */ + struct rte_eth_fdir_flex_conf flex_conf; + uint32_t guarant_spc; /**< Guaranteed spaces.*/ + uint32_t best_spc; /**< Best effort spaces.*/ + /** Bit mask for every supported flow type. */ + uint64_t flow_types_mask[RTE_FLOW_MASK_ARRAY_SIZE]; + uint32_t max_flexpayload; /**< Total flex payload in bytes. */ + /** Flexible payload unit in bytes. Size and alignments of all flex + payload segments should be multiplies of this value. */ + uint32_t flex_payload_unit; + /** Max number of flexible payload continuous segments. + Each segment should be a multiple of flex_payload_unit.*/ + uint32_t max_flex_payload_segment_num; + /** Maximum src_offset in bytes allowed. It indicates that + src_offset[i] in struct rte_eth_flex_payload_cfg should be less + than this value. */ + uint16_t flex_payload_limit; + /** Flex bitmask unit in bytes. Size of flex bitmasks should be a + multiply of this value. */ + uint32_t flex_bitmask_unit; + /** Max supported size of flex bitmasks in flex_bitmask_unit */ + uint32_t max_flex_bitmask_num; +}; + +/** + * A structure used to define the statistics of flow director. + * It supports RTE_ETH_FILTER_FDIR with RTE_ETH_FILTER_STATS operation. + */ +struct rte_eth_fdir_stats { + uint32_t collision; /**< Number of filters with collision. */ + uint32_t free; /**< Number of free filters. */ + uint32_t maxhash; + /**< The lookup hash value of the added filter that updated the value + of the MAXLEN field */ + uint32_t maxlen; /**< Longest linked list of filters. */ + uint64_t add; /**< Number of added filters. 
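A companion sketch to the rule-add example earlier: querying the flow director information and statistics structures defined here through the same filter_ctrl API. Illustrative only; the helper name is invented and <stdio.h> plus <inttypes.h> are assumed.

/* Illustrative only: print flow director capabilities and counters. */
static void
show_fdir_state_sketch(uint16_t port_id)
{
	struct rte_eth_fdir_info info;
	struct rte_eth_fdir_stats stats;

	memset(&info, 0, sizeof(info));
	memset(&stats, 0, sizeof(stats));

	if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_FDIR,
			RTE_ETH_FILTER_INFO, &info) == 0)
		printf("FDIR mode %d: guaranteed %u, best effort %u\n",
			info.mode, info.guarant_spc, info.best_spc);

	if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_FDIR,
			RTE_ETH_FILTER_STATS, &stats) == 0)
		printf("FDIR rules added %" PRIu64 ", removed %" PRIu64 "\n",
			stats.add, stats.remove);
}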
*/ + uint64_t remove; /**< Number of removed filters. */ + uint64_t f_add; /**< Number of failed added filters. */ + uint64_t f_remove; /**< Number of failed removed filters. */ + uint32_t guarant_cnt; /**< Number of filters in guaranteed spaces. */ + uint32_t best_cnt; /**< Number of filters in best effort spaces. */ +}; + +/** + * Flow Director filter information types. + */ +enum rte_eth_fdir_filter_info_type { + RTE_ETH_FDIR_FILTER_INFO_TYPE_UNKNOWN = 0, + /** Flow Director filter input set configuration */ + RTE_ETH_FDIR_FILTER_INPUT_SET_SELECT, + RTE_ETH_FDIR_FILTER_INFO_TYPE_MAX, +}; + +/** + * A structure used to set FDIR filter information, to support filter type + * of 'RTE_ETH_FILTER_FDIR' RTE_ETH_FDIR_FILTER_INPUT_SET_SELECT operation. + */ +struct rte_eth_fdir_filter_info { + enum rte_eth_fdir_filter_info_type info_type; /**< Information type */ + /** Details of fdir filter information */ + union { + /** Flow Director input set configuration per port */ + struct rte_eth_input_set_conf input_set_conf; + } info; +}; + +/** + * Hash filter information types. + * - RTE_ETH_HASH_FILTER_SYM_HASH_ENA_PER_PORT is for getting/setting the + * information/configuration of 'symmetric hash enable' per port. + * - RTE_ETH_HASH_FILTER_GLOBAL_CONFIG is for getting/setting the global + * configurations of hash filters. Those global configurations are valid + * for all ports of the same NIC. + * - RTE_ETH_HASH_FILTER_INPUT_SET_SELECT is for setting the global + * hash input set fields + */ +enum rte_eth_hash_filter_info_type { + RTE_ETH_HASH_FILTER_INFO_TYPE_UNKNOWN = 0, + /** Symmetric hash enable per port */ + RTE_ETH_HASH_FILTER_SYM_HASH_ENA_PER_PORT, + /** Configure globally for hash filter */ + RTE_ETH_HASH_FILTER_GLOBAL_CONFIG, + /** Global Hash filter input set configuration */ + RTE_ETH_HASH_FILTER_INPUT_SET_SELECT, + RTE_ETH_HASH_FILTER_INFO_TYPE_MAX, +}; + +/** + * Hash function types. + */ +enum rte_eth_hash_function { + RTE_ETH_HASH_FUNCTION_DEFAULT = 0, + RTE_ETH_HASH_FUNCTION_TOEPLITZ, /**< Toeplitz */ + RTE_ETH_HASH_FUNCTION_SIMPLE_XOR, /**< Simple XOR */ + RTE_ETH_HASH_FUNCTION_MAX, +}; + +#define RTE_SYM_HASH_MASK_ARRAY_SIZE \ + (RTE_ALIGN(RTE_ETH_FLOW_MAX, UINT64_BIT)/UINT64_BIT) +/** + * A structure used to set or get global hash function configurations which + * include symmetric hash enable per flow type and hash function type. + * Each bit in sym_hash_enable_mask[] indicates if the symmetric hash of the + * corresponding flow type is enabled or not. + * Each bit in valid_bit_mask[] indicates if the corresponding bit in + * sym_hash_enable_mask[] is valid or not. For the configurations gotten, it + * also means if the flow type is supported by hardware or not. + */ +struct rte_eth_hash_global_conf { + enum rte_eth_hash_function hash_func; /**< Hash function type */ + /** Bit mask for symmetric hash enable per flow type */ + uint64_t sym_hash_enable_mask[RTE_SYM_HASH_MASK_ARRAY_SIZE]; + /** Bit mask indicates if the corresponding bit is valid */ + uint64_t valid_bit_mask[RTE_SYM_HASH_MASK_ARRAY_SIZE]; +}; + +/** + * A structure used to set or get hash filter information, to support filter + * type of 'RTE_ETH_FILTER_HASH' and its operations. 
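For the global hash configuration described above, a hedged sketch of enabling symmetric Toeplitz hashing for one flow type using the rte_eth_hash_filter_info structure defined just below; the helper name is invented and actual support is PMD-dependent.

/* Illustrative only: enable symmetric Toeplitz hashing for IPv4/UDP flows. */
static int
enable_sym_hash_sketch(uint16_t port_id)
{
	struct rte_eth_hash_filter_info info;

	memset(&info, 0, sizeof(info));
	info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
	info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
	/* Bit position == flow type; mark the bit both valid and enabled. */
	info.info.global_conf.valid_bit_mask[0] =
		1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
	info.info.global_conf.sym_hash_enable_mask[0] =
		1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_UDP;

	return rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
			RTE_ETH_FILTER_SET, &info);
}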
+ */ +struct rte_eth_hash_filter_info { + enum rte_eth_hash_filter_info_type info_type; /**< Information type */ + /** Details of hash filter information */ + union { + /** For RTE_ETH_HASH_FILTER_SYM_HASH_ENA_PER_PORT */ + uint8_t enable; + /** Global configurations of hash filter */ + struct rte_eth_hash_global_conf global_conf; + /** Global configurations of hash filter input set */ + struct rte_eth_input_set_conf input_set_conf; + } info; +}; + +/** + * l2 tunnel configuration. + */ +struct rte_eth_l2_tunnel_conf { + enum rte_eth_tunnel_type l2_tunnel_type; + uint16_t ether_type; /* ether type in l2 header */ + uint32_t tunnel_id; /* port tag id for e-tag */ + uint16_t vf_id; /* VF id for tag insertion */ + uint32_t pool; /* destination pool for tag based forwarding */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ETH_CTRL_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.c b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.c new file mode 100644 index 00000000..4c320250 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.c @@ -0,0 +1,4425 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_ether.h" +#include "rte_ethdev.h" +#include "rte_ethdev_driver.h" +#include "ethdev_profile.h" + +int rte_eth_dev_logtype; + +static const char *MZ_RTE_ETH_DEV_DATA = "rte_eth_dev_data"; +struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS]; +static uint16_t eth_dev_last_created_port; + +/* spinlock for eth device callbacks */ +static rte_spinlock_t rte_eth_dev_cb_lock = RTE_SPINLOCK_INITIALIZER; + +/* spinlock for add/remove rx callbacks */ +static rte_spinlock_t rte_eth_rx_cb_lock = RTE_SPINLOCK_INITIALIZER; + +/* spinlock for add/remove tx callbacks */ +static rte_spinlock_t rte_eth_tx_cb_lock = RTE_SPINLOCK_INITIALIZER; + +/* spinlock for shared data allocation */ +static rte_spinlock_t rte_eth_shared_data_lock = RTE_SPINLOCK_INITIALIZER; + +/* store statistics names and its offset in stats structure */ +struct rte_eth_xstats_name_off { + char name[RTE_ETH_XSTATS_NAME_SIZE]; + unsigned offset; +}; + +/* Shared memory between primary and secondary processes. 
*/ +static struct { + uint64_t next_owner_id; + rte_spinlock_t ownership_lock; + struct rte_eth_dev_data data[RTE_MAX_ETHPORTS]; +} *rte_eth_dev_shared_data; + +static const struct rte_eth_xstats_name_off rte_stats_strings[] = { + {"rx_good_packets", offsetof(struct rte_eth_stats, ipackets)}, + {"tx_good_packets", offsetof(struct rte_eth_stats, opackets)}, + {"rx_good_bytes", offsetof(struct rte_eth_stats, ibytes)}, + {"tx_good_bytes", offsetof(struct rte_eth_stats, obytes)}, + {"rx_missed_errors", offsetof(struct rte_eth_stats, imissed)}, + {"rx_errors", offsetof(struct rte_eth_stats, ierrors)}, + {"tx_errors", offsetof(struct rte_eth_stats, oerrors)}, + {"rx_mbuf_allocation_errors", offsetof(struct rte_eth_stats, + rx_nombuf)}, +}; + +#define RTE_NB_STATS (sizeof(rte_stats_strings) / sizeof(rte_stats_strings[0])) + +static const struct rte_eth_xstats_name_off rte_rxq_stats_strings[] = { + {"packets", offsetof(struct rte_eth_stats, q_ipackets)}, + {"bytes", offsetof(struct rte_eth_stats, q_ibytes)}, + {"errors", offsetof(struct rte_eth_stats, q_errors)}, +}; + +#define RTE_NB_RXQ_STATS (sizeof(rte_rxq_stats_strings) / \ + sizeof(rte_rxq_stats_strings[0])) + +static const struct rte_eth_xstats_name_off rte_txq_stats_strings[] = { + {"packets", offsetof(struct rte_eth_stats, q_opackets)}, + {"bytes", offsetof(struct rte_eth_stats, q_obytes)}, +}; +#define RTE_NB_TXQ_STATS (sizeof(rte_txq_stats_strings) / \ + sizeof(rte_txq_stats_strings[0])) + +#define RTE_RX_OFFLOAD_BIT2STR(_name) \ + { DEV_RX_OFFLOAD_##_name, #_name } + +static const struct { + uint64_t offload; + const char *name; +} rte_rx_offload_names[] = { + RTE_RX_OFFLOAD_BIT2STR(VLAN_STRIP), + RTE_RX_OFFLOAD_BIT2STR(IPV4_CKSUM), + RTE_RX_OFFLOAD_BIT2STR(UDP_CKSUM), + RTE_RX_OFFLOAD_BIT2STR(TCP_CKSUM), + RTE_RX_OFFLOAD_BIT2STR(TCP_LRO), + RTE_RX_OFFLOAD_BIT2STR(QINQ_STRIP), + RTE_RX_OFFLOAD_BIT2STR(OUTER_IPV4_CKSUM), + RTE_RX_OFFLOAD_BIT2STR(MACSEC_STRIP), + RTE_RX_OFFLOAD_BIT2STR(HEADER_SPLIT), + RTE_RX_OFFLOAD_BIT2STR(VLAN_FILTER), + RTE_RX_OFFLOAD_BIT2STR(VLAN_EXTEND), + RTE_RX_OFFLOAD_BIT2STR(JUMBO_FRAME), + RTE_RX_OFFLOAD_BIT2STR(CRC_STRIP), + RTE_RX_OFFLOAD_BIT2STR(SCATTER), + RTE_RX_OFFLOAD_BIT2STR(TIMESTAMP), + RTE_RX_OFFLOAD_BIT2STR(SECURITY), + RTE_RX_OFFLOAD_BIT2STR(KEEP_CRC), +}; + +#undef RTE_RX_OFFLOAD_BIT2STR + +#define RTE_TX_OFFLOAD_BIT2STR(_name) \ + { DEV_TX_OFFLOAD_##_name, #_name } + +static const struct { + uint64_t offload; + const char *name; +} rte_tx_offload_names[] = { + RTE_TX_OFFLOAD_BIT2STR(VLAN_INSERT), + RTE_TX_OFFLOAD_BIT2STR(IPV4_CKSUM), + RTE_TX_OFFLOAD_BIT2STR(UDP_CKSUM), + RTE_TX_OFFLOAD_BIT2STR(TCP_CKSUM), + RTE_TX_OFFLOAD_BIT2STR(SCTP_CKSUM), + RTE_TX_OFFLOAD_BIT2STR(TCP_TSO), + RTE_TX_OFFLOAD_BIT2STR(UDP_TSO), + RTE_TX_OFFLOAD_BIT2STR(OUTER_IPV4_CKSUM), + RTE_TX_OFFLOAD_BIT2STR(QINQ_INSERT), + RTE_TX_OFFLOAD_BIT2STR(VXLAN_TNL_TSO), + RTE_TX_OFFLOAD_BIT2STR(GRE_TNL_TSO), + RTE_TX_OFFLOAD_BIT2STR(IPIP_TNL_TSO), + RTE_TX_OFFLOAD_BIT2STR(GENEVE_TNL_TSO), + RTE_TX_OFFLOAD_BIT2STR(MACSEC_INSERT), + RTE_TX_OFFLOAD_BIT2STR(MT_LOCKFREE), + RTE_TX_OFFLOAD_BIT2STR(MULTI_SEGS), + RTE_TX_OFFLOAD_BIT2STR(MBUF_FAST_FREE), + RTE_TX_OFFLOAD_BIT2STR(SECURITY), +}; + +#undef RTE_TX_OFFLOAD_BIT2STR + +/** + * The user application callback description. + * + * It contains callback address to be registered by user application, + * the pointer to the parameters for callback, and the event type. 
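As a hedged usage sketch for the callback mechanism described above (not part of this file): an application registers an rte_eth_dev_cb_fn with rte_eth_dev_callback_register() and reacts to the event type it asked for. The function names below are invented and <stdio.h> is assumed.

/* Illustrative only: react to link-status-change events on a port. */
static int
on_eth_event_sketch(uint16_t port_id, enum rte_eth_event_type event,
		void *cb_arg, void *ret_param)
{
	RTE_SET_USED(cb_arg);
	RTE_SET_USED(ret_param);

	if (event == RTE_ETH_EVENT_INTR_LSC)
		printf("port %u: link state changed\n", port_id);

	return 0;
}

/* Typically called once at init time, before rte_eth_dev_start(). */
static int
register_lsc_cb_sketch(uint16_t port_id)
{
	return rte_eth_dev_callback_register(port_id,
			RTE_ETH_EVENT_INTR_LSC, on_eth_event_sketch, NULL);
}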
+ */ +struct rte_eth_dev_callback { + TAILQ_ENTRY(rte_eth_dev_callback) next; /**< Callbacks list */ + rte_eth_dev_cb_fn cb_fn; /**< Callback address */ + void *cb_arg; /**< Parameter for callback */ + void *ret_param; /**< Return parameter */ + enum rte_eth_event_type event; /**< Interrupt event type */ + uint32_t active; /**< Callback is executing */ +}; + +enum { + STAT_QMAP_TX = 0, + STAT_QMAP_RX +}; + +uint16_t +rte_eth_find_next(uint16_t port_id) +{ + while (port_id < RTE_MAX_ETHPORTS && + rte_eth_devices[port_id].state != RTE_ETH_DEV_ATTACHED && + rte_eth_devices[port_id].state != RTE_ETH_DEV_REMOVED) + port_id++; + + if (port_id >= RTE_MAX_ETHPORTS) + return RTE_MAX_ETHPORTS; + + return port_id; +} + +static void +rte_eth_dev_shared_data_prepare(void) +{ + const unsigned flags = 0; + const struct rte_memzone *mz; + + rte_spinlock_lock(&rte_eth_shared_data_lock); + + if (rte_eth_dev_shared_data == NULL) { + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* Allocate port data and ownership shared memory. */ + mz = rte_memzone_reserve(MZ_RTE_ETH_DEV_DATA, + sizeof(*rte_eth_dev_shared_data), + rte_socket_id(), flags); + } else + mz = rte_memzone_lookup(MZ_RTE_ETH_DEV_DATA); + if (mz == NULL) + rte_panic("Cannot allocate ethdev shared data\n"); + + rte_eth_dev_shared_data = mz->addr; + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_eth_dev_shared_data->next_owner_id = + RTE_ETH_DEV_NO_OWNER + 1; + rte_spinlock_init(&rte_eth_dev_shared_data->ownership_lock); + memset(rte_eth_dev_shared_data->data, 0, + sizeof(rte_eth_dev_shared_data->data)); + } + } + + rte_spinlock_unlock(&rte_eth_shared_data_lock); +} + +static bool +is_allocated(const struct rte_eth_dev *ethdev) +{ + return ethdev->data->name[0] != '\0'; +} + +static struct rte_eth_dev * +_rte_eth_dev_allocated(const char *name) +{ + unsigned i; + + for (i = 0; i < RTE_MAX_ETHPORTS; i++) { + if (rte_eth_devices[i].data != NULL && + strcmp(rte_eth_devices[i].data->name, name) == 0) + return &rte_eth_devices[i]; + } + return NULL; +} + +struct rte_eth_dev * +rte_eth_dev_allocated(const char *name) +{ + struct rte_eth_dev *ethdev; + + rte_eth_dev_shared_data_prepare(); + + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + ethdev = _rte_eth_dev_allocated(name); + + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); + + return ethdev; +} + +static uint16_t +rte_eth_dev_find_free_port(void) +{ + unsigned i; + + for (i = 0; i < RTE_MAX_ETHPORTS; i++) { + /* Using shared name field to find a free port. */ + if (rte_eth_dev_shared_data->data[i].name[0] == '\0') { + RTE_ASSERT(rte_eth_devices[i].state == + RTE_ETH_DEV_UNUSED); + return i; + } + } + return RTE_MAX_ETHPORTS; +} + +static struct rte_eth_dev * +eth_dev_get(uint16_t port_id) +{ + struct rte_eth_dev *eth_dev = &rte_eth_devices[port_id]; + + eth_dev->data = &rte_eth_dev_shared_data->data[port_id]; + + eth_dev_last_created_port = port_id; + + return eth_dev; +} + +struct rte_eth_dev * +rte_eth_dev_allocate(const char *name) +{ + uint16_t port_id; + struct rte_eth_dev *eth_dev = NULL; + + rte_eth_dev_shared_data_prepare(); + + /* Synchronize port creation between primary and secondary threads. 
*/ + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + if (_rte_eth_dev_allocated(name) != NULL) { + RTE_ETHDEV_LOG(ERR, + "Ethernet device with name %s already allocated\n", + name); + goto unlock; + } + + port_id = rte_eth_dev_find_free_port(); + if (port_id == RTE_MAX_ETHPORTS) { + RTE_ETHDEV_LOG(ERR, + "Reached maximum number of Ethernet ports\n"); + goto unlock; + } + + eth_dev = eth_dev_get(port_id); + snprintf(eth_dev->data->name, sizeof(eth_dev->data->name), "%s", name); + eth_dev->data->port_id = port_id; + eth_dev->data->mtu = ETHER_MTU; + +unlock: + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); + + return eth_dev; +} + +/* + * Attach to a port already registered by the primary process, which + * makes sure that the same device would have the same port id both + * in the primary and secondary process. + */ +struct rte_eth_dev * +rte_eth_dev_attach_secondary(const char *name) +{ + uint16_t i; + struct rte_eth_dev *eth_dev = NULL; + + rte_eth_dev_shared_data_prepare(); + + /* Synchronize port attachment to primary port creation and release. */ + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + for (i = 0; i < RTE_MAX_ETHPORTS; i++) { + if (strcmp(rte_eth_dev_shared_data->data[i].name, name) == 0) + break; + } + if (i == RTE_MAX_ETHPORTS) { + RTE_ETHDEV_LOG(ERR, + "Device %s is not driven by the primary process\n", + name); + } else { + eth_dev = eth_dev_get(i); + RTE_ASSERT(eth_dev->data->port_id == i); + } + + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); + return eth_dev; +} + +int +rte_eth_dev_release_port(struct rte_eth_dev *eth_dev) +{ + if (eth_dev == NULL) + return -EINVAL; + + rte_eth_dev_shared_data_prepare(); + + _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_DESTROY, NULL); + + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + eth_dev->state = RTE_ETH_DEV_UNUSED; + + memset(eth_dev->data, 0, sizeof(struct rte_eth_dev_data)); + + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); + + return 0; +} + +int +rte_eth_dev_is_valid_port(uint16_t port_id) +{ + if (port_id >= RTE_MAX_ETHPORTS || + (rte_eth_devices[port_id].state == RTE_ETH_DEV_UNUSED)) + return 0; + else + return 1; +} + +static int +rte_eth_is_valid_owner_id(uint64_t owner_id) +{ + if (owner_id == RTE_ETH_DEV_NO_OWNER || + rte_eth_dev_shared_data->next_owner_id <= owner_id) { + RTE_ETHDEV_LOG(ERR, "Invalid owner_id=%016"PRIx64"\n", + owner_id); + return 0; + } + return 1; +} + +uint64_t +rte_eth_find_next_owned_by(uint16_t port_id, const uint64_t owner_id) +{ + while (port_id < RTE_MAX_ETHPORTS && + ((rte_eth_devices[port_id].state != RTE_ETH_DEV_ATTACHED && + rte_eth_devices[port_id].state != RTE_ETH_DEV_REMOVED) || + rte_eth_devices[port_id].data->owner.id != owner_id)) + port_id++; + + if (port_id >= RTE_MAX_ETHPORTS) + return RTE_MAX_ETHPORTS; + + return port_id; +} + +int __rte_experimental +rte_eth_dev_owner_new(uint64_t *owner_id) +{ + rte_eth_dev_shared_data_prepare(); + + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + *owner_id = rte_eth_dev_shared_data->next_owner_id++; + + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); + return 0; +} + +static int +_rte_eth_dev_owner_set(const uint16_t port_id, const uint64_t old_owner_id, + const struct rte_eth_dev_owner *new_owner) +{ + struct rte_eth_dev *ethdev = &rte_eth_devices[port_id]; + struct rte_eth_dev_owner *port_owner; + int sret; + + if (port_id >= RTE_MAX_ETHPORTS || !is_allocated(ethdev)) { + RTE_ETHDEV_LOG(ERR, "Port id 
%"PRIu16" is not allocated\n", + port_id); + return -ENODEV; + } + + if (!rte_eth_is_valid_owner_id(new_owner->id) && + !rte_eth_is_valid_owner_id(old_owner_id)) + return -EINVAL; + + port_owner = &rte_eth_devices[port_id].data->owner; + if (port_owner->id != old_owner_id) { + RTE_ETHDEV_LOG(ERR, + "Cannot set owner to port %u already owned by %s_%016"PRIX64"\n", + port_id, port_owner->name, port_owner->id); + return -EPERM; + } + + sret = snprintf(port_owner->name, RTE_ETH_MAX_OWNER_NAME_LEN, "%s", + new_owner->name); + if (sret < 0 || sret >= RTE_ETH_MAX_OWNER_NAME_LEN) + RTE_ETHDEV_LOG(ERR, "Port %u owner name was truncated\n", + port_id); + + port_owner->id = new_owner->id; + + RTE_ETHDEV_LOG(DEBUG, "Port %u owner is %s_%016"PRIx64"\n", + port_id, new_owner->name, new_owner->id); + + return 0; +} + +int __rte_experimental +rte_eth_dev_owner_set(const uint16_t port_id, + const struct rte_eth_dev_owner *owner) +{ + int ret; + + rte_eth_dev_shared_data_prepare(); + + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + ret = _rte_eth_dev_owner_set(port_id, RTE_ETH_DEV_NO_OWNER, owner); + + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); + return ret; +} + +int __rte_experimental +rte_eth_dev_owner_unset(const uint16_t port_id, const uint64_t owner_id) +{ + const struct rte_eth_dev_owner new_owner = (struct rte_eth_dev_owner) + {.id = RTE_ETH_DEV_NO_OWNER, .name = ""}; + int ret; + + rte_eth_dev_shared_data_prepare(); + + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + ret = _rte_eth_dev_owner_set(port_id, owner_id, &new_owner); + + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); + return ret; +} + +void __rte_experimental +rte_eth_dev_owner_delete(const uint64_t owner_id) +{ + uint16_t port_id; + + rte_eth_dev_shared_data_prepare(); + + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + if (rte_eth_is_valid_owner_id(owner_id)) { + for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) + if (rte_eth_devices[port_id].data->owner.id == owner_id) + memset(&rte_eth_devices[port_id].data->owner, 0, + sizeof(struct rte_eth_dev_owner)); + RTE_ETHDEV_LOG(ERR, + "All port owners owned by %016"PRIx64" identifier have removed\n", + owner_id); + } + + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); +} + +int __rte_experimental +rte_eth_dev_owner_get(const uint16_t port_id, struct rte_eth_dev_owner *owner) +{ + int ret = 0; + struct rte_eth_dev *ethdev = &rte_eth_devices[port_id]; + + rte_eth_dev_shared_data_prepare(); + + rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock); + + if (port_id >= RTE_MAX_ETHPORTS || !is_allocated(ethdev)) { + RTE_ETHDEV_LOG(ERR, "Port id %"PRIu16" is not allocated\n", + port_id); + ret = -ENODEV; + } else { + rte_memcpy(owner, ðdev->data->owner, sizeof(*owner)); + } + + rte_spinlock_unlock(&rte_eth_dev_shared_data->ownership_lock); + return ret; +} + +int +rte_eth_dev_socket_id(uint16_t port_id) +{ + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -1); + return rte_eth_devices[port_id].data->numa_node; +} + +void * +rte_eth_dev_get_sec_ctx(uint16_t port_id) +{ + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, NULL); + return rte_eth_devices[port_id].security_ctx; +} + +uint16_t +rte_eth_dev_count(void) +{ + return rte_eth_dev_count_avail(); +} + +uint16_t +rte_eth_dev_count_avail(void) +{ + uint16_t p; + uint16_t count; + + count = 0; + + RTE_ETH_FOREACH_DEV(p) + count++; + + return count; +} + +uint16_t __rte_experimental +rte_eth_dev_count_total(void) +{ + uint16_t port, count = 0; + + 
for (port = 0; port < RTE_MAX_ETHPORTS; port++) + if (rte_eth_devices[port].state != RTE_ETH_DEV_UNUSED) + count++; + + return count; +} + +int +rte_eth_dev_get_name_by_port(uint16_t port_id, char *name) +{ + char *tmp; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + if (name == NULL) { + RTE_ETHDEV_LOG(ERR, "Null pointer is specified\n"); + return -EINVAL; + } + + /* shouldn't check 'rte_eth_devices[i].data', + * because it might be overwritten by VDEV PMD */ + tmp = rte_eth_dev_shared_data->data[port_id].name; + strcpy(name, tmp); + return 0; +} + +int +rte_eth_dev_get_port_by_name(const char *name, uint16_t *port_id) +{ + uint32_t pid; + + if (name == NULL) { + RTE_ETHDEV_LOG(ERR, "Null pointer is specified\n"); + return -EINVAL; + } + + for (pid = 0; pid < RTE_MAX_ETHPORTS; pid++) { + if (rte_eth_devices[pid].state != RTE_ETH_DEV_UNUSED && + !strcmp(name, rte_eth_dev_shared_data->data[pid].name)) { + *port_id = pid; + return 0; + } + } + + return -ENODEV; +} + +static int +eth_err(uint16_t port_id, int ret) +{ + if (ret == 0) + return 0; + if (rte_eth_dev_is_removed(port_id)) + return -EIO; + return ret; +} + +/* attach the new device, then store port_id of the device */ +int +rte_eth_dev_attach(const char *devargs, uint16_t *port_id) +{ + int current = rte_eth_dev_count_total(); + struct rte_devargs da; + int ret = -1; + + memset(&da, 0, sizeof(da)); + + if ((devargs == NULL) || (port_id == NULL)) { + ret = -EINVAL; + goto err; + } + + /* parse devargs */ + if (rte_devargs_parse(&da, devargs)) + goto err; + + ret = rte_eal_hotplug_add(da.bus->name, da.name, da.args); + if (ret < 0) + goto err; + + /* no point looking at the port count if no port exists */ + if (!rte_eth_dev_count_total()) { + RTE_ETHDEV_LOG(ERR, "No port found for device (%s)\n", da.name); + ret = -1; + goto err; + } + + /* if nothing happened, there is a bug here, since some driver told us + * it did attach a device, but did not create a port. 
+ * FIXME: race condition in case of plug-out of another device + */ + if (current == rte_eth_dev_count_total()) { + ret = -1; + goto err; + } + + *port_id = eth_dev_last_created_port; + ret = 0; + +err: + free(da.args); + return ret; +} + +/* detach the device, then store the name of the device */ +int +rte_eth_dev_detach(uint16_t port_id, char *name __rte_unused) +{ + struct rte_device *dev; + struct rte_bus *bus; + uint32_t dev_flags; + int ret = -1; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev_flags = rte_eth_devices[port_id].data->dev_flags; + if (dev_flags & RTE_ETH_DEV_BONDED_SLAVE) { + RTE_ETHDEV_LOG(ERR, + "Port %"PRIu16" is bonded, cannot detach\n", port_id); + return -ENOTSUP; + } + + dev = rte_eth_devices[port_id].device; + if (dev == NULL) + return -EINVAL; + + bus = rte_bus_find_by_device(dev); + if (bus == NULL) + return -ENOENT; + + ret = rte_eal_hotplug_remove(bus->name, dev->name); + if (ret < 0) + return ret; + + rte_eth_dev_release_port(&rte_eth_devices[port_id]); + return 0; +} + +static int +rte_eth_dev_rx_queue_config(struct rte_eth_dev *dev, uint16_t nb_queues) +{ + uint16_t old_nb_queues = dev->data->nb_rx_queues; + void **rxq; + unsigned i; + + if (dev->data->rx_queues == NULL && nb_queues != 0) { /* first time configuration */ + dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues", + sizeof(dev->data->rx_queues[0]) * nb_queues, + RTE_CACHE_LINE_SIZE); + if (dev->data->rx_queues == NULL) { + dev->data->nb_rx_queues = 0; + return -(ENOMEM); + } + } else if (dev->data->rx_queues != NULL && nb_queues != 0) { /* re-configure */ + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_release, -ENOTSUP); + + rxq = dev->data->rx_queues; + + for (i = nb_queues; i < old_nb_queues; i++) + (*dev->dev_ops->rx_queue_release)(rxq[i]); + rxq = rte_realloc(rxq, sizeof(rxq[0]) * nb_queues, + RTE_CACHE_LINE_SIZE); + if (rxq == NULL) + return -(ENOMEM); + if (nb_queues > old_nb_queues) { + uint16_t new_qs = nb_queues - old_nb_queues; + + memset(rxq + old_nb_queues, 0, + sizeof(rxq[0]) * new_qs); + } + + dev->data->rx_queues = rxq; + + } else if (dev->data->rx_queues != NULL && nb_queues == 0) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_release, -ENOTSUP); + + rxq = dev->data->rx_queues; + + for (i = nb_queues; i < old_nb_queues; i++) + (*dev->dev_ops->rx_queue_release)(rxq[i]); + + rte_free(dev->data->rx_queues); + dev->data->rx_queues = NULL; + } + dev->data->nb_rx_queues = nb_queues; + return 0; +} + +int +rte_eth_dev_rx_queue_start(uint16_t port_id, uint16_t rx_queue_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + if (!dev->data->dev_started) { + RTE_ETHDEV_LOG(ERR, + "Port %u must be started before start any queue\n", + port_id); + return -EINVAL; + } + + if (rx_queue_id >= dev->data->nb_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_start, -ENOTSUP); + + if (dev->data->rx_queue_state[rx_queue_id] != RTE_ETH_QUEUE_STATE_STOPPED) { + RTE_ETHDEV_LOG(INFO, + "Queue %"PRIu16" of device with port_id=%"PRIu16" already started\n", + rx_queue_id, port_id); + return 0; + } + + return eth_err(port_id, dev->dev_ops->rx_queue_start(dev, + rx_queue_id)); + +} + +int +rte_eth_dev_rx_queue_stop(uint16_t port_id, uint16_t rx_queue_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + if (rx_queue_id >= 
dev->data->nb_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_stop, -ENOTSUP); + + if (dev->data->rx_queue_state[rx_queue_id] == RTE_ETH_QUEUE_STATE_STOPPED) { + RTE_ETHDEV_LOG(INFO, + "Queue %"PRIu16" of device with port_id=%"PRIu16" already stopped\n", + rx_queue_id, port_id); + return 0; + } + + return eth_err(port_id, dev->dev_ops->rx_queue_stop(dev, rx_queue_id)); + +} + +int +rte_eth_dev_tx_queue_start(uint16_t port_id, uint16_t tx_queue_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + if (!dev->data->dev_started) { + RTE_ETHDEV_LOG(ERR, + "Port %u must be started before start any queue\n", + port_id); + return -EINVAL; + } + + if (tx_queue_id >= dev->data->nb_tx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_start, -ENOTSUP); + + if (dev->data->tx_queue_state[tx_queue_id] != RTE_ETH_QUEUE_STATE_STOPPED) { + RTE_ETHDEV_LOG(INFO, + "Queue %"PRIu16" of device with port_id=%"PRIu16" already started\n", + tx_queue_id, port_id); + return 0; + } + + return eth_err(port_id, dev->dev_ops->tx_queue_start(dev, tx_queue_id)); +} + +int +rte_eth_dev_tx_queue_stop(uint16_t port_id, uint16_t tx_queue_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + if (tx_queue_id >= dev->data->nb_tx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_stop, -ENOTSUP); + + if (dev->data->tx_queue_state[tx_queue_id] == RTE_ETH_QUEUE_STATE_STOPPED) { + RTE_ETHDEV_LOG(INFO, + "Queue %"PRIu16" of device with port_id=%"PRIu16" already stopped\n", + tx_queue_id, port_id); + return 0; + } + + return eth_err(port_id, dev->dev_ops->tx_queue_stop(dev, tx_queue_id)); + +} + +static int +rte_eth_dev_tx_queue_config(struct rte_eth_dev *dev, uint16_t nb_queues) +{ + uint16_t old_nb_queues = dev->data->nb_tx_queues; + void **txq; + unsigned i; + + if (dev->data->tx_queues == NULL && nb_queues != 0) { /* first time configuration */ + dev->data->tx_queues = rte_zmalloc("ethdev->tx_queues", + sizeof(dev->data->tx_queues[0]) * nb_queues, + RTE_CACHE_LINE_SIZE); + if (dev->data->tx_queues == NULL) { + dev->data->nb_tx_queues = 0; + return -(ENOMEM); + } + } else if (dev->data->tx_queues != NULL && nb_queues != 0) { /* re-configure */ + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_release, -ENOTSUP); + + txq = dev->data->tx_queues; + + for (i = nb_queues; i < old_nb_queues; i++) + (*dev->dev_ops->tx_queue_release)(txq[i]); + txq = rte_realloc(txq, sizeof(txq[0]) * nb_queues, + RTE_CACHE_LINE_SIZE); + if (txq == NULL) + return -ENOMEM; + if (nb_queues > old_nb_queues) { + uint16_t new_qs = nb_queues - old_nb_queues; + + memset(txq + old_nb_queues, 0, + sizeof(txq[0]) * new_qs); + } + + dev->data->tx_queues = txq; + + } else if (dev->data->tx_queues != NULL && nb_queues == 0) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_release, -ENOTSUP); + + txq = dev->data->tx_queues; + + for (i = nb_queues; i < old_nb_queues; i++) + (*dev->dev_ops->tx_queue_release)(txq[i]); + + rte_free(dev->data->tx_queues); + dev->data->tx_queues = NULL; + } + dev->data->nb_tx_queues = nb_queues; + return 0; +} + +uint32_t +rte_eth_speed_bitflag(uint32_t speed, int duplex) +{ + switch (speed) { + case 
ETH_SPEED_NUM_10M: + return duplex ? ETH_LINK_SPEED_10M : ETH_LINK_SPEED_10M_HD; + case ETH_SPEED_NUM_100M: + return duplex ? ETH_LINK_SPEED_100M : ETH_LINK_SPEED_100M_HD; + case ETH_SPEED_NUM_1G: + return ETH_LINK_SPEED_1G; + case ETH_SPEED_NUM_2_5G: + return ETH_LINK_SPEED_2_5G; + case ETH_SPEED_NUM_5G: + return ETH_LINK_SPEED_5G; + case ETH_SPEED_NUM_10G: + return ETH_LINK_SPEED_10G; + case ETH_SPEED_NUM_20G: + return ETH_LINK_SPEED_20G; + case ETH_SPEED_NUM_25G: + return ETH_LINK_SPEED_25G; + case ETH_SPEED_NUM_40G: + return ETH_LINK_SPEED_40G; + case ETH_SPEED_NUM_50G: + return ETH_LINK_SPEED_50G; + case ETH_SPEED_NUM_56G: + return ETH_LINK_SPEED_56G; + case ETH_SPEED_NUM_100G: + return ETH_LINK_SPEED_100G; + default: + return 0; + } +} + +const char * __rte_experimental +rte_eth_dev_rx_offload_name(uint64_t offload) +{ + const char *name = "UNKNOWN"; + unsigned int i; + + for (i = 0; i < RTE_DIM(rte_rx_offload_names); ++i) { + if (offload == rte_rx_offload_names[i].offload) { + name = rte_rx_offload_names[i].name; + break; + } + } + + return name; +} + +const char * __rte_experimental +rte_eth_dev_tx_offload_name(uint64_t offload) +{ + const char *name = "UNKNOWN"; + unsigned int i; + + for (i = 0; i < RTE_DIM(rte_tx_offload_names); ++i) { + if (offload == rte_tx_offload_names[i].offload) { + name = rte_tx_offload_names[i].name; + break; + } + } + + return name; +} + +int +rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q, + const struct rte_eth_conf *dev_conf) +{ + struct rte_eth_dev *dev; + struct rte_eth_dev_info dev_info; + struct rte_eth_conf local_conf = *dev_conf; + int diag; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP); + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP); + + rte_eth_dev_info_get(port_id, &dev_info); + + /* If number of queues specified by application for both Rx and Tx is + * zero, use driver preferred values. This cannot be done individually + * as it is valid for either Tx or Rx (but not both) to be zero. + * If driver does not provide any preferred valued, fall back on + * EAL defaults. + */ + if (nb_rx_q == 0 && nb_tx_q == 0) { + nb_rx_q = dev_info.default_rxportconf.nb_queues; + if (nb_rx_q == 0) + nb_rx_q = RTE_ETH_DEV_FALLBACK_RX_NBQUEUES; + nb_tx_q = dev_info.default_txportconf.nb_queues; + if (nb_tx_q == 0) + nb_tx_q = RTE_ETH_DEV_FALLBACK_TX_NBQUEUES; + } + + if (nb_rx_q > RTE_MAX_QUEUES_PER_PORT) { + RTE_ETHDEV_LOG(ERR, + "Number of RX queues requested (%u) is greater than max supported(%d)\n", + nb_rx_q, RTE_MAX_QUEUES_PER_PORT); + return -EINVAL; + } + + if (nb_tx_q > RTE_MAX_QUEUES_PER_PORT) { + RTE_ETHDEV_LOG(ERR, + "Number of TX queues requested (%u) is greater than max supported(%d)\n", + nb_tx_q, RTE_MAX_QUEUES_PER_PORT); + return -EINVAL; + } + + if (dev->data->dev_started) { + RTE_ETHDEV_LOG(ERR, + "Port %u must be stopped to allow configuration\n", + port_id); + return -EBUSY; + } + + /* Copy the dev_conf parameter into the dev structure */ + memcpy(&dev->data->dev_conf, &local_conf, sizeof(dev->data->dev_conf)); + + /* + * Check that the numbers of RX and TX queues are not greater + * than the maximum number of RX and TX queues supported by the + * configured device. 
+ */ + if (nb_rx_q > dev_info.max_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Ethdev port_id=%u nb_rx_queues=%u > %u\n", + port_id, nb_rx_q, dev_info.max_rx_queues); + return -EINVAL; + } + + if (nb_tx_q > dev_info.max_tx_queues) { + RTE_ETHDEV_LOG(ERR, "Ethdev port_id=%u nb_tx_queues=%u > %u\n", + port_id, nb_tx_q, dev_info.max_tx_queues); + return -EINVAL; + } + + /* Check that the device supports requested interrupts */ + if ((dev_conf->intr_conf.lsc == 1) && + (!(dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC))) { + RTE_ETHDEV_LOG(ERR, "Driver %s does not support lsc\n", + dev->device->driver->name); + return -EINVAL; + } + if ((dev_conf->intr_conf.rmv == 1) && + (!(dev->data->dev_flags & RTE_ETH_DEV_INTR_RMV))) { + RTE_ETHDEV_LOG(ERR, "Driver %s does not support rmv\n", + dev->device->driver->name); + return -EINVAL; + } + + /* + * If jumbo frames are enabled, check that the maximum RX packet + * length is supported by the configured device. + */ + if (local_conf.rxmode.offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) { + if (dev_conf->rxmode.max_rx_pkt_len > dev_info.max_rx_pktlen) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u max_rx_pkt_len %u > max valid value %u\n", + port_id, dev_conf->rxmode.max_rx_pkt_len, + dev_info.max_rx_pktlen); + return -EINVAL; + } else if (dev_conf->rxmode.max_rx_pkt_len < ETHER_MIN_LEN) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u max_rx_pkt_len %u < min valid value %u\n", + port_id, dev_conf->rxmode.max_rx_pkt_len, + (unsigned)ETHER_MIN_LEN); + return -EINVAL; + } + } else { + if (dev_conf->rxmode.max_rx_pkt_len < ETHER_MIN_LEN || + dev_conf->rxmode.max_rx_pkt_len > ETHER_MAX_LEN) + /* Use default value */ + dev->data->dev_conf.rxmode.max_rx_pkt_len = + ETHER_MAX_LEN; + } + + /* Any requested offloading must be within its device capabilities */ + if ((local_conf.rxmode.offloads & dev_info.rx_offload_capa) != + local_conf.rxmode.offloads) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u requested Rx offloads 0x%"PRIx64" doesn't match Rx offloads " + "capabilities 0x%"PRIx64" in %s()\n", + port_id, local_conf.rxmode.offloads, + dev_info.rx_offload_capa, + __func__); + return -EINVAL; + } + if ((local_conf.txmode.offloads & dev_info.tx_offload_capa) != + local_conf.txmode.offloads) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u requested Tx offloads 0x%"PRIx64" doesn't match Tx offloads " + "capabilities 0x%"PRIx64" in %s()\n", + port_id, local_conf.txmode.offloads, + dev_info.tx_offload_capa, + __func__); + return -EINVAL; + } + + if ((local_conf.rxmode.offloads & DEV_RX_OFFLOAD_CRC_STRIP) && + (local_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)) { + RTE_ETHDEV_LOG(ERR, + "Port id=%u not allowed to set both CRC STRIP and KEEP CRC offload flags\n", + port_id); + return -EINVAL; + } + + /* Check that device supports requested rss hash functions. */ + if ((dev_info.flow_type_rss_offloads | + dev_conf->rx_adv_conf.rss_conf.rss_hf) != + dev_info.flow_type_rss_offloads) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u invalid rss_hf: 0x%"PRIx64", valid value: 0x%"PRIx64"\n", + port_id, dev_conf->rx_adv_conf.rss_conf.rss_hf, + dev_info.flow_type_rss_offloads); + return -EINVAL; + } + + /* + * Setup new number of RX/TX queues and reconfigure device. 
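+ * If any of the steps below fails, the queue arrays that were + * already set up are released again (their queue counts drop back + * to zero) before the error is returned, so a failed configure + * leaves the port without configured queues.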
+ */ + diag = rte_eth_dev_rx_queue_config(dev, nb_rx_q); + if (diag != 0) { + RTE_ETHDEV_LOG(ERR, + "Port%u rte_eth_dev_rx_queue_config = %d\n", + port_id, diag); + return diag; + } + + diag = rte_eth_dev_tx_queue_config(dev, nb_tx_q); + if (diag != 0) { + RTE_ETHDEV_LOG(ERR, + "Port%u rte_eth_dev_tx_queue_config = %d\n", + port_id, diag); + rte_eth_dev_rx_queue_config(dev, 0); + return diag; + } + + diag = (*dev->dev_ops->dev_configure)(dev); + if (diag != 0) { + RTE_ETHDEV_LOG(ERR, "Port%u dev_configure = %d\n", + port_id, diag); + rte_eth_dev_rx_queue_config(dev, 0); + rte_eth_dev_tx_queue_config(dev, 0); + return eth_err(port_id, diag); + } + + /* Initialize Rx profiling if enabled at compilation time. */ + diag = __rte_eth_profile_rx_init(port_id, dev); + if (diag != 0) { + RTE_ETHDEV_LOG(ERR, "Port%u __rte_eth_profile_rx_init = %d\n", + port_id, diag); + rte_eth_dev_rx_queue_config(dev, 0); + rte_eth_dev_tx_queue_config(dev, 0); + return eth_err(port_id, diag); + } + + return 0; +} + +void +_rte_eth_dev_reset(struct rte_eth_dev *dev) +{ + if (dev->data->dev_started) { + RTE_ETHDEV_LOG(ERR, "Port %u must be stopped to allow reset\n", + dev->data->port_id); + return; + } + + rte_eth_dev_rx_queue_config(dev, 0); + rte_eth_dev_tx_queue_config(dev, 0); + + memset(&dev->data->dev_conf, 0, sizeof(dev->data->dev_conf)); +} + +static void +rte_eth_dev_config_restore(uint16_t port_id) +{ + struct rte_eth_dev *dev; + struct rte_eth_dev_info dev_info; + struct ether_addr *addr; + uint16_t i; + uint32_t pool = 0; + uint64_t pool_mask; + + dev = &rte_eth_devices[port_id]; + + rte_eth_dev_info_get(port_id, &dev_info); + + /* replay MAC address configuration including default MAC */ + addr = &dev->data->mac_addrs[0]; + if (*dev->dev_ops->mac_addr_set != NULL) + (*dev->dev_ops->mac_addr_set)(dev, addr); + else if (*dev->dev_ops->mac_addr_add != NULL) + (*dev->dev_ops->mac_addr_add)(dev, addr, 0, pool); + + if (*dev->dev_ops->mac_addr_add != NULL) { + for (i = 1; i < dev_info.max_mac_addrs; i++) { + addr = &dev->data->mac_addrs[i]; + + /* skip zero address */ + if (is_zero_ether_addr(addr)) + continue; + + pool = 0; + pool_mask = dev->data->mac_pool_sel[i]; + + do { + if (pool_mask & 1ULL) + (*dev->dev_ops->mac_addr_add)(dev, + addr, i, pool); + pool_mask >>= 1; + pool++; + } while (pool_mask); + } + } + + /* replay promiscuous configuration */ + if (rte_eth_promiscuous_get(port_id) == 1) + rte_eth_promiscuous_enable(port_id); + else if (rte_eth_promiscuous_get(port_id) == 0) + rte_eth_promiscuous_disable(port_id); + + /* replay all multicast configuration */ + if (rte_eth_allmulticast_get(port_id) == 1) + rte_eth_allmulticast_enable(port_id); + else if (rte_eth_allmulticast_get(port_id) == 0) + rte_eth_allmulticast_disable(port_id); +} + +int +rte_eth_dev_start(uint16_t port_id) +{ + struct rte_eth_dev *dev; + int diag; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_start, -ENOTSUP); + + if (dev->data->dev_started != 0) { + RTE_ETHDEV_LOG(INFO, + "Device with port_id=%"PRIu16" already started\n", + port_id); + return 0; + } + + diag = (*dev->dev_ops->dev_start)(dev); + if (diag == 0) + dev->data->dev_started = 1; + else + return eth_err(port_id, diag); + + rte_eth_dev_config_restore(port_id); + + if (dev->data->dev_conf.intr_conf.lsc == 0) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->link_update, -ENOTSUP); + (*dev->dev_ops->link_update)(dev, 0); + } + return 0; +} + +void +rte_eth_dev_stop(uint16_t port_id) +{ 
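+ /* + * Stopping is idempotent: calling this on an already stopped port + * only logs an informational message. An illustrative (not mandated + * here) teardown order from application code is rte_eth_dev_stop(port_id) + * followed by rte_eth_dev_close(port_id), which also releases the + * queue arrays. + */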
+ struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_stop); + + if (dev->data->dev_started == 0) { + RTE_ETHDEV_LOG(INFO, + "Device with port_id=%"PRIu16" already stopped\n", + port_id); + return; + } + + dev->data->dev_started = 0; + (*dev->dev_ops->dev_stop)(dev); +} + +int +rte_eth_dev_set_link_up(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_set_link_up, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->dev_set_link_up)(dev)); +} + +int +rte_eth_dev_set_link_down(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_set_link_down, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->dev_set_link_down)(dev)); +} + +void +rte_eth_dev_close(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_close); + dev->data->dev_started = 0; + (*dev->dev_ops->dev_close)(dev); + + dev->data->nb_rx_queues = 0; + rte_free(dev->data->rx_queues); + dev->data->rx_queues = NULL; + dev->data->nb_tx_queues = 0; + rte_free(dev->data->tx_queues); + dev->data->tx_queues = NULL; +} + +int +rte_eth_dev_reset(uint16_t port_id) +{ + struct rte_eth_dev *dev; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP); + + rte_eth_dev_stop(port_id); + ret = dev->dev_ops->dev_reset(dev); + + return eth_err(port_id, ret); +} + +int __rte_experimental +rte_eth_dev_is_removed(uint16_t port_id) +{ + struct rte_eth_dev *dev; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0); + + dev = &rte_eth_devices[port_id]; + + if (dev->state == RTE_ETH_DEV_REMOVED) + return 1; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->is_removed, 0); + + ret = dev->dev_ops->is_removed(dev); + if (ret != 0) + /* Device is physically removed. */ + dev->state = RTE_ETH_DEV_REMOVED; + + return ret; +} + +int +rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, + uint16_t nb_rx_desc, unsigned int socket_id, + const struct rte_eth_rxconf *rx_conf, + struct rte_mempool *mp) +{ + int ret; + uint32_t mbp_buf_size; + struct rte_eth_dev *dev; + struct rte_eth_dev_info dev_info; + struct rte_eth_rxconf local_conf; + void **rxq; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + if (rx_queue_id >= dev->data->nb_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP); + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_setup, -ENOTSUP); + + /* + * Check the size of the mbuf data buffer. + * This value must be provided in the private data of the memory pool. + * First check that the memory pool has a valid private data. 
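+ * A pool created with rte_pktmbuf_pool_create() provides that private + * data and, with RTE_MBUF_DEFAULT_BUF_SIZE as data room, typically + * also satisfies the minimum buffer size check below; for illustration + * only (name and counts are examples, not requirements): + * rte_pktmbuf_pool_create("rx_pool", 8192, 256, 0, + * RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());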
+ */ + rte_eth_dev_info_get(port_id, &dev_info); + if (mp->private_data_size < sizeof(struct rte_pktmbuf_pool_private)) { + RTE_ETHDEV_LOG(ERR, "%s private_data_size %d < %d\n", + mp->name, (int)mp->private_data_size, + (int)sizeof(struct rte_pktmbuf_pool_private)); + return -ENOSPC; + } + mbp_buf_size = rte_pktmbuf_data_room_size(mp); + + if ((mbp_buf_size - RTE_PKTMBUF_HEADROOM) < dev_info.min_rx_bufsize) { + RTE_ETHDEV_LOG(ERR, + "%s mbuf_data_room_size %d < %d (RTE_PKTMBUF_HEADROOM=%d + min_rx_bufsize(dev)=%d)\n", + mp->name, (int)mbp_buf_size, + (int)(RTE_PKTMBUF_HEADROOM + dev_info.min_rx_bufsize), + (int)RTE_PKTMBUF_HEADROOM, + (int)dev_info.min_rx_bufsize); + return -EINVAL; + } + + /* Use default specified by driver, if nb_rx_desc is zero */ + if (nb_rx_desc == 0) { + nb_rx_desc = dev_info.default_rxportconf.ring_size; + /* If driver default is also zero, fall back on EAL default */ + if (nb_rx_desc == 0) + nb_rx_desc = RTE_ETH_DEV_FALLBACK_RX_RINGSIZE; + } + + if (nb_rx_desc > dev_info.rx_desc_lim.nb_max || + nb_rx_desc < dev_info.rx_desc_lim.nb_min || + nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) { + + RTE_ETHDEV_LOG(ERR, + "Invalid value for nb_rx_desc(=%hu), should be: <= %hu, = %hu, and a product of %hu\n", + nb_rx_desc, dev_info.rx_desc_lim.nb_max, + dev_info.rx_desc_lim.nb_min, + dev_info.rx_desc_lim.nb_align); + return -EINVAL; + } + + if (dev->data->dev_started && + !(dev_info.dev_capa & + RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP)) + return -EBUSY; + + if (dev->data->dev_started && + (dev->data->rx_queue_state[rx_queue_id] != + RTE_ETH_QUEUE_STATE_STOPPED)) + return -EBUSY; + + rxq = dev->data->rx_queues; + if (rxq[rx_queue_id]) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_release, + -ENOTSUP); + (*dev->dev_ops->rx_queue_release)(rxq[rx_queue_id]); + rxq[rx_queue_id] = NULL; + } + + if (rx_conf == NULL) + rx_conf = &dev_info.default_rxconf; + + local_conf = *rx_conf; + + /* + * If an offloading has already been enabled in + * rte_eth_dev_configure(), it has been enabled on all queues, + * so there is no need to enable it in this queue again. + * The local_conf.offloads input to underlying PMD only carries + * those offloadings which are only enabled on this queue and + * not enabled on all queues. + */ + local_conf.offloads &= ~dev->data->dev_conf.rxmode.offloads; + + /* + * New added offloadings for this queue are those not enabled in + * rte_eth_dev_configure() and they must be per-queue type. + * A pure per-port offloading can't be enabled on a queue while + * disabled on another queue. A pure per-port offloading can't + * be enabled for any queue as new added one if it hasn't been + * enabled in rte_eth_dev_configure(). 
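+ * As an illustration: an offload such as DEV_RX_OFFLOAD_SCATTER that + * was not requested port-wide in rte_eth_dev_configure() is accepted + * here only if the driver lists it in rx_queue_offload_capa, while + * anything already enabled port-wide has been masked out above and + * need not be repeated.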
+ */ + if ((local_conf.offloads & dev_info.rx_queue_offload_capa) != + local_conf.offloads) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%d rx_queue_id=%d, new added offloads 0x%"PRIx64" must be " + "within pre-queue offload capabilities 0x%"PRIx64" in %s()\n", + port_id, rx_queue_id, local_conf.offloads, + dev_info.rx_queue_offload_capa, + __func__); + return -EINVAL; + } + + ret = (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc, + socket_id, &local_conf, mp); + if (!ret) { + if (!dev->data->min_rx_buf_size || + dev->data->min_rx_buf_size > mbp_buf_size) + dev->data->min_rx_buf_size = mbp_buf_size; + } + + return eth_err(port_id, ret); +} + +int +rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id, + uint16_t nb_tx_desc, unsigned int socket_id, + const struct rte_eth_txconf *tx_conf) +{ + struct rte_eth_dev *dev; + struct rte_eth_dev_info dev_info; + struct rte_eth_txconf local_conf; + void **txq; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + if (tx_queue_id >= dev->data->nb_tx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP); + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_setup, -ENOTSUP); + + rte_eth_dev_info_get(port_id, &dev_info); + + /* Use default specified by driver, if nb_tx_desc is zero */ + if (nb_tx_desc == 0) { + nb_tx_desc = dev_info.default_txportconf.ring_size; + /* If driver default is zero, fall back on EAL default */ + if (nb_tx_desc == 0) + nb_tx_desc = RTE_ETH_DEV_FALLBACK_TX_RINGSIZE; + } + if (nb_tx_desc > dev_info.tx_desc_lim.nb_max || + nb_tx_desc < dev_info.tx_desc_lim.nb_min || + nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) { + RTE_ETHDEV_LOG(ERR, + "Invalid value for nb_tx_desc(=%hu), should be: <= %hu, = %hu, and a product of %hu\n", + nb_tx_desc, dev_info.tx_desc_lim.nb_max, + dev_info.tx_desc_lim.nb_min, + dev_info.tx_desc_lim.nb_align); + return -EINVAL; + } + + if (dev->data->dev_started && + !(dev_info.dev_capa & + RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP)) + return -EBUSY; + + if (dev->data->dev_started && + (dev->data->tx_queue_state[tx_queue_id] != + RTE_ETH_QUEUE_STATE_STOPPED)) + return -EBUSY; + + txq = dev->data->tx_queues; + if (txq[tx_queue_id]) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_release, + -ENOTSUP); + (*dev->dev_ops->tx_queue_release)(txq[tx_queue_id]); + txq[tx_queue_id] = NULL; + } + + if (tx_conf == NULL) + tx_conf = &dev_info.default_txconf; + + local_conf = *tx_conf; + + /* + * If an offloading has already been enabled in + * rte_eth_dev_configure(), it has been enabled on all queues, + * so there is no need to enable it in this queue again. + * The local_conf.offloads input to underlying PMD only carries + * those offloadings which are only enabled on this queue and + * not enabled on all queues. + */ + local_conf.offloads &= ~dev->data->dev_conf.txmode.offloads; + + /* + * New added offloadings for this queue are those not enabled in + * rte_eth_dev_configure() and they must be per-queue type. + * A pure per-port offloading can't be enabled on a queue while + * disabled on another queue. A pure per-port offloading can't + * be enabled for any queue as new added one if it hasn't been + * enabled in rte_eth_dev_configure(). 
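+ * A Tx offload commonly negotiated per queue this way is + * DEV_TX_OFFLOAD_MBUF_FAST_FREE, provided the driver reports it in + * tx_queue_offload_capa (illustrative; driver support varies).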
+ */ + if ((local_conf.offloads & dev_info.tx_queue_offload_capa) != + local_conf.offloads) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%d tx_queue_id=%d, new added offloads 0x%"PRIx64" must be " + "within pre-queue offload capabilities 0x%"PRIx64" in %s()\n", + port_id, tx_queue_id, local_conf.offloads, + dev_info.tx_queue_offload_capa, + __func__); + return -EINVAL; + } + + return eth_err(port_id, (*dev->dev_ops->tx_queue_setup)(dev, + tx_queue_id, nb_tx_desc, socket_id, &local_conf)); +} + +void +rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t unsent, + void *userdata __rte_unused) +{ + unsigned i; + + for (i = 0; i < unsent; i++) + rte_pktmbuf_free(pkts[i]); +} + +void +rte_eth_tx_buffer_count_callback(struct rte_mbuf **pkts, uint16_t unsent, + void *userdata) +{ + uint64_t *count = userdata; + unsigned i; + + for (i = 0; i < unsent; i++) + rte_pktmbuf_free(pkts[i]); + + *count += unsent; +} + +int +rte_eth_tx_buffer_set_err_callback(struct rte_eth_dev_tx_buffer *buffer, + buffer_tx_error_fn cbfn, void *userdata) +{ + buffer->error_callback = cbfn; + buffer->error_userdata = userdata; + return 0; +} + +int +rte_eth_tx_buffer_init(struct rte_eth_dev_tx_buffer *buffer, uint16_t size) +{ + int ret = 0; + + if (buffer == NULL) + return -EINVAL; + + buffer->size = size; + if (buffer->error_callback == NULL) { + ret = rte_eth_tx_buffer_set_err_callback( + buffer, rte_eth_tx_buffer_drop_callback, NULL); + } + + return ret; +} + +int +rte_eth_tx_done_cleanup(uint16_t port_id, uint16_t queue_id, uint32_t free_cnt) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + int ret; + + /* Validate Input Data. Bail if not valid or not supported. */ + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_done_cleanup, -ENOTSUP); + + /* Call driver to free pending mbufs. 
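+ * A free_cnt of 0 asks the driver to free as many completed mbufs as + * possible; a non-zero value caps how many are freed in this call.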
*/ + ret = (*dev->dev_ops->tx_done_cleanup)(dev->data->tx_queues[queue_id], + free_cnt); + return eth_err(port_id, ret); +} + +void +rte_eth_promiscuous_enable(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->promiscuous_enable); + (*dev->dev_ops->promiscuous_enable)(dev); + dev->data->promiscuous = 1; +} + +void +rte_eth_promiscuous_disable(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->promiscuous_disable); + dev->data->promiscuous = 0; + (*dev->dev_ops->promiscuous_disable)(dev); +} + +int +rte_eth_promiscuous_get(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + return dev->data->promiscuous; +} + +void +rte_eth_allmulticast_enable(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->allmulticast_enable); + (*dev->dev_ops->allmulticast_enable)(dev); + dev->data->all_multicast = 1; +} + +void +rte_eth_allmulticast_disable(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->allmulticast_disable); + dev->data->all_multicast = 0; + (*dev->dev_ops->allmulticast_disable)(dev); +} + +int +rte_eth_allmulticast_get(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + return dev->data->all_multicast; +} + +void +rte_eth_link_get(uint16_t port_id, struct rte_eth_link *eth_link) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + if (dev->data->dev_conf.intr_conf.lsc && + dev->data->dev_started) + rte_eth_linkstatus_get(dev, eth_link); + else { + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->link_update); + (*dev->dev_ops->link_update)(dev, 1); + *eth_link = dev->data->dev_link; + } +} + +void +rte_eth_link_get_nowait(uint16_t port_id, struct rte_eth_link *eth_link) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + if (dev->data->dev_conf.intr_conf.lsc && + dev->data->dev_started) + rte_eth_linkstatus_get(dev, eth_link); + else { + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->link_update); + (*dev->dev_ops->link_update)(dev, 0); + *eth_link = dev->data->dev_link; + } +} + +int +rte_eth_stats_get(uint16_t port_id, struct rte_eth_stats *stats) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + memset(stats, 0, sizeof(*stats)); + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP); + stats->rx_nombuf = dev->data->rx_mbuf_alloc_failed; + return eth_err(port_id, (*dev->dev_ops->stats_get)(dev, stats)); +} + +int +rte_eth_stats_reset(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP); + (*dev->dev_ops->stats_reset)(dev); + dev->data->rx_mbuf_alloc_failed = 0; + + return 0; +} + +static inline int +get_xstats_basic_count(struct rte_eth_dev *dev) +{ + uint16_t nb_rxqs, nb_txqs; + int count; + + nb_rxqs = RTE_MIN(dev->data->nb_rx_queues, 
RTE_ETHDEV_QUEUE_STAT_CNTRS); + nb_txqs = RTE_MIN(dev->data->nb_tx_queues, RTE_ETHDEV_QUEUE_STAT_CNTRS); + + count = RTE_NB_STATS; + count += nb_rxqs * RTE_NB_RXQ_STATS; + count += nb_txqs * RTE_NB_TXQ_STATS; + + return count; +} + +static int +get_xstats_count(uint16_t port_id) +{ + struct rte_eth_dev *dev; + int count; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + dev = &rte_eth_devices[port_id]; + if (dev->dev_ops->xstats_get_names_by_id != NULL) { + count = (*dev->dev_ops->xstats_get_names_by_id)(dev, NULL, + NULL, 0); + if (count < 0) + return eth_err(port_id, count); + } + if (dev->dev_ops->xstats_get_names != NULL) { + count = (*dev->dev_ops->xstats_get_names)(dev, NULL, 0); + if (count < 0) + return eth_err(port_id, count); + } else + count = 0; + + + count += get_xstats_basic_count(dev); + + return count; +} + +int +rte_eth_xstats_get_id_by_name(uint16_t port_id, const char *xstat_name, + uint64_t *id) +{ + int cnt_xstats, idx_xstat; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (!id) { + RTE_ETHDEV_LOG(ERR, "Id pointer is NULL\n"); + return -ENOMEM; + } + + if (!xstat_name) { + RTE_ETHDEV_LOG(ERR, "xstat_name pointer is NULL\n"); + return -ENOMEM; + } + + /* Get count */ + cnt_xstats = rte_eth_xstats_get_names_by_id(port_id, NULL, 0, NULL); + if (cnt_xstats < 0) { + RTE_ETHDEV_LOG(ERR, "Cannot get count of xstats\n"); + return -ENODEV; + } + + /* Get id-name lookup table */ + struct rte_eth_xstat_name xstats_names[cnt_xstats]; + + if (cnt_xstats != rte_eth_xstats_get_names_by_id( + port_id, xstats_names, cnt_xstats, NULL)) { + RTE_ETHDEV_LOG(ERR, "Cannot get xstats lookup\n"); + return -1; + } + + for (idx_xstat = 0; idx_xstat < cnt_xstats; idx_xstat++) { + if (!strcmp(xstats_names[idx_xstat].name, xstat_name)) { + *id = idx_xstat; + return 0; + }; + } + + return -EINVAL; +} + +/* retrieve basic stats names */ +static int +rte_eth_basic_stats_get_names(struct rte_eth_dev *dev, + struct rte_eth_xstat_name *xstats_names) +{ + int cnt_used_entries = 0; + uint32_t idx, id_queue; + uint16_t num_q; + + for (idx = 0; idx < RTE_NB_STATS; idx++) { + snprintf(xstats_names[cnt_used_entries].name, + sizeof(xstats_names[0].name), + "%s", rte_stats_strings[idx].name); + cnt_used_entries++; + } + num_q = RTE_MIN(dev->data->nb_rx_queues, RTE_ETHDEV_QUEUE_STAT_CNTRS); + for (id_queue = 0; id_queue < num_q; id_queue++) { + for (idx = 0; idx < RTE_NB_RXQ_STATS; idx++) { + snprintf(xstats_names[cnt_used_entries].name, + sizeof(xstats_names[0].name), + "rx_q%u%s", + id_queue, rte_rxq_stats_strings[idx].name); + cnt_used_entries++; + } + + } + num_q = RTE_MIN(dev->data->nb_tx_queues, RTE_ETHDEV_QUEUE_STAT_CNTRS); + for (id_queue = 0; id_queue < num_q; id_queue++) { + for (idx = 0; idx < RTE_NB_TXQ_STATS; idx++) { + snprintf(xstats_names[cnt_used_entries].name, + sizeof(xstats_names[0].name), + "tx_q%u%s", + id_queue, rte_txq_stats_strings[idx].name); + cnt_used_entries++; + } + } + return cnt_used_entries; +} + +/* retrieve ethdev extended statistics names */ +int +rte_eth_xstats_get_names_by_id(uint16_t port_id, + struct rte_eth_xstat_name *xstats_names, unsigned int size, + uint64_t *ids) +{ + struct rte_eth_xstat_name *xstats_names_copy; + unsigned int no_basic_stat_requested = 1; + unsigned int no_ext_stat_requested = 1; + unsigned int expected_entries; + unsigned int basic_count; + struct rte_eth_dev *dev; + unsigned int i; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + basic_count = get_xstats_basic_count(dev); 
+ ret = get_xstats_count(port_id); + if (ret < 0) + return ret; + expected_entries = (unsigned int)ret; + + /* Return max number of stats if no ids given */ + if (!ids) { + if (!xstats_names) + return expected_entries; + else if (xstats_names && size < expected_entries) + return expected_entries; + } + + if (ids && !xstats_names) + return -EINVAL; + + if (ids && dev->dev_ops->xstats_get_names_by_id != NULL && size > 0) { + uint64_t ids_copy[size]; + + for (i = 0; i < size; i++) { + if (ids[i] < basic_count) { + no_basic_stat_requested = 0; + break; + } + + /* + * Convert ids to xstats ids that PMD knows. + * ids known by user are basic + extended stats. + */ + ids_copy[i] = ids[i] - basic_count; + } + + if (no_basic_stat_requested) + return (*dev->dev_ops->xstats_get_names_by_id)(dev, + xstats_names, ids_copy, size); + } + + /* Retrieve all stats */ + if (!ids) { + int num_stats = rte_eth_xstats_get_names(port_id, xstats_names, + expected_entries); + if (num_stats < 0 || num_stats > (int)expected_entries) + return num_stats; + else + return expected_entries; + } + + xstats_names_copy = calloc(expected_entries, + sizeof(struct rte_eth_xstat_name)); + + if (!xstats_names_copy) { + RTE_ETHDEV_LOG(ERR, "Can't allocate memory\n"); + return -ENOMEM; + } + + if (ids) { + for (i = 0; i < size; i++) { + if (ids[i] >= basic_count) { + no_ext_stat_requested = 0; + break; + } + } + } + + /* Fill xstats_names_copy structure */ + if (ids && no_ext_stat_requested) { + rte_eth_basic_stats_get_names(dev, xstats_names_copy); + } else { + ret = rte_eth_xstats_get_names(port_id, xstats_names_copy, + expected_entries); + if (ret < 0) { + free(xstats_names_copy); + return ret; + } + } + + /* Filter stats */ + for (i = 0; i < size; i++) { + if (ids[i] >= expected_entries) { + RTE_ETHDEV_LOG(ERR, "Id value isn't valid\n"); + free(xstats_names_copy); + return -1; + } + xstats_names[i] = xstats_names_copy[ids[i]]; + } + + free(xstats_names_copy); + return size; +} + +int +rte_eth_xstats_get_names(uint16_t port_id, + struct rte_eth_xstat_name *xstats_names, + unsigned int size) +{ + struct rte_eth_dev *dev; + int cnt_used_entries; + int cnt_expected_entries; + int cnt_driver_entries; + + cnt_expected_entries = get_xstats_count(port_id); + if (xstats_names == NULL || cnt_expected_entries < 0 || + (int)size < cnt_expected_entries) + return cnt_expected_entries; + + /* port_id checked in get_xstats_count() */ + dev = &rte_eth_devices[port_id]; + + cnt_used_entries = rte_eth_basic_stats_get_names( + dev, xstats_names); + + if (dev->dev_ops->xstats_get_names != NULL) { + /* If there are any driver-specific xstats, append them + * to end of list. 
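+ * Their ids in the combined list therefore start at the basic-stats + * count, which is the same offset applied to driver-provided ids in + * rte_eth_xstats_get().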
+ */ + cnt_driver_entries = (*dev->dev_ops->xstats_get_names)( + dev, + xstats_names + cnt_used_entries, + size - cnt_used_entries); + if (cnt_driver_entries < 0) + return eth_err(port_id, cnt_driver_entries); + cnt_used_entries += cnt_driver_entries; + } + + return cnt_used_entries; +} + + +static int +rte_eth_basic_stats_get(uint16_t port_id, struct rte_eth_xstat *xstats) +{ + struct rte_eth_dev *dev; + struct rte_eth_stats eth_stats; + unsigned int count = 0, i, q; + uint64_t val, *stats_ptr; + uint16_t nb_rxqs, nb_txqs; + int ret; + + ret = rte_eth_stats_get(port_id, &eth_stats); + if (ret < 0) + return ret; + + dev = &rte_eth_devices[port_id]; + + nb_rxqs = RTE_MIN(dev->data->nb_rx_queues, RTE_ETHDEV_QUEUE_STAT_CNTRS); + nb_txqs = RTE_MIN(dev->data->nb_tx_queues, RTE_ETHDEV_QUEUE_STAT_CNTRS); + + /* global stats */ + for (i = 0; i < RTE_NB_STATS; i++) { + stats_ptr = RTE_PTR_ADD(&eth_stats, + rte_stats_strings[i].offset); + val = *stats_ptr; + xstats[count++].value = val; + } + + /* per-rxq stats */ + for (q = 0; q < nb_rxqs; q++) { + for (i = 0; i < RTE_NB_RXQ_STATS; i++) { + stats_ptr = RTE_PTR_ADD(&eth_stats, + rte_rxq_stats_strings[i].offset + + q * sizeof(uint64_t)); + val = *stats_ptr; + xstats[count++].value = val; + } + } + + /* per-txq stats */ + for (q = 0; q < nb_txqs; q++) { + for (i = 0; i < RTE_NB_TXQ_STATS; i++) { + stats_ptr = RTE_PTR_ADD(&eth_stats, + rte_txq_stats_strings[i].offset + + q * sizeof(uint64_t)); + val = *stats_ptr; + xstats[count++].value = val; + } + } + return count; +} + +/* retrieve ethdev extended statistics */ +int +rte_eth_xstats_get_by_id(uint16_t port_id, const uint64_t *ids, + uint64_t *values, unsigned int size) +{ + unsigned int no_basic_stat_requested = 1; + unsigned int no_ext_stat_requested = 1; + unsigned int num_xstats_filled; + unsigned int basic_count; + uint16_t expected_entries; + struct rte_eth_dev *dev; + unsigned int i; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + ret = get_xstats_count(port_id); + if (ret < 0) + return ret; + expected_entries = (uint16_t)ret; + struct rte_eth_xstat xstats[expected_entries]; + dev = &rte_eth_devices[port_id]; + basic_count = get_xstats_basic_count(dev); + + /* Return max number of stats if no ids given */ + if (!ids) { + if (!values) + return expected_entries; + else if (values && size < expected_entries) + return expected_entries; + } + + if (ids && !values) + return -EINVAL; + + if (ids && dev->dev_ops->xstats_get_by_id != NULL && size) { + unsigned int basic_count = get_xstats_basic_count(dev); + uint64_t ids_copy[size]; + + for (i = 0; i < size; i++) { + if (ids[i] < basic_count) { + no_basic_stat_requested = 0; + break; + } + + /* + * Convert ids to xstats ids that PMD knows. + * ids known by user are basic + extended stats.
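+ * For example (illustrative numbers only): with 8 basic stats, a + * user-visible id of 10 is passed to the PMD as id 2.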
+ */ + ids_copy[i] = ids[i] - basic_count; + } + + if (no_basic_stat_requested) + return (*dev->dev_ops->xstats_get_by_id)(dev, ids_copy, + values, size); + } + + if (ids) { + for (i = 0; i < size; i++) { + if (ids[i] >= basic_count) { + no_ext_stat_requested = 0; + break; + } + } + } + + /* Fill the xstats structure */ + if (ids && no_ext_stat_requested) + ret = rte_eth_basic_stats_get(port_id, xstats); + else + ret = rte_eth_xstats_get(port_id, xstats, expected_entries); + + if (ret < 0) + return ret; + num_xstats_filled = (unsigned int)ret; + + /* Return all stats */ + if (!ids) { + for (i = 0; i < num_xstats_filled; i++) + values[i] = xstats[i].value; + return expected_entries; + } + + /* Filter stats */ + for (i = 0; i < size; i++) { + if (ids[i] >= expected_entries) { + RTE_ETHDEV_LOG(ERR, "Id value isn't valid\n"); + return -1; + } + values[i] = xstats[ids[i]].value; + } + return size; +} + +int +rte_eth_xstats_get(uint16_t port_id, struct rte_eth_xstat *xstats, + unsigned int n) +{ + struct rte_eth_dev *dev; + unsigned int count = 0, i; + signed int xcount = 0; + uint16_t nb_rxqs, nb_txqs; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + + dev = &rte_eth_devices[port_id]; + + nb_rxqs = RTE_MIN(dev->data->nb_rx_queues, RTE_ETHDEV_QUEUE_STAT_CNTRS); + nb_txqs = RTE_MIN(dev->data->nb_tx_queues, RTE_ETHDEV_QUEUE_STAT_CNTRS); + + /* Return generic statistics */ + count = RTE_NB_STATS + (nb_rxqs * RTE_NB_RXQ_STATS) + + (nb_txqs * RTE_NB_TXQ_STATS); + + /* implemented by the driver */ + if (dev->dev_ops->xstats_get != NULL) { + /* Retrieve the xstats from the driver at the end of the + * xstats struct. + */ + xcount = (*dev->dev_ops->xstats_get)(dev, + xstats ? xstats + count : NULL, + (n > count) ? n - count : 0); + + if (xcount < 0) + return eth_err(port_id, xcount); + } + + if (n < count + xcount || xstats == NULL) + return count + xcount; + + /* now fill the xstats structure */ + ret = rte_eth_basic_stats_get(port_id, xstats); + if (ret < 0) + return ret; + count = ret; + + for (i = 0; i < count; i++) + xstats[i].id = i; + /* add an offset to driver-specific stats */ + for ( ; i < count + xcount; i++) + xstats[i].id += count; + + return count + xcount; +} + +/* reset ethdev extended statistics */ +void +rte_eth_xstats_reset(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + /* implemented by the driver */ + if (dev->dev_ops->xstats_reset != NULL) { + (*dev->dev_ops->xstats_reset)(dev); + return; + } + + /* fallback to default */ + rte_eth_stats_reset(port_id); +} + +static int +set_queue_stats_mapping(uint16_t port_id, uint16_t queue_id, uint8_t stat_idx, + uint8_t is_rx) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_stats_mapping_set, -ENOTSUP); + + if (is_rx && (queue_id >= dev->data->nb_rx_queues)) + return -EINVAL; + + if (!is_rx && (queue_id >= dev->data->nb_tx_queues)) + return -EINVAL; + + if (stat_idx >= RTE_ETHDEV_QUEUE_STAT_CNTRS) + return -EINVAL; + + return (*dev->dev_ops->queue_stats_mapping_set) + (dev, queue_id, stat_idx, is_rx); +} + + +int +rte_eth_dev_set_tx_queue_stats_mapping(uint16_t port_id, uint16_t tx_queue_id, + uint8_t stat_idx) +{ + return eth_err(port_id, set_queue_stats_mapping(port_id, tx_queue_id, + stat_idx, STAT_QMAP_TX)); +} + + +int +rte_eth_dev_set_rx_queue_stats_mapping(uint16_t port_id, uint16_t rx_queue_id, + uint8_t stat_idx) 
+{ + return eth_err(port_id, set_queue_stats_mapping(port_id, rx_queue_id, + stat_idx, STAT_QMAP_RX)); +} + +int +rte_eth_dev_fw_version_get(uint16_t port_id, char *fw_version, size_t fw_size) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->fw_version_get, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->fw_version_get)(dev, + fw_version, fw_size)); +} + +void +rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info) +{ + struct rte_eth_dev *dev; + const struct rte_eth_desc_lim lim = { + .nb_max = UINT16_MAX, + .nb_min = 0, + .nb_align = 1, + }; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + + memset(dev_info, 0, sizeof(struct rte_eth_dev_info)); + dev_info->rx_desc_lim = lim; + dev_info->tx_desc_lim = lim; + dev_info->device = dev->device; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_infos_get); + (*dev->dev_ops->dev_infos_get)(dev, dev_info); + dev_info->driver_name = dev->device->driver->name; + dev_info->nb_rx_queues = dev->data->nb_rx_queues; + dev_info->nb_tx_queues = dev->data->nb_tx_queues; + + dev_info->dev_flags = &dev->data->dev_flags; +} + +int +rte_eth_dev_get_supported_ptypes(uint16_t port_id, uint32_t ptype_mask, + uint32_t *ptypes, int num) +{ + int i, j; + struct rte_eth_dev *dev; + const uint32_t *all_ptypes; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_supported_ptypes_get, 0); + all_ptypes = (*dev->dev_ops->dev_supported_ptypes_get)(dev); + + if (!all_ptypes) + return 0; + + for (i = 0, j = 0; all_ptypes[i] != RTE_PTYPE_UNKNOWN; ++i) + if (all_ptypes[i] & ptype_mask) { + if (j < num) + ptypes[j] = all_ptypes[i]; + j++; + } + + return j; +} + +void +rte_eth_macaddr_get(uint16_t port_id, struct ether_addr *mac_addr) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_RET(port_id); + dev = &rte_eth_devices[port_id]; + ether_addr_copy(&dev->data->mac_addrs[0], mac_addr); +} + + +int +rte_eth_dev_get_mtu(uint16_t port_id, uint16_t *mtu) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + *mtu = dev->data->mtu; + return 0; +} + +int +rte_eth_dev_set_mtu(uint16_t port_id, uint16_t mtu) +{ + int ret; + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->mtu_set, -ENOTSUP); + + ret = (*dev->dev_ops->mtu_set)(dev, mtu); + if (!ret) + dev->data->mtu = mtu; + + return eth_err(port_id, ret); +} + +int +rte_eth_dev_vlan_filter(uint16_t port_id, uint16_t vlan_id, int on) +{ + struct rte_eth_dev *dev; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + if (!(dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_FILTER)) { + RTE_ETHDEV_LOG(ERR, "Port %u: vlan-filtering disabled\n", + port_id); + return -ENOSYS; + } + + if (vlan_id > 4095) { + RTE_ETHDEV_LOG(ERR, "Port_id=%u invalid vlan_id=%u > 4095\n", + port_id, vlan_id); + return -EINVAL; + } + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vlan_filter_set, -ENOTSUP); + + ret = (*dev->dev_ops->vlan_filter_set)(dev, vlan_id, on); + if (ret == 0) { + struct rte_vlan_filter_conf *vfc; + int vidx; + int vbit; + + vfc = &dev->data->vlan_filter_conf; + vidx = vlan_id / 64; + vbit = vlan_id % 64; + + if (on) + vfc->ids[vidx] |= UINT64_C(1) << vbit; + else + 
vfc->ids[vidx] &= ~(UINT64_C(1) << vbit); + } + + return eth_err(port_id, ret); +} + +int +rte_eth_dev_set_vlan_strip_on_queue(uint16_t port_id, uint16_t rx_queue_id, + int on) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + if (rx_queue_id >= dev->data->nb_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid rx_queue_id=%u\n", rx_queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vlan_strip_queue_set, -ENOTSUP); + (*dev->dev_ops->vlan_strip_queue_set)(dev, rx_queue_id, on); + + return 0; +} + +int +rte_eth_dev_set_vlan_ether_type(uint16_t port_id, + enum rte_vlan_type vlan_type, + uint16_t tpid) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vlan_tpid_set, -ENOTSUP); + + return eth_err(port_id, (*dev->dev_ops->vlan_tpid_set)(dev, vlan_type, + tpid)); +} + +int +rte_eth_dev_set_vlan_offload(uint16_t port_id, int offload_mask) +{ + struct rte_eth_dev *dev; + int ret = 0; + int mask = 0; + int cur, org = 0; + uint64_t orig_offloads; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + /* save original values in case of failure */ + orig_offloads = dev->data->dev_conf.rxmode.offloads; + + /*check which option changed by application*/ + cur = !!(offload_mask & ETH_VLAN_STRIP_OFFLOAD); + org = !!(dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_STRIP); + if (cur != org) { + if (cur) + dev->data->dev_conf.rxmode.offloads |= + DEV_RX_OFFLOAD_VLAN_STRIP; + else + dev->data->dev_conf.rxmode.offloads &= + ~DEV_RX_OFFLOAD_VLAN_STRIP; + mask |= ETH_VLAN_STRIP_MASK; + } + + cur = !!(offload_mask & ETH_VLAN_FILTER_OFFLOAD); + org = !!(dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_FILTER); + if (cur != org) { + if (cur) + dev->data->dev_conf.rxmode.offloads |= + DEV_RX_OFFLOAD_VLAN_FILTER; + else + dev->data->dev_conf.rxmode.offloads &= + ~DEV_RX_OFFLOAD_VLAN_FILTER; + mask |= ETH_VLAN_FILTER_MASK; + } + + cur = !!(offload_mask & ETH_VLAN_EXTEND_OFFLOAD); + org = !!(dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_EXTEND); + if (cur != org) { + if (cur) + dev->data->dev_conf.rxmode.offloads |= + DEV_RX_OFFLOAD_VLAN_EXTEND; + else + dev->data->dev_conf.rxmode.offloads &= + ~DEV_RX_OFFLOAD_VLAN_EXTEND; + mask |= ETH_VLAN_EXTEND_MASK; + } + + /*no change*/ + if (mask == 0) + return ret; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vlan_offload_set, -ENOTSUP); + ret = (*dev->dev_ops->vlan_offload_set)(dev, mask); + if (ret) { + /* hit an error restore original values */ + dev->data->dev_conf.rxmode.offloads = orig_offloads; + } + + return eth_err(port_id, ret); +} + +int +rte_eth_dev_get_vlan_offload(uint16_t port_id) +{ + struct rte_eth_dev *dev; + int ret = 0; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + if (dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_STRIP) + ret |= ETH_VLAN_STRIP_OFFLOAD; + + if (dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_FILTER) + ret |= ETH_VLAN_FILTER_OFFLOAD; + + if (dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_VLAN_EXTEND) + ret |= ETH_VLAN_EXTEND_OFFLOAD; + + return ret; +} + +int +rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int on) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + 
RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vlan_pvid_set, -ENOTSUP); + + return eth_err(port_id, (*dev->dev_ops->vlan_pvid_set)(dev, pvid, on)); +} + +int +rte_eth_dev_flow_ctrl_get(uint16_t port_id, struct rte_eth_fc_conf *fc_conf) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->flow_ctrl_get, -ENOTSUP); + memset(fc_conf, 0, sizeof(*fc_conf)); + return eth_err(port_id, (*dev->dev_ops->flow_ctrl_get)(dev, fc_conf)); +} + +int +rte_eth_dev_flow_ctrl_set(uint16_t port_id, struct rte_eth_fc_conf *fc_conf) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + if ((fc_conf->send_xon != 0) && (fc_conf->send_xon != 1)) { + RTE_ETHDEV_LOG(ERR, "Invalid send_xon, only 0/1 allowed\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->flow_ctrl_set, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->flow_ctrl_set)(dev, fc_conf)); +} + +int +rte_eth_dev_priority_flow_ctrl_set(uint16_t port_id, + struct rte_eth_pfc_conf *pfc_conf) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + if (pfc_conf->priority > (ETH_DCB_NUM_USER_PRIORITIES - 1)) { + RTE_ETHDEV_LOG(ERR, "Invalid priority, only 0-7 allowed\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + /* High water, low water validation are device specific */ + if (*dev->dev_ops->priority_flow_ctrl_set) + return eth_err(port_id, (*dev->dev_ops->priority_flow_ctrl_set) + (dev, pfc_conf)); + return -ENOTSUP; +} + +static int +rte_eth_check_reta_mask(struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size) +{ + uint16_t i, num; + + if (!reta_conf) + return -EINVAL; + + num = (reta_size + RTE_RETA_GROUP_SIZE - 1) / RTE_RETA_GROUP_SIZE; + for (i = 0; i < num; i++) { + if (reta_conf[i].mask) + return 0; + } + + return -EINVAL; +} + +static int +rte_eth_check_reta_entry(struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size, + uint16_t max_rxq) +{ + uint16_t i, idx, shift; + + if (!reta_conf) + return -EINVAL; + + if (max_rxq == 0) { + RTE_ETHDEV_LOG(ERR, "No receive queue is available\n"); + return -EINVAL; + } + + for (i = 0; i < reta_size; i++) { + idx = i / RTE_RETA_GROUP_SIZE; + shift = i % RTE_RETA_GROUP_SIZE; + if ((reta_conf[idx].mask & (1ULL << shift)) && + (reta_conf[idx].reta[shift] >= max_rxq)) { + RTE_ETHDEV_LOG(ERR, + "reta_conf[%u]->reta[%u]: %u exceeds the maximum rxq index: %u\n", + idx, shift, + reta_conf[idx].reta[shift], max_rxq); + return -EINVAL; + } + } + + return 0; +} + +int +rte_eth_dev_rss_reta_update(uint16_t port_id, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size) +{ + struct rte_eth_dev *dev; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + /* Check mask bits */ + ret = rte_eth_check_reta_mask(reta_conf, reta_size); + if (ret < 0) + return ret; + + dev = &rte_eth_devices[port_id]; + + /* Check entry value */ + ret = rte_eth_check_reta_entry(reta_conf, reta_size, + dev->data->nb_rx_queues); + if (ret < 0) + return ret; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->reta_update, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->reta_update)(dev, reta_conf, + reta_size)); +} + +int +rte_eth_dev_rss_reta_query(uint16_t port_id, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size) +{ + struct rte_eth_dev *dev; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + /* Check mask bits */ + ret = 
rte_eth_check_reta_mask(reta_conf, reta_size); + if (ret < 0) + return ret; + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->reta_query, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->reta_query)(dev, reta_conf, + reta_size)); +} + +int +rte_eth_dev_rss_hash_update(uint16_t port_id, + struct rte_eth_rss_conf *rss_conf) +{ + struct rte_eth_dev *dev; + struct rte_eth_dev_info dev_info = { .flow_type_rss_offloads = 0, }; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + rte_eth_dev_info_get(port_id, &dev_info); + if ((dev_info.flow_type_rss_offloads | rss_conf->rss_hf) != + dev_info.flow_type_rss_offloads) { + RTE_ETHDEV_LOG(ERR, + "Ethdev port_id=%u invalid rss_hf: 0x%"PRIx64", valid value: 0x%"PRIx64"\n", + port_id, rss_conf->rss_hf, + dev_info.flow_type_rss_offloads); + return -EINVAL; + } + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rss_hash_update, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->rss_hash_update)(dev, + rss_conf)); +} + +int +rte_eth_dev_rss_hash_conf_get(uint16_t port_id, + struct rte_eth_rss_conf *rss_conf) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rss_hash_conf_get, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->rss_hash_conf_get)(dev, + rss_conf)); +} + +int +rte_eth_dev_udp_tunnel_port_add(uint16_t port_id, + struct rte_eth_udp_tunnel *udp_tunnel) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + if (udp_tunnel == NULL) { + RTE_ETHDEV_LOG(ERR, "Invalid udp_tunnel parameter\n"); + return -EINVAL; + } + + if (udp_tunnel->prot_type >= RTE_TUNNEL_TYPE_MAX) { + RTE_ETHDEV_LOG(ERR, "Invalid tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->udp_tunnel_port_add, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->udp_tunnel_port_add)(dev, + udp_tunnel)); +} + +int +rte_eth_dev_udp_tunnel_port_delete(uint16_t port_id, + struct rte_eth_udp_tunnel *udp_tunnel) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + if (udp_tunnel == NULL) { + RTE_ETHDEV_LOG(ERR, "Invalid udp_tunnel parameter\n"); + return -EINVAL; + } + + if (udp_tunnel->prot_type >= RTE_TUNNEL_TYPE_MAX) { + RTE_ETHDEV_LOG(ERR, "Invalid tunnel type\n"); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->udp_tunnel_port_del, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->udp_tunnel_port_del)(dev, + udp_tunnel)); +} + +int +rte_eth_led_on(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_led_on, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->dev_led_on)(dev)); +} + +int +rte_eth_led_off(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_led_off, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->dev_led_off)(dev)); +} + +/* + * Returns index into MAC address array of addr. Use 00:00:00:00:00:00 to find + * an empty spot. 
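+ * A return value of -1 means the address was not found; when probing + * with the all-zero address it means no free slot is left, which + * rte_eth_dev_mac_addr_add() reports as -ENOSPC.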
+ */ +static int +get_mac_addr_index(uint16_t port_id, const struct ether_addr *addr) +{ + struct rte_eth_dev_info dev_info; + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + unsigned i; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + rte_eth_dev_info_get(port_id, &dev_info); + + for (i = 0; i < dev_info.max_mac_addrs; i++) + if (memcmp(addr, &dev->data->mac_addrs[i], ETHER_ADDR_LEN) == 0) + return i; + + return -1; +} + +static const struct ether_addr null_mac_addr; + +int +rte_eth_dev_mac_addr_add(uint16_t port_id, struct ether_addr *addr, + uint32_t pool) +{ + struct rte_eth_dev *dev; + int index; + uint64_t pool_mask; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->mac_addr_add, -ENOTSUP); + + if (is_zero_ether_addr(addr)) { + RTE_ETHDEV_LOG(ERR, "Port %u: Cannot add NULL MAC address\n", + port_id); + return -EINVAL; + } + if (pool >= ETH_64_POOLS) { + RTE_ETHDEV_LOG(ERR, "Pool id must be 0-%d\n", ETH_64_POOLS - 1); + return -EINVAL; + } + + index = get_mac_addr_index(port_id, addr); + if (index < 0) { + index = get_mac_addr_index(port_id, &null_mac_addr); + if (index < 0) { + RTE_ETHDEV_LOG(ERR, "Port %u: MAC address array full\n", + port_id); + return -ENOSPC; + } + } else { + pool_mask = dev->data->mac_pool_sel[index]; + + /* Check if both MAC address and pool is already there, and do nothing */ + if (pool_mask & (1ULL << pool)) + return 0; + } + + /* Update NIC */ + ret = (*dev->dev_ops->mac_addr_add)(dev, addr, index, pool); + + if (ret == 0) { + /* Update address in NIC data structure */ + ether_addr_copy(addr, &dev->data->mac_addrs[index]); + + /* Update pool bitmap in NIC data structure */ + dev->data->mac_pool_sel[index] |= (1ULL << pool); + } + + return eth_err(port_id, ret); +} + +int +rte_eth_dev_mac_addr_remove(uint16_t port_id, struct ether_addr *addr) +{ + struct rte_eth_dev *dev; + int index; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->mac_addr_remove, -ENOTSUP); + + index = get_mac_addr_index(port_id, addr); + if (index == 0) { + RTE_ETHDEV_LOG(ERR, + "Port %u: Cannot remove default MAC address\n", + port_id); + return -EADDRINUSE; + } else if (index < 0) + return 0; /* Do nothing if address wasn't found */ + + /* Update NIC */ + (*dev->dev_ops->mac_addr_remove)(dev, index); + + /* Update address in NIC data structure */ + ether_addr_copy(&null_mac_addr, &dev->data->mac_addrs[index]); + + /* reset pool bitmap */ + dev->data->mac_pool_sel[index] = 0; + + return 0; +} + +int +rte_eth_dev_default_mac_addr_set(uint16_t port_id, struct ether_addr *addr) +{ + struct rte_eth_dev *dev; + int ret; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (!is_valid_assigned_ether_addr(addr)) + return -EINVAL; + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->mac_addr_set, -ENOTSUP); + + ret = (*dev->dev_ops->mac_addr_set)(dev, addr); + if (ret < 0) + return ret; + + /* Update default address in NIC data structure */ + ether_addr_copy(addr, &dev->data->mac_addrs[0]); + + return 0; +} + + +/* + * Returns index into MAC address array of addr. Use 00:00:00:00:00:00 to find + * an empty spot. 
+ */ +static int +get_hash_mac_addr_index(uint16_t port_id, const struct ether_addr *addr) +{ + struct rte_eth_dev_info dev_info; + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + unsigned i; + + rte_eth_dev_info_get(port_id, &dev_info); + if (!dev->data->hash_mac_addrs) + return -1; + + for (i = 0; i < dev_info.max_hash_mac_addrs; i++) + if (memcmp(addr, &dev->data->hash_mac_addrs[i], + ETHER_ADDR_LEN) == 0) + return i; + + return -1; +} + +int +rte_eth_dev_uc_hash_table_set(uint16_t port_id, struct ether_addr *addr, + uint8_t on) +{ + int index; + int ret; + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + if (is_zero_ether_addr(addr)) { + RTE_ETHDEV_LOG(ERR, "Port %u: Cannot add NULL MAC address\n", + port_id); + return -EINVAL; + } + + index = get_hash_mac_addr_index(port_id, addr); + /* Check if it's already there, and do nothing */ + if ((index >= 0) && on) + return 0; + + if (index < 0) { + if (!on) { + RTE_ETHDEV_LOG(ERR, + "Port %u: the MAC address was not set in UTA\n", + port_id); + return -EINVAL; + } + + index = get_hash_mac_addr_index(port_id, &null_mac_addr); + if (index < 0) { + RTE_ETHDEV_LOG(ERR, "Port %u: MAC address array full\n", + port_id); + return -ENOSPC; + } + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->uc_hash_table_set, -ENOTSUP); + ret = (*dev->dev_ops->uc_hash_table_set)(dev, addr, on); + if (ret == 0) { + /* Update address in NIC data structure */ + if (on) + ether_addr_copy(addr, + &dev->data->hash_mac_addrs[index]); + else + ether_addr_copy(&null_mac_addr, + &dev->data->hash_mac_addrs[index]); + } + + return eth_err(port_id, ret); +} + +int +rte_eth_dev_uc_all_hash_table_set(uint16_t port_id, uint8_t on) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->uc_all_hash_table_set, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->uc_all_hash_table_set)(dev, + on)); +} + +int rte_eth_set_queue_rate_limit(uint16_t port_id, uint16_t queue_idx, + uint16_t tx_rate) +{ + struct rte_eth_dev *dev; + struct rte_eth_dev_info dev_info; + struct rte_eth_link link; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + rte_eth_dev_info_get(port_id, &dev_info); + link = dev->data->dev_link; + + if (queue_idx > dev_info.max_tx_queues) { + RTE_ETHDEV_LOG(ERR, + "Set queue rate limit:port %u: invalid queue id=%u\n", + port_id, queue_idx); + return -EINVAL; + } + + if (tx_rate > link.link_speed) { + RTE_ETHDEV_LOG(ERR, + "Set queue rate limit:invalid tx_rate=%u, bigger than link speed= %d\n", + tx_rate, link.link_speed); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->set_queue_rate_limit, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->set_queue_rate_limit)(dev, + queue_idx, tx_rate)); +} + +int +rte_eth_mirror_rule_set(uint16_t port_id, + struct rte_eth_mirror_conf *mirror_conf, + uint8_t rule_id, uint8_t on) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + if (mirror_conf->rule_type == 0) { + RTE_ETHDEV_LOG(ERR, "Mirror rule type can not be 0\n"); + return -EINVAL; + } + + if (mirror_conf->dst_pool >= ETH_64_POOLS) { + RTE_ETHDEV_LOG(ERR, "Invalid dst pool, pool id must be 0-%d\n", + ETH_64_POOLS - 1); + return -EINVAL; + } + + if ((mirror_conf->rule_type & (ETH_MIRROR_VIRTUAL_POOL_UP | + ETH_MIRROR_VIRTUAL_POOL_DOWN)) && + (mirror_conf->pool_mask == 0)) { + RTE_ETHDEV_LOG(ERR, + 
"Invalid mirror pool, pool mask can not be 0\n"); + return -EINVAL; + } + + if ((mirror_conf->rule_type & ETH_MIRROR_VLAN) && + mirror_conf->vlan.vlan_mask == 0) { + RTE_ETHDEV_LOG(ERR, + "Invalid vlan mask, vlan mask can not be 0\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->mirror_rule_set, -ENOTSUP); + + return eth_err(port_id, (*dev->dev_ops->mirror_rule_set)(dev, + mirror_conf, rule_id, on)); +} + +int +rte_eth_mirror_rule_reset(uint16_t port_id, uint8_t rule_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->mirror_rule_reset, -ENOTSUP); + + return eth_err(port_id, (*dev->dev_ops->mirror_rule_reset)(dev, + rule_id)); +} + +RTE_INIT(eth_dev_init_cb_lists) +{ + int i; + + for (i = 0; i < RTE_MAX_ETHPORTS; i++) + TAILQ_INIT(&rte_eth_devices[i].link_intr_cbs); +} + +int +rte_eth_dev_callback_register(uint16_t port_id, + enum rte_eth_event_type event, + rte_eth_dev_cb_fn cb_fn, void *cb_arg) +{ + struct rte_eth_dev *dev; + struct rte_eth_dev_callback *user_cb; + uint32_t next_port; /* size is 32-bit to prevent loop wrap-around */ + uint16_t last_port; + + if (!cb_fn) + return -EINVAL; + + if (!rte_eth_dev_is_valid_port(port_id) && port_id != RTE_ETH_ALL) { + RTE_ETHDEV_LOG(ERR, "Invalid port_id=%d\n", port_id); + return -EINVAL; + } + + if (port_id == RTE_ETH_ALL) { + next_port = 0; + last_port = RTE_MAX_ETHPORTS - 1; + } else { + next_port = last_port = port_id; + } + + rte_spinlock_lock(&rte_eth_dev_cb_lock); + + do { + dev = &rte_eth_devices[next_port]; + + TAILQ_FOREACH(user_cb, &(dev->link_intr_cbs), next) { + if (user_cb->cb_fn == cb_fn && + user_cb->cb_arg == cb_arg && + user_cb->event == event) { + break; + } + } + + /* create a new callback. */ + if (user_cb == NULL) { + user_cb = rte_zmalloc("INTR_USER_CALLBACK", + sizeof(struct rte_eth_dev_callback), 0); + if (user_cb != NULL) { + user_cb->cb_fn = cb_fn; + user_cb->cb_arg = cb_arg; + user_cb->event = event; + TAILQ_INSERT_TAIL(&(dev->link_intr_cbs), + user_cb, next); + } else { + rte_spinlock_unlock(&rte_eth_dev_cb_lock); + rte_eth_dev_callback_unregister(port_id, event, + cb_fn, cb_arg); + return -ENOMEM; + } + + } + } while (++next_port <= last_port); + + rte_spinlock_unlock(&rte_eth_dev_cb_lock); + return 0; +} + +int +rte_eth_dev_callback_unregister(uint16_t port_id, + enum rte_eth_event_type event, + rte_eth_dev_cb_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_eth_dev *dev; + struct rte_eth_dev_callback *cb, *next; + uint32_t next_port; /* size is 32-bit to prevent loop wrap-around */ + uint16_t last_port; + + if (!cb_fn) + return -EINVAL; + + if (!rte_eth_dev_is_valid_port(port_id) && port_id != RTE_ETH_ALL) { + RTE_ETHDEV_LOG(ERR, "Invalid port_id=%d\n", port_id); + return -EINVAL; + } + + if (port_id == RTE_ETH_ALL) { + next_port = 0; + last_port = RTE_MAX_ETHPORTS - 1; + } else { + next_port = last_port = port_id; + } + + rte_spinlock_lock(&rte_eth_dev_cb_lock); + + do { + dev = &rte_eth_devices[next_port]; + ret = 0; + for (cb = TAILQ_FIRST(&dev->link_intr_cbs); cb != NULL; + cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn != cb_fn || cb->event != event || + (cb->cb_arg != (void *)-1 && cb->cb_arg != cb_arg)) + continue; + + /* + * if this callback is not executing right now, + * then remove it. 
+ */ + if (cb->active == 0) { + TAILQ_REMOVE(&(dev->link_intr_cbs), cb, next); + rte_free(cb); + } else { + ret = -EAGAIN; + } + } + } while (++next_port <= last_port); + + rte_spinlock_unlock(&rte_eth_dev_cb_lock); + return ret; +} + +int +_rte_eth_dev_callback_process(struct rte_eth_dev *dev, + enum rte_eth_event_type event, void *ret_param) +{ + struct rte_eth_dev_callback *cb_lst; + struct rte_eth_dev_callback dev_cb; + int rc = 0; + + rte_spinlock_lock(&rte_eth_dev_cb_lock); + TAILQ_FOREACH(cb_lst, &(dev->link_intr_cbs), next) { + if (cb_lst->cb_fn == NULL || cb_lst->event != event) + continue; + dev_cb = *cb_lst; + cb_lst->active = 1; + if (ret_param != NULL) + dev_cb.ret_param = ret_param; + + rte_spinlock_unlock(&rte_eth_dev_cb_lock); + rc = dev_cb.cb_fn(dev->data->port_id, dev_cb.event, + dev_cb.cb_arg, dev_cb.ret_param); + rte_spinlock_lock(&rte_eth_dev_cb_lock); + cb_lst->active = 0; + } + rte_spinlock_unlock(&rte_eth_dev_cb_lock); + return rc; +} + +void +rte_eth_dev_probing_finish(struct rte_eth_dev *dev) +{ + if (dev == NULL) + return; + + _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_NEW, NULL); + + dev->state = RTE_ETH_DEV_ATTACHED; +} + +int +rte_eth_dev_rx_intr_ctl(uint16_t port_id, int epfd, int op, void *data) +{ + uint32_t vec; + struct rte_eth_dev *dev; + struct rte_intr_handle *intr_handle; + uint16_t qid; + int rc; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + + if (!dev->intr_handle) { + RTE_ETHDEV_LOG(ERR, "RX Intr handle unset\n"); + return -ENOTSUP; + } + + intr_handle = dev->intr_handle; + if (!intr_handle->intr_vec) { + RTE_ETHDEV_LOG(ERR, "RX Intr vector unset\n"); + return -EPERM; + } + + for (qid = 0; qid < dev->data->nb_rx_queues; qid++) { + vec = intr_handle->intr_vec[qid]; + rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data); + if (rc && rc != -EEXIST) { + RTE_ETHDEV_LOG(ERR, + "p %u q %u rx ctl error op %d epfd %d vec %u\n", + port_id, qid, op, epfd, vec); + } + } + + return 0; +} + +const struct rte_memzone * +rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name, + uint16_t queue_id, size_t size, unsigned align, + int socket_id) +{ + char z_name[RTE_MEMZONE_NAMESIZE]; + const struct rte_memzone *mz; + + snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d", + dev->device->driver->name, ring_name, + dev->data->port_id, queue_id); + + mz = rte_memzone_lookup(z_name); + if (mz) + return mz; + + return rte_memzone_reserve_aligned(z_name, size, socket_id, + RTE_MEMZONE_IOVA_CONTIG, align); +} + +int __rte_experimental +rte_eth_dev_create(struct rte_device *device, const char *name, + size_t priv_data_size, + ethdev_bus_specific_init ethdev_bus_specific_init, + void *bus_init_params, + ethdev_init_t ethdev_init, void *init_params) +{ + struct rte_eth_dev *ethdev; + int retval; + + RTE_FUNC_PTR_OR_ERR_RET(*ethdev_init, -EINVAL); + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ethdev = rte_eth_dev_allocate(name); + if (!ethdev) { + retval = -ENODEV; + goto probe_failed; + } + + if (priv_data_size) { + ethdev->data->dev_private = rte_zmalloc_socket( + name, priv_data_size, RTE_CACHE_LINE_SIZE, + device->numa_node); + + if (!ethdev->data->dev_private) { + RTE_LOG(ERR, EAL, "failed to allocate private data"); + retval = -ENOMEM; + goto probe_failed; + } + } + } else { + ethdev = rte_eth_dev_attach_secondary(name); + if (!ethdev) { + RTE_LOG(ERR, EAL, "secondary process attach failed, " + "ethdev doesn't exist"); + retval = -ENODEV; + goto probe_failed; + } + } + + ethdev->device = 
device; + + if (ethdev_bus_specific_init) { + retval = ethdev_bus_specific_init(ethdev, bus_init_params); + if (retval) { + RTE_LOG(ERR, EAL, + "ethdev bus specific initialisation failed"); + goto probe_failed; + } + } + + retval = ethdev_init(ethdev, init_params); + if (retval) { + RTE_LOG(ERR, EAL, "ethdev initialisation failed"); + goto probe_failed; + } + + rte_eth_dev_probing_finish(ethdev); + + return retval; +probe_failed: + /* free ports private data if primary process */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_free(ethdev->data->dev_private); + + rte_eth_dev_release_port(ethdev); + + return retval; +} + +int __rte_experimental +rte_eth_dev_destroy(struct rte_eth_dev *ethdev, + ethdev_uninit_t ethdev_uninit) +{ + int ret; + + ethdev = rte_eth_dev_allocated(ethdev->data->name); + if (!ethdev) + return -ENODEV; + + RTE_FUNC_PTR_OR_ERR_RET(*ethdev_uninit, -EINVAL); + if (ethdev_uninit) { + ret = ethdev_uninit(ethdev); + if (ret) + return ret; + } + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_free(ethdev->data->dev_private); + + ethdev->data->dev_private = NULL; + + return rte_eth_dev_release_port(ethdev); +} + +int +rte_eth_dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id, + int epfd, int op, void *data) +{ + uint32_t vec; + struct rte_eth_dev *dev; + struct rte_intr_handle *intr_handle; + int rc; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + if (queue_id >= dev->data->nb_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id); + return -EINVAL; + } + + if (!dev->intr_handle) { + RTE_ETHDEV_LOG(ERR, "RX Intr handle unset\n"); + return -ENOTSUP; + } + + intr_handle = dev->intr_handle; + if (!intr_handle->intr_vec) { + RTE_ETHDEV_LOG(ERR, "RX Intr vector unset\n"); + return -EPERM; + } + + vec = intr_handle->intr_vec[queue_id]; + rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data); + if (rc && rc != -EEXIST) { + RTE_ETHDEV_LOG(ERR, + "p %u q %u rx ctl error op %d epfd %d vec %u\n", + port_id, queue_id, op, epfd, vec); + return rc; + } + + return 0; +} + +int +rte_eth_dev_rx_intr_enable(uint16_t port_id, + uint16_t queue_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_enable, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->rx_queue_intr_enable)(dev, + queue_id)); +} + +int +rte_eth_dev_rx_intr_disable(uint16_t port_id, + uint16_t queue_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_disable, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->rx_queue_intr_disable)(dev, + queue_id)); +} + + +int +rte_eth_dev_filter_supported(uint16_t port_id, + enum rte_filter_type filter_type) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->filter_ctrl, -ENOTSUP); + return (*dev->dev_ops->filter_ctrl)(dev, filter_type, + RTE_ETH_FILTER_NOP, NULL); +} + +int +rte_eth_dev_filter_ctrl(uint16_t port_id, enum rte_filter_type filter_type, + enum rte_filter_op filter_op, void *arg) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->filter_ctrl, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->filter_ctrl)(dev, 
filter_type, + filter_op, arg)); +} + +const struct rte_eth_rxtx_callback * +rte_eth_add_rx_callback(uint16_t port_id, uint16_t queue_id, + rte_rx_callback_fn fn, void *user_param) +{ +#ifndef RTE_ETHDEV_RXTX_CALLBACKS + rte_errno = ENOTSUP; + return NULL; +#endif + /* check input parameters */ + if (!rte_eth_dev_is_valid_port(port_id) || fn == NULL || + queue_id >= rte_eth_devices[port_id].data->nb_rx_queues) { + rte_errno = EINVAL; + return NULL; + } + struct rte_eth_rxtx_callback *cb = rte_zmalloc(NULL, sizeof(*cb), 0); + + if (cb == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + cb->fn.rx = fn; + cb->param = user_param; + + rte_spinlock_lock(&rte_eth_rx_cb_lock); + /* Add the callbacks in fifo order. */ + struct rte_eth_rxtx_callback *tail = + rte_eth_devices[port_id].post_rx_burst_cbs[queue_id]; + + if (!tail) { + rte_eth_devices[port_id].post_rx_burst_cbs[queue_id] = cb; + + } else { + while (tail->next) + tail = tail->next; + tail->next = cb; + } + rte_spinlock_unlock(&rte_eth_rx_cb_lock); + + return cb; +} + +const struct rte_eth_rxtx_callback * +rte_eth_add_first_rx_callback(uint16_t port_id, uint16_t queue_id, + rte_rx_callback_fn fn, void *user_param) +{ +#ifndef RTE_ETHDEV_RXTX_CALLBACKS + rte_errno = ENOTSUP; + return NULL; +#endif + /* check input parameters */ + if (!rte_eth_dev_is_valid_port(port_id) || fn == NULL || + queue_id >= rte_eth_devices[port_id].data->nb_rx_queues) { + rte_errno = EINVAL; + return NULL; + } + + struct rte_eth_rxtx_callback *cb = rte_zmalloc(NULL, sizeof(*cb), 0); + + if (cb == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + cb->fn.rx = fn; + cb->param = user_param; + + rte_spinlock_lock(&rte_eth_rx_cb_lock); + /* Add the callbacks at fisrt position*/ + cb->next = rte_eth_devices[port_id].post_rx_burst_cbs[queue_id]; + rte_smp_wmb(); + rte_eth_devices[port_id].post_rx_burst_cbs[queue_id] = cb; + rte_spinlock_unlock(&rte_eth_rx_cb_lock); + + return cb; +} + +const struct rte_eth_rxtx_callback * +rte_eth_add_tx_callback(uint16_t port_id, uint16_t queue_id, + rte_tx_callback_fn fn, void *user_param) +{ +#ifndef RTE_ETHDEV_RXTX_CALLBACKS + rte_errno = ENOTSUP; + return NULL; +#endif + /* check input parameters */ + if (!rte_eth_dev_is_valid_port(port_id) || fn == NULL || + queue_id >= rte_eth_devices[port_id].data->nb_tx_queues) { + rte_errno = EINVAL; + return NULL; + } + + struct rte_eth_rxtx_callback *cb = rte_zmalloc(NULL, sizeof(*cb), 0); + + if (cb == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + cb->fn.tx = fn; + cb->param = user_param; + + rte_spinlock_lock(&rte_eth_tx_cb_lock); + /* Add the callbacks in fifo order. */ + struct rte_eth_rxtx_callback *tail = + rte_eth_devices[port_id].pre_tx_burst_cbs[queue_id]; + + if (!tail) { + rte_eth_devices[port_id].pre_tx_burst_cbs[queue_id] = cb; + + } else { + while (tail->next) + tail = tail->next; + tail->next = cb; + } + rte_spinlock_unlock(&rte_eth_tx_cb_lock); + + return cb; +} + +int +rte_eth_remove_rx_callback(uint16_t port_id, uint16_t queue_id, + const struct rte_eth_rxtx_callback *user_cb) +{ +#ifndef RTE_ETHDEV_RXTX_CALLBACKS + return -ENOTSUP; +#endif + /* Check input parameters. 
*/ + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + if (user_cb == NULL || + queue_id >= rte_eth_devices[port_id].data->nb_rx_queues) + return -EINVAL; + + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + struct rte_eth_rxtx_callback *cb; + struct rte_eth_rxtx_callback **prev_cb; + int ret = -EINVAL; + + rte_spinlock_lock(&rte_eth_rx_cb_lock); + prev_cb = &dev->post_rx_burst_cbs[queue_id]; + for (; *prev_cb != NULL; prev_cb = &cb->next) { + cb = *prev_cb; + if (cb == user_cb) { + /* Remove the user cb from the callback list. */ + *prev_cb = cb->next; + ret = 0; + break; + } + } + rte_spinlock_unlock(&rte_eth_rx_cb_lock); + + return ret; +} + +int +rte_eth_remove_tx_callback(uint16_t port_id, uint16_t queue_id, + const struct rte_eth_rxtx_callback *user_cb) +{ +#ifndef RTE_ETHDEV_RXTX_CALLBACKS + return -ENOTSUP; +#endif + /* Check input parameters. */ + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + if (user_cb == NULL || + queue_id >= rte_eth_devices[port_id].data->nb_tx_queues) + return -EINVAL; + + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + int ret = -EINVAL; + struct rte_eth_rxtx_callback *cb; + struct rte_eth_rxtx_callback **prev_cb; + + rte_spinlock_lock(&rte_eth_tx_cb_lock); + prev_cb = &dev->pre_tx_burst_cbs[queue_id]; + for (; *prev_cb != NULL; prev_cb = &cb->next) { + cb = *prev_cb; + if (cb == user_cb) { + /* Remove the user cb from the callback list. */ + *prev_cb = cb->next; + ret = 0; + break; + } + } + rte_spinlock_unlock(&rte_eth_tx_cb_lock); + + return ret; +} + +int +rte_eth_rx_queue_info_get(uint16_t port_id, uint16_t queue_id, + struct rte_eth_rxq_info *qinfo) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (qinfo == NULL) + return -EINVAL; + + dev = &rte_eth_devices[port_id]; + if (queue_id >= dev->data->nb_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rxq_info_get, -ENOTSUP); + + memset(qinfo, 0, sizeof(*qinfo)); + dev->dev_ops->rxq_info_get(dev, queue_id, qinfo); + return 0; +} + +int +rte_eth_tx_queue_info_get(uint16_t port_id, uint16_t queue_id, + struct rte_eth_txq_info *qinfo) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (qinfo == NULL) + return -EINVAL; + + dev = &rte_eth_devices[port_id]; + if (queue_id >= dev->data->nb_tx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->txq_info_get, -ENOTSUP); + + memset(qinfo, 0, sizeof(*qinfo)); + dev->dev_ops->txq_info_get(dev, queue_id, qinfo); + + return 0; +} + +int +rte_eth_dev_set_mc_addr_list(uint16_t port_id, + struct ether_addr *mc_addr_set, + uint32_t nb_mc_addr) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->set_mc_addr_list, -ENOTSUP); + return eth_err(port_id, dev->dev_ops->set_mc_addr_list(dev, + mc_addr_set, nb_mc_addr)); +} + +int +rte_eth_timesync_enable(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_enable, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->timesync_enable)(dev)); +} + +int +rte_eth_timesync_disable(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + 
RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_disable, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->timesync_disable)(dev)); +} + +int +rte_eth_timesync_read_rx_timestamp(uint16_t port_id, struct timespec *timestamp, + uint32_t flags) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_read_rx_timestamp, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->timesync_read_rx_timestamp) + (dev, timestamp, flags)); +} + +int +rte_eth_timesync_read_tx_timestamp(uint16_t port_id, + struct timespec *timestamp) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_read_tx_timestamp, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->timesync_read_tx_timestamp) + (dev, timestamp)); +} + +int +rte_eth_timesync_adjust_time(uint16_t port_id, int64_t delta) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_adjust_time, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->timesync_adjust_time)(dev, + delta)); +} + +int +rte_eth_timesync_read_time(uint16_t port_id, struct timespec *timestamp) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_read_time, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->timesync_read_time)(dev, + timestamp)); +} + +int +rte_eth_timesync_write_time(uint16_t port_id, const struct timespec *timestamp) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_write_time, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->timesync_write_time)(dev, + timestamp)); +} + +int +rte_eth_dev_get_reg_info(uint16_t port_id, struct rte_dev_reg_info *info) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->get_reg, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->get_reg)(dev, info)); +} + +int +rte_eth_dev_get_eeprom_length(uint16_t port_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->get_eeprom_length, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->get_eeprom_length)(dev)); +} + +int +rte_eth_dev_get_eeprom(uint16_t port_id, struct rte_dev_eeprom_info *info) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->get_eeprom, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->get_eeprom)(dev, info)); +} + +int +rte_eth_dev_set_eeprom(uint16_t port_id, struct rte_dev_eeprom_info *info) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->set_eeprom, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->set_eeprom)(dev, info)); +} + +int __rte_experimental +rte_eth_dev_get_module_info(uint16_t port_id, + struct rte_eth_dev_module_info *modinfo) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = 
&rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->get_module_info, -ENOTSUP); + return (*dev->dev_ops->get_module_info)(dev, modinfo); +} + +int __rte_experimental +rte_eth_dev_get_module_eeprom(uint16_t port_id, + struct rte_dev_eeprom_info *info) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->get_module_eeprom, -ENOTSUP); + return (*dev->dev_ops->get_module_eeprom)(dev, info); +} + +int +rte_eth_dev_get_dcb_info(uint16_t port_id, + struct rte_eth_dcb_info *dcb_info) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + memset(dcb_info, 0, sizeof(struct rte_eth_dcb_info)); + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->get_dcb_info, -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->get_dcb_info)(dev, dcb_info)); +} + +int +rte_eth_dev_l2_tunnel_eth_type_conf(uint16_t port_id, + struct rte_eth_l2_tunnel_conf *l2_tunnel) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + if (l2_tunnel == NULL) { + RTE_ETHDEV_LOG(ERR, "Invalid l2_tunnel parameter\n"); + return -EINVAL; + } + + if (l2_tunnel->l2_tunnel_type >= RTE_TUNNEL_TYPE_MAX) { + RTE_ETHDEV_LOG(ERR, "Invalid tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_eth_type_conf, + -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->l2_tunnel_eth_type_conf)(dev, + l2_tunnel)); +} + +int +rte_eth_dev_l2_tunnel_offload_set(uint16_t port_id, + struct rte_eth_l2_tunnel_conf *l2_tunnel, + uint32_t mask, + uint8_t en) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (l2_tunnel == NULL) { + RTE_ETHDEV_LOG(ERR, "Invalid l2_tunnel parameter\n"); + return -EINVAL; + } + + if (l2_tunnel->l2_tunnel_type >= RTE_TUNNEL_TYPE_MAX) { + RTE_ETHDEV_LOG(ERR, "Invalid tunnel type\n"); + return -EINVAL; + } + + if (mask == 0) { + RTE_ETHDEV_LOG(ERR, "Mask should have a value\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_offload_set, + -ENOTSUP); + return eth_err(port_id, (*dev->dev_ops->l2_tunnel_offload_set)(dev, + l2_tunnel, mask, en)); +} + +static void +rte_eth_dev_adjust_nb_desc(uint16_t *nb_desc, + const struct rte_eth_desc_lim *desc_lim) +{ + if (desc_lim->nb_align != 0) + *nb_desc = RTE_ALIGN_CEIL(*nb_desc, desc_lim->nb_align); + + if (desc_lim->nb_max != 0) + *nb_desc = RTE_MIN(*nb_desc, desc_lim->nb_max); + + *nb_desc = RTE_MAX(*nb_desc, desc_lim->nb_min); +} + +int +rte_eth_dev_adjust_nb_rx_tx_desc(uint16_t port_id, + uint16_t *nb_rx_desc, + uint16_t *nb_tx_desc) +{ + struct rte_eth_dev *dev; + struct rte_eth_dev_info dev_info; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP); + + rte_eth_dev_info_get(port_id, &dev_info); + + if (nb_rx_desc != NULL) + rte_eth_dev_adjust_nb_desc(nb_rx_desc, &dev_info.rx_desc_lim); + + if (nb_tx_desc != NULL) + rte_eth_dev_adjust_nb_desc(nb_tx_desc, &dev_info.tx_desc_lim); + + return 0; +} + +int +rte_eth_dev_pool_ops_supported(uint16_t port_id, const char *pool) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (pool == NULL) + return -EINVAL; + + dev = &rte_eth_devices[port_id]; + + if (*dev->dev_ops->pool_ops_supported == NULL) + return 1; /* all pools 
are supported */ + + return (*dev->dev_ops->pool_ops_supported)(dev, pool); +} + +/** + * A set of values to describe the possible states of a switch domain. + */ +enum rte_eth_switch_domain_state { + RTE_ETH_SWITCH_DOMAIN_UNUSED = 0, + RTE_ETH_SWITCH_DOMAIN_ALLOCATED +}; + +/** + * Array of switch domains available for allocation. Array is sized to + * RTE_MAX_ETHPORTS elements as there cannot be more active switch domains than + * ethdev ports in a single process. + */ +struct rte_eth_dev_switch { + enum rte_eth_switch_domain_state state; +} rte_eth_switch_domains[RTE_MAX_ETHPORTS]; + +int __rte_experimental +rte_eth_switch_domain_alloc(uint16_t *domain_id) +{ + unsigned int i; + + *domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + + for (i = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID + 1; + i < RTE_MAX_ETHPORTS; i++) { + if (rte_eth_switch_domains[i].state == + RTE_ETH_SWITCH_DOMAIN_UNUSED) { + rte_eth_switch_domains[i].state = + RTE_ETH_SWITCH_DOMAIN_ALLOCATED; + *domain_id = i; + return 0; + } + } + + return -ENOSPC; +} + +int __rte_experimental +rte_eth_switch_domain_free(uint16_t domain_id) +{ + if (domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID || + domain_id >= RTE_MAX_ETHPORTS) + return -EINVAL; + + if (rte_eth_switch_domains[domain_id].state != + RTE_ETH_SWITCH_DOMAIN_ALLOCATED) + return -EINVAL; + + rte_eth_switch_domains[domain_id].state = RTE_ETH_SWITCH_DOMAIN_UNUSED; + + return 0; +} + +typedef int (*rte_eth_devargs_callback_t)(char *str, void *data); + +static int +rte_eth_devargs_tokenise(struct rte_kvargs *arglist, const char *str_in) +{ + int state; + struct rte_kvargs_pair *pair; + char *letter; + + arglist->str = strdup(str_in); + if (arglist->str == NULL) + return -ENOMEM; + + letter = arglist->str; + state = 0; + arglist->count = 0; + pair = &arglist->pairs[0]; + while (1) { + switch (state) { + case 0: /* Initial */ + if (*letter == '=') + return -EINVAL; + else if (*letter == '\0') + return 0; + + state = 1; + pair->key = letter; + /* fall-thru */ + + case 1: /* Parsing key */ + if (*letter == '=') { + *letter = '\0'; + pair->value = letter + 1; + state = 2; + } else if (*letter == ',' || *letter == '\0') + return -EINVAL; + break; + + + case 2: /* Parsing value */ + if (*letter == '[') + state = 3; + else if (*letter == ',') { + *letter = '\0'; + arglist->count++; + pair = &arglist->pairs[arglist->count]; + state = 0; + } else if (*letter == '\0') { + letter--; + arglist->count++; + pair = &arglist->pairs[arglist->count]; + state = 0; + } + break; + + case 3: /* Parsing list */ + if (*letter == ']') + state = 2; + else if (*letter == '\0') + return -EINVAL; + break; + } + letter++; + } +} + +static int +rte_eth_devargs_parse_list(char *str, rte_eth_devargs_callback_t callback, + void *data) +{ + char *str_start; + int state; + int result; + + if (*str != '[') + /* Single element, not a list */ + return callback(str, data); + + /* Sanity check, then strip the brackets */ + str_start = &str[strlen(str) - 1]; + if (*str_start != ']') { + RTE_LOG(ERR, EAL, "(%s): List does not end with ']'", str); + return -EINVAL; + } + str++; + *str_start = '\0'; + + /* Process list elements */ + state = 0; + while (1) { + if (state == 0) { + if (*str == '\0') + break; + if (*str != ',') { + str_start = str; + state = 1; + } + } else if (state == 1) { + if (*str == ',' || *str == '\0') { + if (str > str_start) { + /* Non-empty string fragment */ + *str = '\0'; + result = callback(str_start, data); + if (result < 0) + return result; + } + state = 0; + } + } + str++; + } + return 0; +} 
+ +static int +rte_eth_devargs_process_range(char *str, uint16_t *list, uint16_t *len_list, + const uint16_t max_list) +{ + uint16_t lo, hi, val; + int result; + + result = sscanf(str, "%hu-%hu", &lo, &hi); + if (result == 1) { + if (*len_list >= max_list) + return -ENOMEM; + list[(*len_list)++] = lo; + } else if (result == 2) { + if (lo >= hi || lo > RTE_MAX_ETHPORTS || hi > RTE_MAX_ETHPORTS) + return -EINVAL; + for (val = lo; val <= hi; val++) { + if (*len_list >= max_list) + return -ENOMEM; + list[(*len_list)++] = val; + } + } else + return -EINVAL; + return 0; +} + + +static int +rte_eth_devargs_parse_representor_ports(char *str, void *data) +{ + struct rte_eth_devargs *eth_da = data; + + return rte_eth_devargs_process_range(str, eth_da->representor_ports, + ð_da->nb_representor_ports, RTE_MAX_ETHPORTS); +} + +int __rte_experimental +rte_eth_devargs_parse(const char *dargs, struct rte_eth_devargs *eth_da) +{ + struct rte_kvargs args; + struct rte_kvargs_pair *pair; + unsigned int i; + int result = 0; + + memset(eth_da, 0, sizeof(*eth_da)); + + result = rte_eth_devargs_tokenise(&args, dargs); + if (result < 0) + goto parse_cleanup; + + for (i = 0; i < args.count; i++) { + pair = &args.pairs[i]; + if (strcmp("representor", pair->key) == 0) { + result = rte_eth_devargs_parse_list(pair->value, + rte_eth_devargs_parse_representor_ports, + eth_da); + if (result < 0) + goto parse_cleanup; + } + } + +parse_cleanup: + if (args.str) + free(args.str); + + return result; +} + +RTE_INIT(ethdev_init_log) +{ + rte_eth_dev_logtype = rte_log_register("lib.ethdev"); + if (rte_eth_dev_logtype >= 0) + rte_log_set_level(rte_eth_dev_logtype, RTE_LOG_INFO); +} diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.h b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.h new file mode 100644 index 00000000..7070e9ab --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev.h @@ -0,0 +1,4298 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _RTE_ETHDEV_H_ +#define _RTE_ETHDEV_H_ + +/** + * @file + * + * RTE Ethernet Device API + * + * The Ethernet Device API is composed of two parts: + * + * - The application-oriented Ethernet API that includes functions to setup + * an Ethernet device (configure it, setup its RX and TX queues and start it), + * to get its MAC address, the speed and the status of its physical link, + * to receive and to transmit packets, and so on. + * + * - The driver-oriented Ethernet API that exports functions allowing + * an Ethernet Poll Mode Driver (PMD) to allocate an Ethernet device instance, + * create memzone for HW rings and process registered callbacks, and so on. + * PMDs should include rte_ethdev_driver.h instead of this header. + * + * By default, all the functions of the Ethernet Device API exported by a PMD + * are lock-free functions which assume to not be invoked in parallel on + * different logical cores to work on the same target object. For instance, + * the receive function of a PMD cannot be invoked in parallel on two logical + * cores to poll the same RX queue [of the same port]. Of course, this function + * can be invoked in parallel by different logical cores on different RX queues. + * It is the responsibility of the upper level application to enforce this rule. + * + * If needed, parallel accesses by multiple logical cores to shared queues + * shall be explicitly protected by dedicated inline lock-aware functions + * built on top of their corresponding lock-free functions of the PMD API. 
+ * + * In all functions of the Ethernet API, the Ethernet device is + * designated by an integer >= 0 named the device port identifier. + * + * At the Ethernet driver level, Ethernet devices are represented by a generic + * data structure of type *rte_eth_dev*. + * + * Ethernet devices are dynamically registered during the PCI probing phase + * performed at EAL initialization time. + * When an Ethernet device is being probed, an *rte_eth_dev* structure and + * a new port identifier are allocated for that device. Then, the eth_dev_init() + * function supplied by the Ethernet driver matching the probed PCI + * device is invoked to properly initialize the device. + * + * The role of the device init function consists of resetting the hardware, + * checking access to Non-volatile Memory (NVM), reading the MAC address + * from NVM etc. + * + * If the device init operation is successful, the correspondence between + * the port identifier assigned to the new device and its associated + * *rte_eth_dev* structure is effectively registered. + * Otherwise, both the *rte_eth_dev* structure and the port identifier are + * freed. + * + * The functions exported by the application Ethernet API to setup a device + * designated by its port identifier must be invoked in the following order: + * - rte_eth_dev_configure() + * - rte_eth_tx_queue_setup() + * - rte_eth_rx_queue_setup() + * - rte_eth_dev_start() + * + * Then, the network application can invoke, in any order, the functions + * exported by the Ethernet API to get the MAC address of a given device, to + * get the speed and the status of a device physical link, to receive/transmit + * [burst of] packets, and so on. + * + * If the application wants to change the configuration (i.e. call + * rte_eth_dev_configure(), rte_eth_tx_queue_setup(), or + * rte_eth_rx_queue_setup()), it must call rte_eth_dev_stop() first to stop the + * device and then do the reconfiguration before calling rte_eth_dev_start() + * again. The transmit and receive functions should not be invoked when the + * device is stopped. + * + * Please note that some configuration is not stored between calls to + * rte_eth_dev_stop()/rte_eth_dev_start(). The following configuration will + * be retained: + * + * - flow control settings + * - receive mode configuration (promiscuous mode, hardware checksum mode, + * RSS/VMDQ settings etc.) + * - VLAN filtering configuration + * - MAC addresses supplied to MAC address array + * - flow director filtering mode (but not filtering rules) + * - NIC queue statistics mappings + * + * Any other configuration will not be stored and will need to be re-entered + * before a call to rte_eth_dev_start(). + * + * Finally, a network application can close an Ethernet device by invoking the + * rte_eth_dev_close() function. + * + * Each function of the application Ethernet API invokes a specific function + * of the PMD that controls the target device designated by its port + * identifier. + * For this purpose, all device-specific functions of an Ethernet driver are + * supplied through a set of pointers contained in a generic structure of type + * *eth_dev_ops*. + * The address of the *eth_dev_ops* structure is stored in the *rte_eth_dev* + * structure by the device init function of the Ethernet driver, which is + * invoked during the PCI probing phase, as explained earlier. 
+ * + * In other words, each function of the Ethernet API simply retrieves the + * *rte_eth_dev* structure associated with the device port identifier and + * performs an indirect invocation of the corresponding driver function + * supplied in the *eth_dev_ops* structure of the *rte_eth_dev* structure. + * + * For performance reasons, the address of the burst-oriented RX and TX + * functions of the Ethernet driver are not contained in the *eth_dev_ops* + * structure. Instead, they are directly stored at the beginning of the + * *rte_eth_dev* structure to avoid an extra indirect memory access during + * their invocation. + * + * RTE ethernet device drivers do not use interrupts for transmitting or + * receiving. Instead, Ethernet drivers export Poll-Mode receive and transmit + * functions to applications. + * Both receive and transmit functions are packet-burst oriented to minimize + * their cost per packet through the following optimizations: + * + * - Sharing among multiple packets the incompressible cost of the + * invocation of receive/transmit functions. + * + * - Enabling receive/transmit functions to take advantage of burst-oriented + * hardware features (L1 cache, prefetch instructions, NIC head/tail + * registers) to minimize the number of CPU cycles per packet, for instance, + * by avoiding useless read memory accesses to ring descriptors, or by + * systematically using arrays of pointers that exactly fit L1 cache line + * boundaries and sizes. + * + * The burst-oriented receive function does not provide any error notification, + * to avoid the corresponding overhead. As a hint, the upper-level application + * might check the status of the device link once being systematically returned + * a 0 value by the receive function of the driver for a given number of tries. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Use this macro to check if LRO API is supported */ +#define RTE_ETHDEV_HAS_LRO_SUPPORT + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_ether.h" +#include "rte_eth_ctrl.h" +#include "rte_dev_info.h" + +extern int rte_eth_dev_logtype; + +#define RTE_ETHDEV_LOG(level, ...) \ + rte_log(RTE_LOG_ ## level, rte_eth_dev_logtype, "" __VA_ARGS__) + +struct rte_mbuf; + +/** + * A structure used to retrieve statistics for an Ethernet port. + * Not all statistics fields in struct rte_eth_stats are supported + * by any type of network interface card (NIC). If any statistics + * field is not supported, its value is 0. + */ +struct rte_eth_stats { + uint64_t ipackets; /**< Total number of successfully received packets. */ + uint64_t opackets; /**< Total number of successfully transmitted packets.*/ + uint64_t ibytes; /**< Total number of successfully received bytes. */ + uint64_t obytes; /**< Total number of successfully transmitted bytes. */ + uint64_t imissed; + /**< Total of RX packets dropped by the HW, + * because there are no available buffer (i.e. RX queues are full). + */ + uint64_t ierrors; /**< Total number of erroneous received packets. */ + uint64_t oerrors; /**< Total number of failed transmitted packets. */ + uint64_t rx_nombuf; /**< Total number of RX mbuf allocation failures. */ + uint64_t q_ipackets[RTE_ETHDEV_QUEUE_STAT_CNTRS]; + /**< Total number of queue RX packets. */ + uint64_t q_opackets[RTE_ETHDEV_QUEUE_STAT_CNTRS]; + /**< Total number of queue TX packets. */ + uint64_t q_ibytes[RTE_ETHDEV_QUEUE_STAT_CNTRS]; + /**< Total number of successfully received queue bytes. 
*/ + uint64_t q_obytes[RTE_ETHDEV_QUEUE_STAT_CNTRS]; + /**< Total number of successfully transmitted queue bytes. */ + uint64_t q_errors[RTE_ETHDEV_QUEUE_STAT_CNTRS]; + /**< Total number of queue packets received that are dropped. */ +}; + +/** + * Device supported speeds bitmap flags + */ +#define ETH_LINK_SPEED_AUTONEG (0 << 0) /**< Autonegotiate (all speeds) */ +#define ETH_LINK_SPEED_FIXED (1 << 0) /**< Disable autoneg (fixed speed) */ +#define ETH_LINK_SPEED_10M_HD (1 << 1) /**< 10 Mbps half-duplex */ +#define ETH_LINK_SPEED_10M (1 << 2) /**< 10 Mbps full-duplex */ +#define ETH_LINK_SPEED_100M_HD (1 << 3) /**< 100 Mbps half-duplex */ +#define ETH_LINK_SPEED_100M (1 << 4) /**< 100 Mbps full-duplex */ +#define ETH_LINK_SPEED_1G (1 << 5) /**< 1 Gbps */ +#define ETH_LINK_SPEED_2_5G (1 << 6) /**< 2.5 Gbps */ +#define ETH_LINK_SPEED_5G (1 << 7) /**< 5 Gbps */ +#define ETH_LINK_SPEED_10G (1 << 8) /**< 10 Gbps */ +#define ETH_LINK_SPEED_20G (1 << 9) /**< 20 Gbps */ +#define ETH_LINK_SPEED_25G (1 << 10) /**< 25 Gbps */ +#define ETH_LINK_SPEED_40G (1 << 11) /**< 40 Gbps */ +#define ETH_LINK_SPEED_50G (1 << 12) /**< 50 Gbps */ +#define ETH_LINK_SPEED_56G (1 << 13) /**< 56 Gbps */ +#define ETH_LINK_SPEED_100G (1 << 14) /**< 100 Gbps */ + +/** + * Ethernet numeric link speeds in Mbps + */ +#define ETH_SPEED_NUM_NONE 0 /**< Not defined */ +#define ETH_SPEED_NUM_10M 10 /**< 10 Mbps */ +#define ETH_SPEED_NUM_100M 100 /**< 100 Mbps */ +#define ETH_SPEED_NUM_1G 1000 /**< 1 Gbps */ +#define ETH_SPEED_NUM_2_5G 2500 /**< 2.5 Gbps */ +#define ETH_SPEED_NUM_5G 5000 /**< 5 Gbps */ +#define ETH_SPEED_NUM_10G 10000 /**< 10 Gbps */ +#define ETH_SPEED_NUM_20G 20000 /**< 20 Gbps */ +#define ETH_SPEED_NUM_25G 25000 /**< 25 Gbps */ +#define ETH_SPEED_NUM_40G 40000 /**< 40 Gbps */ +#define ETH_SPEED_NUM_50G 50000 /**< 50 Gbps */ +#define ETH_SPEED_NUM_56G 56000 /**< 56 Gbps */ +#define ETH_SPEED_NUM_100G 100000 /**< 100 Gbps */ + +/** + * A structure used to retrieve link-level information of an Ethernet port. + */ +__extension__ +struct rte_eth_link { + uint32_t link_speed; /**< ETH_SPEED_NUM_ */ + uint16_t link_duplex : 1; /**< ETH_LINK_[HALF/FULL]_DUPLEX */ + uint16_t link_autoneg : 1; /**< ETH_LINK_[AUTONEG/FIXED] */ + uint16_t link_status : 1; /**< ETH_LINK_[DOWN/UP] */ +} __attribute__((aligned(8))); /**< aligned for atomic64 read/write */ + +/* Utility constants */ +#define ETH_LINK_HALF_DUPLEX 0 /**< Half-duplex connection (see link_duplex). */ +#define ETH_LINK_FULL_DUPLEX 1 /**< Full-duplex connection (see link_duplex). */ +#define ETH_LINK_DOWN 0 /**< Link is down (see link_status). */ +#define ETH_LINK_UP 1 /**< Link is up (see link_status). */ +#define ETH_LINK_FIXED 0 /**< No autonegotiation (see link_autoneg). */ +#define ETH_LINK_AUTONEG 1 /**< Autonegotiated (see link_autoneg). */ + +/** + * A structure used to configure the ring threshold registers of an RX/TX + * queue for an Ethernet port. + */ +struct rte_eth_thresh { + uint8_t pthresh; /**< Ring prefetch threshold. */ + uint8_t hthresh; /**< Ring host threshold. */ + uint8_t wthresh; /**< Ring writeback threshold. */ +}; + +/** + * Simple flags are used for rte_eth_conf.rxmode.mq_mode. + */ +#define ETH_MQ_RX_RSS_FLAG 0x1 +#define ETH_MQ_RX_DCB_FLAG 0x2 +#define ETH_MQ_RX_VMDQ_FLAG 0x4 + +/** + * A set of values to identify what method is to be used to route + * packets to multiple queues. 
+ */ +enum rte_eth_rx_mq_mode { + /** None of DCB,RSS or VMDQ mode */ + ETH_MQ_RX_NONE = 0, + + /** For RX side, only RSS is on */ + ETH_MQ_RX_RSS = ETH_MQ_RX_RSS_FLAG, + /** For RX side,only DCB is on. */ + ETH_MQ_RX_DCB = ETH_MQ_RX_DCB_FLAG, + /** Both DCB and RSS enable */ + ETH_MQ_RX_DCB_RSS = ETH_MQ_RX_RSS_FLAG | ETH_MQ_RX_DCB_FLAG, + + /** Only VMDQ, no RSS nor DCB */ + ETH_MQ_RX_VMDQ_ONLY = ETH_MQ_RX_VMDQ_FLAG, + /** RSS mode with VMDQ */ + ETH_MQ_RX_VMDQ_RSS = ETH_MQ_RX_RSS_FLAG | ETH_MQ_RX_VMDQ_FLAG, + /** Use VMDQ+DCB to route traffic to queues */ + ETH_MQ_RX_VMDQ_DCB = ETH_MQ_RX_VMDQ_FLAG | ETH_MQ_RX_DCB_FLAG, + /** Enable both VMDQ and DCB in VMDq */ + ETH_MQ_RX_VMDQ_DCB_RSS = ETH_MQ_RX_RSS_FLAG | ETH_MQ_RX_DCB_FLAG | + ETH_MQ_RX_VMDQ_FLAG, +}; + +/** + * for rx mq mode backward compatible + */ +#define ETH_RSS ETH_MQ_RX_RSS +#define VMDQ_DCB ETH_MQ_RX_VMDQ_DCB +#define ETH_DCB_RX ETH_MQ_RX_DCB + +/** + * A set of values to identify what method is to be used to transmit + * packets using multi-TCs. + */ +enum rte_eth_tx_mq_mode { + ETH_MQ_TX_NONE = 0, /**< It is in neither DCB nor VT mode. */ + ETH_MQ_TX_DCB, /**< For TX side,only DCB is on. */ + ETH_MQ_TX_VMDQ_DCB, /**< For TX side,both DCB and VT is on. */ + ETH_MQ_TX_VMDQ_ONLY, /**< Only VT on, no DCB */ +}; + +/** + * for tx mq mode backward compatible + */ +#define ETH_DCB_NONE ETH_MQ_TX_NONE +#define ETH_VMDQ_DCB_TX ETH_MQ_TX_VMDQ_DCB +#define ETH_DCB_TX ETH_MQ_TX_DCB + +/** + * A structure used to configure the RX features of an Ethernet port. + */ +struct rte_eth_rxmode { + /** The multi-queue packet distribution mode to be used, e.g. RSS. */ + enum rte_eth_rx_mq_mode mq_mode; + uint32_t max_rx_pkt_len; /**< Only used if JUMBO_FRAME enabled. */ + uint16_t split_hdr_size; /**< hdr buf size (header_split enabled).*/ + /** + * Per-port Rx offloads to be set using DEV_RX_OFFLOAD_* flags. + * Only offloads set on rx_offload_capa field on rte_eth_dev_info + * structure are allowed to be set. + */ + uint64_t offloads; +}; + +/** + * VLAN types to indicate if it is for single VLAN, inner VLAN or outer VLAN. + * Note that single VLAN is treated the same as inner VLAN. + */ +enum rte_vlan_type { + ETH_VLAN_TYPE_UNKNOWN = 0, + ETH_VLAN_TYPE_INNER, /**< Inner VLAN. */ + ETH_VLAN_TYPE_OUTER, /**< Single VLAN, or outer VLAN. */ + ETH_VLAN_TYPE_MAX, +}; + +/** + * A structure used to describe a vlan filter. + * If the bit corresponding to a VID is set, such VID is on. + */ +struct rte_vlan_filter_conf { + uint64_t ids[64]; +}; + +/** + * A structure used to configure the Receive Side Scaling (RSS) feature + * of an Ethernet port. + * If not NULL, the *rss_key* pointer of the *rss_conf* structure points + * to an array holding the RSS key to use for hashing specific header + * fields of received packets. The length of this array should be indicated + * by *rss_key_len* below. Otherwise, a default random hash key is used by + * the device driver. + * + * The *rss_key_len* field of the *rss_conf* structure indicates the length + * in bytes of the array pointed by *rss_key*. To be compatible, this length + * will be checked in i40e only. Others assume 40 bytes to be used as before. + * + * The *rss_hf* field of the *rss_conf* structure indicates the different + * types of IPv4/IPv6 packets to which the RSS hashing must be applied. + * Supplying an *rss_hf* equal to zero disables the RSS feature. + */ +struct rte_eth_rss_conf { + uint8_t *rss_key; /**< If not NULL, 40-byte hash key. */ + uint8_t rss_key_len; /**< hash key length in bytes. 
*/ + uint64_t rss_hf; /**< Hash functions to apply - see below. */ +}; + +/* + * The RSS offload types are defined based on flow types which are defined + * in rte_eth_ctrl.h. Different NIC hardwares may support different RSS offload + * types. The supported flow types or RSS offload types can be queried by + * rte_eth_dev_info_get(). + */ +#define ETH_RSS_IPV4 (1ULL << RTE_ETH_FLOW_IPV4) +#define ETH_RSS_FRAG_IPV4 (1ULL << RTE_ETH_FLOW_FRAG_IPV4) +#define ETH_RSS_NONFRAG_IPV4_TCP (1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_TCP) +#define ETH_RSS_NONFRAG_IPV4_UDP (1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_UDP) +#define ETH_RSS_NONFRAG_IPV4_SCTP (1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_SCTP) +#define ETH_RSS_NONFRAG_IPV4_OTHER (1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_OTHER) +#define ETH_RSS_IPV6 (1ULL << RTE_ETH_FLOW_IPV6) +#define ETH_RSS_FRAG_IPV6 (1ULL << RTE_ETH_FLOW_FRAG_IPV6) +#define ETH_RSS_NONFRAG_IPV6_TCP (1ULL << RTE_ETH_FLOW_NONFRAG_IPV6_TCP) +#define ETH_RSS_NONFRAG_IPV6_UDP (1ULL << RTE_ETH_FLOW_NONFRAG_IPV6_UDP) +#define ETH_RSS_NONFRAG_IPV6_SCTP (1ULL << RTE_ETH_FLOW_NONFRAG_IPV6_SCTP) +#define ETH_RSS_NONFRAG_IPV6_OTHER (1ULL << RTE_ETH_FLOW_NONFRAG_IPV6_OTHER) +#define ETH_RSS_L2_PAYLOAD (1ULL << RTE_ETH_FLOW_L2_PAYLOAD) +#define ETH_RSS_IPV6_EX (1ULL << RTE_ETH_FLOW_IPV6_EX) +#define ETH_RSS_IPV6_TCP_EX (1ULL << RTE_ETH_FLOW_IPV6_TCP_EX) +#define ETH_RSS_IPV6_UDP_EX (1ULL << RTE_ETH_FLOW_IPV6_UDP_EX) +#define ETH_RSS_PORT (1ULL << RTE_ETH_FLOW_PORT) +#define ETH_RSS_VXLAN (1ULL << RTE_ETH_FLOW_VXLAN) +#define ETH_RSS_GENEVE (1ULL << RTE_ETH_FLOW_GENEVE) +#define ETH_RSS_NVGRE (1ULL << RTE_ETH_FLOW_NVGRE) + +#define ETH_RSS_IP ( \ + ETH_RSS_IPV4 | \ + ETH_RSS_FRAG_IPV4 | \ + ETH_RSS_NONFRAG_IPV4_OTHER | \ + ETH_RSS_IPV6 | \ + ETH_RSS_FRAG_IPV6 | \ + ETH_RSS_NONFRAG_IPV6_OTHER | \ + ETH_RSS_IPV6_EX) + +#define ETH_RSS_UDP ( \ + ETH_RSS_NONFRAG_IPV4_UDP | \ + ETH_RSS_NONFRAG_IPV6_UDP | \ + ETH_RSS_IPV6_UDP_EX) + +#define ETH_RSS_TCP ( \ + ETH_RSS_NONFRAG_IPV4_TCP | \ + ETH_RSS_NONFRAG_IPV6_TCP | \ + ETH_RSS_IPV6_TCP_EX) + +#define ETH_RSS_SCTP ( \ + ETH_RSS_NONFRAG_IPV4_SCTP | \ + ETH_RSS_NONFRAG_IPV6_SCTP) + +#define ETH_RSS_TUNNEL ( \ + ETH_RSS_VXLAN | \ + ETH_RSS_GENEVE | \ + ETH_RSS_NVGRE) + +/**< Mask of valid RSS hash protocols */ +#define ETH_RSS_PROTO_MASK ( \ + ETH_RSS_IPV4 | \ + ETH_RSS_FRAG_IPV4 | \ + ETH_RSS_NONFRAG_IPV4_TCP | \ + ETH_RSS_NONFRAG_IPV4_UDP | \ + ETH_RSS_NONFRAG_IPV4_SCTP | \ + ETH_RSS_NONFRAG_IPV4_OTHER | \ + ETH_RSS_IPV6 | \ + ETH_RSS_FRAG_IPV6 | \ + ETH_RSS_NONFRAG_IPV6_TCP | \ + ETH_RSS_NONFRAG_IPV6_UDP | \ + ETH_RSS_NONFRAG_IPV6_SCTP | \ + ETH_RSS_NONFRAG_IPV6_OTHER | \ + ETH_RSS_L2_PAYLOAD | \ + ETH_RSS_IPV6_EX | \ + ETH_RSS_IPV6_TCP_EX | \ + ETH_RSS_IPV6_UDP_EX | \ + ETH_RSS_PORT | \ + ETH_RSS_VXLAN | \ + ETH_RSS_GENEVE | \ + ETH_RSS_NVGRE) + +/* + * Definitions used for redirection table entry size. + * Some RSS RETA sizes may not be supported by some drivers, check the + * documentation or the description of relevant functions for more details. + */ +#define ETH_RSS_RETA_SIZE_64 64 +#define ETH_RSS_RETA_SIZE_128 128 +#define ETH_RSS_RETA_SIZE_256 256 +#define ETH_RSS_RETA_SIZE_512 512 +#define RTE_RETA_GROUP_SIZE 64 + +/* Definitions used for VMDQ and DCB functionality */ +#define ETH_VMDQ_MAX_VLAN_FILTERS 64 /**< Maximum nb. of VMDQ vlan filters. */ +#define ETH_DCB_NUM_USER_PRIORITIES 8 /**< Maximum nb. of DCB priorities. */ +#define ETH_VMDQ_DCB_NUM_QUEUES 128 /**< Maximum nb. of VMDQ DCB queues. */ +#define ETH_DCB_NUM_QUEUES 128 /**< Maximum nb. of DCB queues. 
*/ + +/* DCB capability defines */ +#define ETH_DCB_PG_SUPPORT 0x00000001 /**< Priority Group(ETS) support. */ +#define ETH_DCB_PFC_SUPPORT 0x00000002 /**< Priority Flow Control support. */ + +/* Definitions used for VLAN Offload functionality */ +#define ETH_VLAN_STRIP_OFFLOAD 0x0001 /**< VLAN Strip On/Off */ +#define ETH_VLAN_FILTER_OFFLOAD 0x0002 /**< VLAN Filter On/Off */ +#define ETH_VLAN_EXTEND_OFFLOAD 0x0004 /**< VLAN Extend On/Off */ + +/* Definitions used for mask VLAN setting */ +#define ETH_VLAN_STRIP_MASK 0x0001 /**< VLAN Strip setting mask */ +#define ETH_VLAN_FILTER_MASK 0x0002 /**< VLAN Filter setting mask*/ +#define ETH_VLAN_EXTEND_MASK 0x0004 /**< VLAN Extend setting mask*/ +#define ETH_VLAN_ID_MAX 0x0FFF /**< VLAN ID is in lower 12 bits*/ + +/* Definitions used for receive MAC address */ +#define ETH_NUM_RECEIVE_MAC_ADDR 128 /**< Maximum nb. of receive mac addr. */ + +/* Definitions used for unicast hash */ +#define ETH_VMDQ_NUM_UC_HASH_ARRAY 128 /**< Maximum nb. of UC hash array. */ + +/* Definitions used for VMDQ pool rx mode setting */ +#define ETH_VMDQ_ACCEPT_UNTAG 0x0001 /**< accept untagged packets. */ +#define ETH_VMDQ_ACCEPT_HASH_MC 0x0002 /**< accept packets in multicast table . */ +#define ETH_VMDQ_ACCEPT_HASH_UC 0x0004 /**< accept packets in unicast table. */ +#define ETH_VMDQ_ACCEPT_BROADCAST 0x0008 /**< accept broadcast packets. */ +#define ETH_VMDQ_ACCEPT_MULTICAST 0x0010 /**< multicast promiscuous. */ + +/** Maximum nb. of vlan per mirror rule */ +#define ETH_MIRROR_MAX_VLANS 64 + +#define ETH_MIRROR_VIRTUAL_POOL_UP 0x01 /**< Virtual Pool uplink Mirroring. */ +#define ETH_MIRROR_UPLINK_PORT 0x02 /**< Uplink Port Mirroring. */ +#define ETH_MIRROR_DOWNLINK_PORT 0x04 /**< Downlink Port Mirroring. */ +#define ETH_MIRROR_VLAN 0x08 /**< VLAN Mirroring. */ +#define ETH_MIRROR_VIRTUAL_POOL_DOWN 0x10 /**< Virtual Pool downlink Mirroring. */ + +/** + * A structure used to configure VLAN traffic mirror of an Ethernet port. + */ +struct rte_eth_vlan_mirror { + uint64_t vlan_mask; /**< mask for valid VLAN ID. */ + /** VLAN ID list for vlan mirroring. */ + uint16_t vlan_id[ETH_MIRROR_MAX_VLANS]; +}; + +/** + * A structure used to configure traffic mirror of an Ethernet port. + */ +struct rte_eth_mirror_conf { + uint8_t rule_type; /**< Mirroring rule type */ + uint8_t dst_pool; /**< Destination pool for this mirror rule. */ + uint64_t pool_mask; /**< Bitmap of pool for pool mirroring */ + /** VLAN ID setting for VLAN mirroring. */ + struct rte_eth_vlan_mirror vlan; +}; + +/** + * A structure used to configure 64 entries of Redirection Table of the + * Receive Side Scaling (RSS) feature of an Ethernet port. To configure + * more than 64 entries supported by hardware, an array of this structure + * is needed. + */ +struct rte_eth_rss_reta_entry64 { + uint64_t mask; + /**< Mask bits indicate which entries need to be updated/queried. */ + uint16_t reta[RTE_RETA_GROUP_SIZE]; + /**< Group of 64 redirection table entries. */ +}; + +/** + * This enum indicates the possible number of traffic classes + * in DCB configurations + */ +enum rte_eth_nb_tcs { + ETH_4_TCS = 4, /**< 4 TCs with DCB. */ + ETH_8_TCS = 8 /**< 8 TCs with DCB. */ +}; + +/** + * This enum indicates the possible number of queue pools + * in VMDQ configurations. + */ +enum rte_eth_nb_pools { + ETH_8_POOLS = 8, /**< 8 VMDq pools. */ + ETH_16_POOLS = 16, /**< 16 VMDq pools. */ + ETH_32_POOLS = 32, /**< 32 VMDq pools. */ + ETH_64_POOLS = 64 /**< 64 VMDq pools. */ +}; + +/* This structure may be extended in future. 
*/ +struct rte_eth_dcb_rx_conf { + enum rte_eth_nb_tcs nb_tcs; /**< Possible DCB TCs, 4 or 8 TCs */ + /** Traffic class each UP mapped to. */ + uint8_t dcb_tc[ETH_DCB_NUM_USER_PRIORITIES]; +}; + +struct rte_eth_vmdq_dcb_tx_conf { + enum rte_eth_nb_pools nb_queue_pools; /**< With DCB, 16 or 32 pools. */ + /** Traffic class each UP mapped to. */ + uint8_t dcb_tc[ETH_DCB_NUM_USER_PRIORITIES]; +}; + +struct rte_eth_dcb_tx_conf { + enum rte_eth_nb_tcs nb_tcs; /**< Possible DCB TCs, 4 or 8 TCs. */ + /** Traffic class each UP mapped to. */ + uint8_t dcb_tc[ETH_DCB_NUM_USER_PRIORITIES]; +}; + +struct rte_eth_vmdq_tx_conf { + enum rte_eth_nb_pools nb_queue_pools; /**< VMDq mode, 64 pools. */ +}; + +/** + * A structure used to configure the VMDQ+DCB feature + * of an Ethernet port. + * + * Using this feature, packets are routed to a pool of queues, based + * on the vlan id in the vlan tag, and then to a specific queue within + * that pool, using the user priority vlan tag field. + * + * A default pool may be used, if desired, to route all traffic which + * does not match the vlan filter rules. + */ +struct rte_eth_vmdq_dcb_conf { + enum rte_eth_nb_pools nb_queue_pools; /**< With DCB, 16 or 32 pools */ + uint8_t enable_default_pool; /**< If non-zero, use a default pool */ + uint8_t default_pool; /**< The default pool, if applicable */ + uint8_t nb_pool_maps; /**< We can have up to 64 filters/mappings */ + struct { + uint16_t vlan_id; /**< The vlan id of the received frame */ + uint64_t pools; /**< Bitmask of pools for packet rx */ + } pool_map[ETH_VMDQ_MAX_VLAN_FILTERS]; /**< VMDq vlan pool maps. */ + uint8_t dcb_tc[ETH_DCB_NUM_USER_PRIORITIES]; + /**< Selects a queue in a pool */ +}; + +/** + * A structure used to configure the VMDQ feature of an Ethernet port when + * not combined with the DCB feature. + * + * Using this feature, packets are routed to a pool of queues. By default, + * the pool selection is based on the MAC address, the vlan id in the + * vlan tag as specified in the pool_map array. + * Passing the ETH_VMDQ_ACCEPT_UNTAG in the rx_mode field allows pool + * selection using only the MAC address. MAC address to pool mapping is done + * using the rte_eth_dev_mac_addr_add function, with the pool parameter + * corresponding to the pool id. + * + * Queue selection within the selected pool will be done using RSS when + * it is enabled or revert to the first queue of the pool if not. + * + * A default pool may be used, if desired, to route all traffic which + * does not match the vlan filter rules or any pool MAC address. + */ +struct rte_eth_vmdq_rx_conf { + enum rte_eth_nb_pools nb_queue_pools; /**< VMDq only mode, 8 or 64 pools */ + uint8_t enable_default_pool; /**< If non-zero, use a default pool */ + uint8_t default_pool; /**< The default pool, if applicable */ + uint8_t enable_loop_back; /**< Enable VT loop back */ + uint8_t nb_pool_maps; /**< We can have up to 64 filters/mappings */ + uint32_t rx_mode; /**< Flags from ETH_VMDQ_ACCEPT_* */ + struct { + uint16_t vlan_id; /**< The vlan id of the received frame */ + uint64_t pools; /**< Bitmask of pools for packet rx */ + } pool_map[ETH_VMDQ_MAX_VLAN_FILTERS]; /**< VMDq vlan pool maps. */ +}; + +/** + * A structure used to configure the TX features of an Ethernet port. + */ +struct rte_eth_txmode { + enum rte_eth_tx_mq_mode mq_mode; /**< TX multi-queues mode. */ + /** + * Per-port Tx offloads to be set using DEV_TX_OFFLOAD_* flags. + * Only offloads set on tx_offload_capa field on rte_eth_dev_info + * structure are allowed to be set. 
+ */ + uint64_t offloads; + + /* For i40e specifically */ + uint16_t pvid; + __extension__ + uint8_t hw_vlan_reject_tagged : 1, + /**< If set, reject sending out tagged pkts */ + hw_vlan_reject_untagged : 1, + /**< If set, reject sending out untagged pkts */ + hw_vlan_insert_pvid : 1; + /**< If set, enable port based VLAN insertion */ +}; + +/** + * A structure used to configure an RX ring of an Ethernet port. + */ +struct rte_eth_rxconf { + struct rte_eth_thresh rx_thresh; /**< RX ring threshold registers. */ + uint16_t rx_free_thresh; /**< Drives the freeing of RX descriptors. */ + uint8_t rx_drop_en; /**< Drop packets if no descriptors are available. */ + uint8_t rx_deferred_start; /**< Do not start queue with rte_eth_dev_start(). */ + /** + * Per-queue Rx offloads to be set using DEV_RX_OFFLOAD_* flags. + * Only offloads set on rx_queue_offload_capa or rx_offload_capa + * fields on rte_eth_dev_info structure are allowed to be set. + */ + uint64_t offloads; +}; + +/** + * A structure used to configure a TX ring of an Ethernet port. + */ +struct rte_eth_txconf { + struct rte_eth_thresh tx_thresh; /**< TX ring threshold registers. */ + uint16_t tx_rs_thresh; /**< Drives the setting of RS bit on TXDs. */ + uint16_t tx_free_thresh; /**< Start freeing TX buffers if there are + less free descriptors than this value. */ + + uint8_t tx_deferred_start; /**< Do not start queue with rte_eth_dev_start(). */ + /** + * Per-queue Tx offloads to be set using DEV_TX_OFFLOAD_* flags. + * Only offloads set on tx_queue_offload_capa or tx_offload_capa + * fields on rte_eth_dev_info structure are allowed to be set. + */ + uint64_t offloads; +}; + +/** + * A structure contains information about HW descriptor ring limitations. + */ +struct rte_eth_desc_lim { + uint16_t nb_max; /**< Max allowed number of descriptors. */ + uint16_t nb_min; /**< Min allowed number of descriptors. */ + uint16_t nb_align; /**< Number of descriptors should be aligned to. */ + + /** + * Max allowed number of segments per whole packet. + * + * - For TSO packet this is the total number of data descriptors allowed + * by device. + * + * @see nb_mtu_seg_max + */ + uint16_t nb_seg_max; + + /** + * Max number of segments per one MTU. + * + * - For non-TSO packet, this is the maximum allowed number of segments + * in a single transmit packet. + * + * - For TSO packet each segment within the TSO may span up to this + * value. + * + * @see nb_seg_max + */ + uint16_t nb_mtu_seg_max; +}; + +/** + * This enum indicates the flow control mode + */ +enum rte_eth_fc_mode { + RTE_FC_NONE = 0, /**< Disable flow control. */ + RTE_FC_RX_PAUSE, /**< RX pause frame, enable flowctrl on TX side. */ + RTE_FC_TX_PAUSE, /**< TX pause frame, enable flowctrl on RX side. */ + RTE_FC_FULL /**< Enable flow control on both side. */ +}; + +/** + * A structure used to configure Ethernet flow control parameter. + * These parameters will be configured into the register of the NIC. + * Please refer to the corresponding data sheet for proper value. 
+ */ +struct rte_eth_fc_conf { + uint32_t high_water; /**< High threshold value to trigger XOFF */ + uint32_t low_water; /**< Low threshold value to trigger XON */ + uint16_t pause_time; /**< Pause quota in the Pause frame */ + uint16_t send_xon; /**< Is XON frame need be sent */ + enum rte_eth_fc_mode mode; /**< Link flow control mode */ + uint8_t mac_ctrl_frame_fwd; /**< Forward MAC control frames */ + uint8_t autoneg; /**< Use Pause autoneg */ +}; + +/** + * A structure used to configure Ethernet priority flow control parameter. + * These parameters will be configured into the register of the NIC. + * Please refer to the corresponding data sheet for proper value. + */ +struct rte_eth_pfc_conf { + struct rte_eth_fc_conf fc; /**< General flow control parameter. */ + uint8_t priority; /**< VLAN User Priority. */ +}; + +/** + * Memory space that can be configured to store Flow Director filters + * in the board memory. + */ +enum rte_fdir_pballoc_type { + RTE_FDIR_PBALLOC_64K = 0, /**< 64k. */ + RTE_FDIR_PBALLOC_128K, /**< 128k. */ + RTE_FDIR_PBALLOC_256K, /**< 256k. */ +}; + +/** + * Select report mode of FDIR hash information in RX descriptors. + */ +enum rte_fdir_status_mode { + RTE_FDIR_NO_REPORT_STATUS = 0, /**< Never report FDIR hash. */ + RTE_FDIR_REPORT_STATUS, /**< Only report FDIR hash for matching pkts. */ + RTE_FDIR_REPORT_STATUS_ALWAYS, /**< Always report FDIR hash. */ +}; + +/** + * A structure used to configure the Flow Director (FDIR) feature + * of an Ethernet port. + * + * If mode is RTE_FDIR_DISABLE, the pballoc value is ignored. + */ +struct rte_fdir_conf { + enum rte_fdir_mode mode; /**< Flow Director mode. */ + enum rte_fdir_pballoc_type pballoc; /**< Space for FDIR filters. */ + enum rte_fdir_status_mode status; /**< How to report FDIR hash. */ + /** RX queue of packets matching a "drop" filter in perfect mode. */ + uint8_t drop_queue; + struct rte_eth_fdir_masks mask; + struct rte_eth_fdir_flex_conf flex_conf; + /**< Flex payload configuration. */ +}; + +/** + * UDP tunneling configuration. + * Used to config the UDP port for a type of tunnel. + * NICs need the UDP port to identify the tunnel type. + * Normally a type of tunnel has a default UDP port, this structure can be used + * in case if the users want to change or support more UDP port. + */ +struct rte_eth_udp_tunnel { + uint16_t udp_port; /**< UDP port used for the tunnel. */ + uint8_t prot_type; /**< Tunnel type. Defined in rte_eth_tunnel_type. */ +}; + +/** + * A structure used to enable/disable specific device interrupts. + */ +struct rte_intr_conf { + /** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */ + uint32_t lsc:1; + /** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */ + uint32_t rxq:1; + /** enable/disable rmv interrupt. 0 (default) - disable, 1 enable */ + uint32_t rmv:1; +}; + +/** + * A structure used to configure an Ethernet port. + * Depending upon the RX multi-queue mode, extra advanced + * configuration settings may be needed. + */ +struct rte_eth_conf { + uint32_t link_speeds; /**< bitmap of ETH_LINK_SPEED_XXX of speeds to be + used. ETH_LINK_SPEED_FIXED disables link + autonegotiation, and a unique speed shall be + set. Otherwise, the bitmap defines the set of + speeds to be advertised. If the special value + ETH_LINK_SPEED_AUTONEG (0) is used, all speeds + supported are advertised. */ + struct rte_eth_rxmode rxmode; /**< Port RX configuration. */ + struct rte_eth_txmode txmode; /**< Port TX configuration. 
*/ + uint32_t lpbk_mode; /**< Loopback operation mode. By default the value + is 0, meaning the loopback mode is disabled. + Read the datasheet of given ethernet controller + for details. The possible values of this field + are defined in implementation of each driver. */ + struct { + struct rte_eth_rss_conf rss_conf; /**< Port RSS configuration */ + struct rte_eth_vmdq_dcb_conf vmdq_dcb_conf; + /**< Port vmdq+dcb configuration. */ + struct rte_eth_dcb_rx_conf dcb_rx_conf; + /**< Port dcb RX configuration. */ + struct rte_eth_vmdq_rx_conf vmdq_rx_conf; + /**< Port vmdq RX configuration. */ + } rx_adv_conf; /**< Port RX filtering configuration. */ + union { + struct rte_eth_vmdq_dcb_tx_conf vmdq_dcb_tx_conf; + /**< Port vmdq+dcb TX configuration. */ + struct rte_eth_dcb_tx_conf dcb_tx_conf; + /**< Port dcb TX configuration. */ + struct rte_eth_vmdq_tx_conf vmdq_tx_conf; + /**< Port vmdq TX configuration. */ + } tx_adv_conf; /**< Port TX DCB configuration (union). */ + /** Currently,Priority Flow Control(PFC) are supported,if DCB with PFC + is needed,and the variable must be set ETH_DCB_PFC_SUPPORT. */ + uint32_t dcb_capability_en; + struct rte_fdir_conf fdir_conf; /**< FDIR configuration. */ + struct rte_intr_conf intr_conf; /**< Interrupt mode configuration. */ +}; + +/** + * A structure used to retrieve the contextual information of + * an Ethernet device, such as the controlling driver of the device, + * its PCI context, etc... + */ + +/** + * RX offload capabilities of a device. + */ +#define DEV_RX_OFFLOAD_VLAN_STRIP 0x00000001 +#define DEV_RX_OFFLOAD_IPV4_CKSUM 0x00000002 +#define DEV_RX_OFFLOAD_UDP_CKSUM 0x00000004 +#define DEV_RX_OFFLOAD_TCP_CKSUM 0x00000008 +#define DEV_RX_OFFLOAD_TCP_LRO 0x00000010 +#define DEV_RX_OFFLOAD_QINQ_STRIP 0x00000020 +#define DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM 0x00000040 +#define DEV_RX_OFFLOAD_MACSEC_STRIP 0x00000080 +#define DEV_RX_OFFLOAD_HEADER_SPLIT 0x00000100 +#define DEV_RX_OFFLOAD_VLAN_FILTER 0x00000200 +#define DEV_RX_OFFLOAD_VLAN_EXTEND 0x00000400 +#define DEV_RX_OFFLOAD_JUMBO_FRAME 0x00000800 +#define DEV_RX_OFFLOAD_CRC_STRIP 0x00001000 +#define DEV_RX_OFFLOAD_SCATTER 0x00002000 +#define DEV_RX_OFFLOAD_TIMESTAMP 0x00004000 +#define DEV_RX_OFFLOAD_SECURITY 0x00008000 + +/** + * Invalid to set both DEV_RX_OFFLOAD_CRC_STRIP and DEV_RX_OFFLOAD_KEEP_CRC + * No DEV_RX_OFFLOAD_CRC_STRIP flag means keep CRC + */ +#define DEV_RX_OFFLOAD_KEEP_CRC 0x00010000 +#define DEV_RX_OFFLOAD_CHECKSUM (DEV_RX_OFFLOAD_IPV4_CKSUM | \ + DEV_RX_OFFLOAD_UDP_CKSUM | \ + DEV_RX_OFFLOAD_TCP_CKSUM) +#define DEV_RX_OFFLOAD_VLAN (DEV_RX_OFFLOAD_VLAN_STRIP | \ + DEV_RX_OFFLOAD_VLAN_FILTER | \ + DEV_RX_OFFLOAD_VLAN_EXTEND) + +/* + * If new Rx offload capabilities are defined, they also must be + * mentioned in rte_rx_offload_names in rte_ethdev.c file. + */ + +/** + * TX offload capabilities of a device. + */ +#define DEV_TX_OFFLOAD_VLAN_INSERT 0x00000001 +#define DEV_TX_OFFLOAD_IPV4_CKSUM 0x00000002 +#define DEV_TX_OFFLOAD_UDP_CKSUM 0x00000004 +#define DEV_TX_OFFLOAD_TCP_CKSUM 0x00000008 +#define DEV_TX_OFFLOAD_SCTP_CKSUM 0x00000010 +#define DEV_TX_OFFLOAD_TCP_TSO 0x00000020 +#define DEV_TX_OFFLOAD_UDP_TSO 0x00000040 +#define DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM 0x00000080 /**< Used for tunneling packet. */ +#define DEV_TX_OFFLOAD_QINQ_INSERT 0x00000100 +#define DEV_TX_OFFLOAD_VXLAN_TNL_TSO 0x00000200 /**< Used for tunneling packet. */ +#define DEV_TX_OFFLOAD_GRE_TNL_TSO 0x00000400 /**< Used for tunneling packet. 
*/ +#define DEV_TX_OFFLOAD_IPIP_TNL_TSO 0x00000800 /**< Used for tunneling packet. */ +#define DEV_TX_OFFLOAD_GENEVE_TNL_TSO 0x00001000 /**< Used for tunneling packet. */ +#define DEV_TX_OFFLOAD_MACSEC_INSERT 0x00002000 +#define DEV_TX_OFFLOAD_MT_LOCKFREE 0x00004000 +/**< Multiple threads can invoke rte_eth_tx_burst() concurrently on the same + * tx queue without SW lock. + */ +#define DEV_TX_OFFLOAD_MULTI_SEGS 0x00008000 +/**< Device supports multi segment send. */ +#define DEV_TX_OFFLOAD_MBUF_FAST_FREE 0x00010000 +/**< Device supports optimization for fast release of mbufs. + * When set application must guarantee that per-queue all mbufs comes from + * the same mempool and has refcnt = 1. + */ +#define DEV_TX_OFFLOAD_SECURITY 0x00020000 +/** + * Device supports generic UDP tunneled packet TSO. + * Application must set PKT_TX_TUNNEL_UDP and other mbuf fields required + * for tunnel TSO. + */ +#define DEV_TX_OFFLOAD_UDP_TNL_TSO 0x00040000 +/** + * Device supports generic IP tunneled packet TSO. + * Application must set PKT_TX_TUNNEL_IP and other mbuf fields required + * for tunnel TSO. + */ +#define DEV_TX_OFFLOAD_IP_TNL_TSO 0x00080000 + +#define RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP 0x00000001 +/**< Device supports Rx queue setup after device started*/ +#define RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP 0x00000002 +/**< Device supports Tx queue setup after device started*/ + +/* + * If new Tx offload capabilities are defined, they also must be + * mentioned in rte_tx_offload_names in rte_ethdev.c file. + */ + +/* + * Fallback default preferred Rx/Tx port parameters. + * These are used if an application requests default parameters + * but the PMD does not provide preferred values. + */ +#define RTE_ETH_DEV_FALLBACK_RX_RINGSIZE 512 +#define RTE_ETH_DEV_FALLBACK_TX_RINGSIZE 512 +#define RTE_ETH_DEV_FALLBACK_RX_NBQUEUES 1 +#define RTE_ETH_DEV_FALLBACK_TX_NBQUEUES 1 + +/** + * Preferred Rx/Tx port parameters. + * There are separate instances of this structure for transmission + * and reception respectively. + */ +struct rte_eth_dev_portconf { + uint16_t burst_size; /**< Device-preferred burst size */ + uint16_t ring_size; /**< Device-preferred size of queue rings */ + uint16_t nb_queues; /**< Device-preferred number of queues */ +}; + +/** + * Default values for switch domain id when ethdev does not support switch + * domain definitions. + */ +#define RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID (0) + +/** + * Ethernet device associated switch information + */ +struct rte_eth_switch_info { + const char *name; /**< switch name */ + uint16_t domain_id; /**< switch domain id */ + uint16_t port_id; + /**< + * mapping to the devices physical switch port as enumerated from the + * perspective of the embedded interconnect/switch. For SR-IOV enabled + * device this may correspond to the VF_ID of each virtual function, + * but each driver should explicitly define the mapping of switch + * port identifier to that physical interconnect/switch + */ +}; + +/** + * Ethernet device information + */ +struct rte_eth_dev_info { + struct rte_device *device; /** Generic device information */ + const char *driver_name; /**< Device Driver name. */ + unsigned int if_index; /**< Index to bound host interface, or 0 if none. + Use if_indextoname() to translate into an interface name. */ + const uint32_t *dev_flags; /**< Device flags */ + uint32_t min_rx_bufsize; /**< Minimum size of RX buffer. */ + uint32_t max_rx_pktlen; /**< Maximum configurable length of RX pkt. */ + uint16_t max_rx_queues; /**< Maximum number of RX queues. 
*/ + uint16_t max_tx_queues; /**< Maximum number of TX queues. */ + uint32_t max_mac_addrs; /**< Maximum number of MAC addresses. */ + uint32_t max_hash_mac_addrs; + /** Maximum number of hash MAC addresses for MTA and UTA. */ + uint16_t max_vfs; /**< Maximum number of VFs. */ + uint16_t max_vmdq_pools; /**< Maximum number of VMDq pools. */ + uint64_t rx_offload_capa; + /**< All RX offload capabilities including all per-queue ones */ + uint64_t tx_offload_capa; + /**< All TX offload capabilities including all per-queue ones */ + uint64_t rx_queue_offload_capa; + /**< Device per-queue RX offload capabilities. */ + uint64_t tx_queue_offload_capa; + /**< Device per-queue TX offload capabilities. */ + uint16_t reta_size; + /**< Device redirection table size, the total number of entries. */ + uint8_t hash_key_size; /**< Hash key size in bytes */ + /** Bit mask of RSS offloads, the bit offset also means flow type */ + uint64_t flow_type_rss_offloads; + struct rte_eth_rxconf default_rxconf; /**< Default RX configuration */ + struct rte_eth_txconf default_txconf; /**< Default TX configuration */ + uint16_t vmdq_queue_base; /**< First queue ID for VMDQ pools. */ + uint16_t vmdq_queue_num; /**< Queue number for VMDQ pools. */ + uint16_t vmdq_pool_base; /**< First ID of VMDQ pools. */ + struct rte_eth_desc_lim rx_desc_lim; /**< RX descriptors limits */ + struct rte_eth_desc_lim tx_desc_lim; /**< TX descriptors limits */ + uint32_t speed_capa; /**< Supported speeds bitmap (ETH_LINK_SPEED_). */ + /** Configured number of rx/tx queues */ + uint16_t nb_rx_queues; /**< Number of RX queues. */ + uint16_t nb_tx_queues; /**< Number of TX queues. */ + /** Rx parameter recommendations */ + struct rte_eth_dev_portconf default_rxportconf; + /** Tx parameter recommendations */ + struct rte_eth_dev_portconf default_txportconf; + /** Generic device capabilities (RTE_ETH_DEV_CAPA_). */ + uint64_t dev_capa; + /** + * Switching information for ports on a device with a + * embedded managed interconnect/switch. + */ + struct rte_eth_switch_info switch_info; +}; + +/** + * Ethernet device RX queue information structure. + * Used to retieve information about configured queue. + */ +struct rte_eth_rxq_info { + struct rte_mempool *mp; /**< mempool used by that queue. */ + struct rte_eth_rxconf conf; /**< queue config parameters. */ + uint8_t scattered_rx; /**< scattered packets RX supported. */ + uint16_t nb_desc; /**< configured number of RXDs. */ +} __rte_cache_min_aligned; + +/** + * Ethernet device TX queue information structure. + * Used to retrieve information about configured queue. + */ +struct rte_eth_txq_info { + struct rte_eth_txconf conf; /**< queue config parameters. */ + uint16_t nb_desc; /**< configured number of TXDs. */ +} __rte_cache_min_aligned; + +/** Maximum name length for extended statistics counters */ +#define RTE_ETH_XSTATS_NAME_SIZE 64 + +/** + * An Ethernet device extended statistic structure + * + * This structure is used by rte_eth_xstats_get() to provide + * statistics that are not provided in the generic *rte_eth_stats* + * structure. + * It maps a name id, corresponding to an index in the array returned + * by rte_eth_xstats_get_names(), to a statistic value. + */ +struct rte_eth_xstat { + uint64_t id; /**< The index in xstats name array. */ + uint64_t value; /**< The statistic counter value. */ +}; + +/** + * A name element for extended statistics. + * + * An array of this structure is returned by rte_eth_xstats_get_names(). + * It lists the names of extended statistics for a PMD. 
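+ *
+ * Purely as an illustration (the helper name is a placeholder; <stdio.h>,
+ * <stdlib.h> and <inttypes.h> are assumed to be included), the names and the
+ * values are typically fetched together, sizing both arrays from a first
+ * call made with a NULL array:
+ *
+ * @code
+ * static void
+ * example_dump_xstats(uint16_t port_id)
+ * {
+ *     struct rte_eth_xstat_name *names;
+ *     struct rte_eth_xstat *values;
+ *     int nb, i;
+ *
+ *     nb = rte_eth_xstats_get_names(port_id, NULL, 0);
+ *     if (nb <= 0)
+ *         return;
+ *     names = calloc(nb, sizeof(*names));
+ *     values = calloc(nb, sizeof(*values));
+ *     if (names != NULL && values != NULL &&
+ *         rte_eth_xstats_get_names(port_id, names, nb) == nb &&
+ *         rte_eth_xstats_get(port_id, values, nb) == nb) {
+ *         for (i = 0; i < nb; i++)
+ *             printf("%s: %" PRIu64 "\n",
+ *                    names[values[i].id].name, values[i].value);
+ *     }
+ *     free(names);
+ *     free(values);
+ * }
+ * @endcode
+ *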
The *rte_eth_xstat* + * structure references these names by their array index. + */ +struct rte_eth_xstat_name { + char name[RTE_ETH_XSTATS_NAME_SIZE]; /**< The statistic name. */ +}; + +#define ETH_DCB_NUM_TCS 8 +#define ETH_MAX_VMDQ_POOL 64 + +/** + * A structure used to get the information of queue and + * TC mapping on both TX and RX paths. + */ +struct rte_eth_dcb_tc_queue_mapping { + /** rx queues assigned to tc per Pool */ + struct { + uint8_t base; + uint8_t nb_queue; + } tc_rxq[ETH_MAX_VMDQ_POOL][ETH_DCB_NUM_TCS]; + /** rx queues assigned to tc per Pool */ + struct { + uint8_t base; + uint8_t nb_queue; + } tc_txq[ETH_MAX_VMDQ_POOL][ETH_DCB_NUM_TCS]; +}; + +/** + * A structure used to get the information of DCB. + * It includes TC UP mapping and queue TC mapping. + */ +struct rte_eth_dcb_info { + uint8_t nb_tcs; /**< number of TCs */ + uint8_t prio_tc[ETH_DCB_NUM_USER_PRIORITIES]; /**< Priority to tc */ + uint8_t tc_bws[ETH_DCB_NUM_TCS]; /**< TX BW percentage for each TC */ + /** rx queues assigned to tc */ + struct rte_eth_dcb_tc_queue_mapping tc_queue; +}; + +/** + * RX/TX queue states + */ +#define RTE_ETH_QUEUE_STATE_STOPPED 0 +#define RTE_ETH_QUEUE_STATE_STARTED 1 + +#define RTE_ETH_ALL RTE_MAX_ETHPORTS + +/* Macros to check for valid port */ +#define RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, retval) do { \ + if (!rte_eth_dev_is_valid_port(port_id)) { \ + RTE_ETHDEV_LOG(ERR, "Invalid port_id=%u\n", port_id); \ + return retval; \ + } \ +} while (0) + +#define RTE_ETH_VALID_PORTID_OR_RET(port_id) do { \ + if (!rte_eth_dev_is_valid_port(port_id)) { \ + RTE_ETHDEV_LOG(ERR, "Invalid port_id=%u\n", port_id); \ + return; \ + } \ +} while (0) + +/** + * l2 tunnel configuration. + */ + +/**< l2 tunnel enable mask */ +#define ETH_L2_TUNNEL_ENABLE_MASK 0x00000001 +/**< l2 tunnel insertion mask */ +#define ETH_L2_TUNNEL_INSERTION_MASK 0x00000002 +/**< l2 tunnel stripping mask */ +#define ETH_L2_TUNNEL_STRIPPING_MASK 0x00000004 +/**< l2 tunnel forwarding mask */ +#define ETH_L2_TUNNEL_FORWARDING_MASK 0x00000008 + +/** + * Function type used for RX packet processing packet callbacks. + * + * The callback function is called on RX with a burst of packets that have + * been received on the given port and queue. + * + * @param port_id + * The Ethernet port on which RX is being performed. + * @param queue + * The queue on the Ethernet port which is being used to receive the packets. + * @param pkts + * The burst of packets that have just been received. + * @param nb_pkts + * The number of packets in the burst pointed to by "pkts". + * @param max_pkts + * The max number of packets that can be stored in the "pkts" array. + * @param user_param + * The arbitrary user parameter passed in by the application when the callback + * was originally configured. + * @return + * The number of packets returned to the user. + */ +typedef uint16_t (*rte_rx_callback_fn)(uint16_t port_id, uint16_t queue, + struct rte_mbuf *pkts[], uint16_t nb_pkts, uint16_t max_pkts, + void *user_param); + +/** + * Function type used for TX packet processing packet callbacks. + * + * The callback function is called on TX with a burst of packets immediately + * before the packets are put onto the hardware queue for transmission. + * + * @param port_id + * The Ethernet port on which TX is being performed. + * @param queue + * The queue on the Ethernet port which is being used to transmit the packets. + * @param pkts + * The burst of packets that are about to be transmitted. 
+ * @param nb_pkts + * The number of packets in the burst pointed to by "pkts". + * @param user_param + * The arbitrary user parameter passed in by the application when the callback + * was originally configured. + * @return + * The number of packets to be written to the NIC. + */ +typedef uint16_t (*rte_tx_callback_fn)(uint16_t port_id, uint16_t queue, + struct rte_mbuf *pkts[], uint16_t nb_pkts, void *user_param); + +/** + * Possible states of an ethdev port. + */ +enum rte_eth_dev_state { + /** Device is unused before being probed. */ + RTE_ETH_DEV_UNUSED = 0, + /** Device is attached when allocated in probing. */ + RTE_ETH_DEV_ATTACHED, + /** The deferred state is useless and replaced by ownership. */ + RTE_ETH_DEV_DEFERRED, + /** Device is in removed state when plug-out is detected. */ + RTE_ETH_DEV_REMOVED, +}; + +struct rte_eth_dev_sriov { + uint8_t active; /**< SRIOV is active with 16, 32 or 64 pools */ + uint8_t nb_q_per_pool; /**< rx queue number per pool */ + uint16_t def_vmdq_idx; /**< Default pool num used for PF */ + uint16_t def_pool_q_idx; /**< Default pool queue start reg index */ +}; +#define RTE_ETH_DEV_SRIOV(dev) ((dev)->data->sriov) + +#define RTE_ETH_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN + +#define RTE_ETH_DEV_NO_OWNER 0 + +#define RTE_ETH_MAX_OWNER_NAME_LEN 64 + +struct rte_eth_dev_owner { + uint64_t id; /**< The owner unique identifier. */ + char name[RTE_ETH_MAX_OWNER_NAME_LEN]; /**< The owner name. */ +}; + +/** Device supports link state interrupt */ +#define RTE_ETH_DEV_INTR_LSC 0x0002 +/** Device is a bonded slave */ +#define RTE_ETH_DEV_BONDED_SLAVE 0x0004 +/** Device supports device removal interrupt */ +#define RTE_ETH_DEV_INTR_RMV 0x0008 +/** Device is port representor */ +#define RTE_ETH_DEV_REPRESENTOR 0x0010 + +/** + * Iterates over valid ethdev ports owned by a specific owner. + * + * @param port_id + * The id of the next possible valid owned port. + * @param owner_id + * The owner identifier. + * RTE_ETH_DEV_NO_OWNER means iterate over all valid ownerless ports. + * @return + * Next valid port id owned by owner_id, RTE_MAX_ETHPORTS if there is none. + */ +uint64_t rte_eth_find_next_owned_by(uint16_t port_id, + const uint64_t owner_id); + +/** + * Macro to iterate over all enabled ethdev ports owned by a specific owner. + */ +#define RTE_ETH_FOREACH_DEV_OWNED_BY(p, o) \ + for (p = rte_eth_find_next_owned_by(0, o); \ + (unsigned int)p < (unsigned int)RTE_MAX_ETHPORTS; \ + p = rte_eth_find_next_owned_by(p + 1, o)) + +/** + * Iterates over valid ethdev ports. + * + * @param port_id + * The id of the next possible valid port. + * @return + * Next valid port id, RTE_MAX_ETHPORTS if there is none. + */ +uint16_t rte_eth_find_next(uint16_t port_id); + +/** + * Macro to iterate over all enabled and ownerless ethdev ports. + */ +#define RTE_ETH_FOREACH_DEV(p) \ + RTE_ETH_FOREACH_DEV_OWNED_BY(p, RTE_ETH_DEV_NO_OWNER) + + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Get a new unique owner identifier. + * An owner identifier is used to owns Ethernet devices by only one DPDK entity + * to avoid multiple management of device by different entities. + * + * @param owner_id + * Owner identifier pointer. + * @return + * Negative errno value on error, 0 on success. + */ +int __rte_experimental rte_eth_dev_owner_new(uint64_t *owner_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Set an Ethernet device owner. + * + * @param port_id + * The identifier of the port to own. 
+ * @param owner + * The owner pointer. + * @return + * Negative errno value on error, 0 on success. + */ +int __rte_experimental rte_eth_dev_owner_set(const uint16_t port_id, + const struct rte_eth_dev_owner *owner); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Unset Ethernet device owner to make the device ownerless. + * + * @param port_id + * The identifier of port to make ownerless. + * @param owner_id + * The owner identifier. + * @return + * 0 on success, negative errno value on error. + */ +int __rte_experimental rte_eth_dev_owner_unset(const uint16_t port_id, + const uint64_t owner_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Remove owner from all Ethernet devices owned by a specific owner. + * + * @param owner_id + * The owner identifier. + */ +void __rte_experimental rte_eth_dev_owner_delete(const uint64_t owner_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Get the owner of an Ethernet device. + * + * @param port_id + * The port identifier. + * @param owner + * The owner structure pointer to fill. + * @return + * 0 on success, negative errno value on error.. + */ +int __rte_experimental rte_eth_dev_owner_get(const uint16_t port_id, + struct rte_eth_dev_owner *owner); + +/** + * Get the total number of Ethernet devices that have been successfully + * initialized by the matching Ethernet driver during the PCI probing phase + * and that are available for applications to use. These devices must be + * accessed by using the ``RTE_ETH_FOREACH_DEV()`` macro to deal with + * non-contiguous ranges of devices. + * These non-contiguous ranges can be created by calls to hotplug functions or + * by some PMDs. + * + * @return + * - The total number of usable Ethernet devices. + */ +__rte_deprecated +uint16_t rte_eth_dev_count(void); + +/** + * Get the number of ports which are usable for the application. + * + * These devices must be iterated by using the macro + * ``RTE_ETH_FOREACH_DEV`` or ``RTE_ETH_FOREACH_DEV_OWNED_BY`` + * to deal with non-contiguous ranges of devices. + * + * @return + * The count of available Ethernet devices. + */ +uint16_t rte_eth_dev_count_avail(void); + +/** + * Get the total number of ports which are allocated. + * + * Some devices may not be available for the application. + * + * @return + * The total count of Ethernet devices. + */ +uint16_t __rte_experimental rte_eth_dev_count_total(void); + +/** + * Attach a new Ethernet device specified by arguments. + * + * @param devargs + * A pointer to a strings array describing the new device + * to be attached. The strings should be a pci address like + * '0000:01:00.0' or virtual device name like 'net_pcap0'. + * @param port_id + * A pointer to a port identifier actually attached. + * @return + * 0 on success and port_id is filled, negative on error + */ +__rte_deprecated +int rte_eth_dev_attach(const char *devargs, uint16_t *port_id); + +/** + * Detach a Ethernet device specified by port identifier. + * This function must be called when the device is in the + * closed state. + * + * @param port_id + * The port identifier of the device to detach. + * @param devname + * A pointer to a buffer that will be filled with the device name. + * This buffer must be at least RTE_DEV_NAME_MAX_LEN long. 
+ * @return + * 0 on success and devname is filled, negative on error + */ +__rte_deprecated +int rte_eth_dev_detach(uint16_t port_id, char *devname); + +/** + * Convert a numerical speed in Mbps to a bitmap flag that can be used in + * the bitmap link_speeds of the struct rte_eth_conf + * + * @param speed + * Numerical speed value in Mbps + * @param duplex + * ETH_LINK_[HALF/FULL]_DUPLEX (only for 10/100M speeds) + * @return + * 0 if the speed cannot be mapped + */ +uint32_t rte_eth_speed_bitflag(uint32_t speed, int duplex); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Get DEV_RX_OFFLOAD_* flag name. + * + * @param offload + * Offload flag. + * @return + * Offload name or 'UNKNOWN' if the flag cannot be recognised. + */ +const char * __rte_experimental rte_eth_dev_rx_offload_name(uint64_t offload); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Get DEV_TX_OFFLOAD_* flag name. + * + * @param offload + * Offload flag. + * @return + * Offload name or 'UNKNOWN' if the flag cannot be recognised. + */ +const char * __rte_experimental rte_eth_dev_tx_offload_name(uint64_t offload); + +/** + * Configure an Ethernet device. + * This function must be invoked first before any other function in the + * Ethernet API. This function can also be re-invoked when a device is in the + * stopped state. + * + * @param port_id + * The port identifier of the Ethernet device to configure. + * @param nb_rx_queue + * The number of receive queues to set up for the Ethernet device. + * @param nb_tx_queue + * The number of transmit queues to set up for the Ethernet device. + * @param eth_conf + * The pointer to the configuration data to be used for the Ethernet device. + * The *rte_eth_conf* structure includes: + * - the hardware offload features to activate, with dedicated fields for + * each statically configurable offload hardware feature provided by + * Ethernet devices, such as IP checksum or VLAN tag stripping for + * example. + * The Rx offload bitfield API is obsolete and will be deprecated. + * Applications should set the ignore_bitfield_offloads bit on *rxmode* + * structure and use offloads field to set per-port offloads instead. + * - Any offloading set in eth_conf->[rt]xmode.offloads must be within + * the [rt]x_offload_capa returned from rte_eth_dev_infos_get(). + * Any type of device supported offloading set in the input argument + * eth_conf->[rt]xmode.offloads to rte_eth_dev_configure() is enabled + * on all queues and it can't be disabled in rte_eth_[rt]x_queue_setup() + * - the Receive Side Scaling (RSS) configuration when using multiple RX + * queues per port. Any RSS hash function set in eth_conf->rss_conf.rss_hf + * must be within the flow_type_rss_offloads provided by drivers via + * rte_eth_dev_infos_get() API. + * + * Embedding all configuration information in a single data structure + * is the more flexible method that allows the addition of new features + * without changing the syntax of the API. + * @return + * - 0: Success, device configured. + * - <0: Error code returned by the driver configuration function. + */ +int rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_queue, + uint16_t nb_tx_queue, const struct rte_eth_conf *eth_conf); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Check if an Ethernet device was physically removed. + * + * @param port_id + * The port identifier of the Ethernet device. 
+ * @return + * 1 when the Ethernet device is removed, otherwise 0. + */ +int __rte_experimental +rte_eth_dev_is_removed(uint16_t port_id); + +/** + * Allocate and set up a receive queue for an Ethernet device. + * + * The function allocates a contiguous block of memory for *nb_rx_desc* + * receive descriptors from a memory zone associated with *socket_id* + * and initializes each receive descriptor with a network buffer allocated + * from the memory pool *mb_pool*. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param rx_queue_id + * The index of the receive queue to set up. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param nb_rx_desc + * The number of receive descriptors to allocate for the receive ring. + * @param socket_id + * The *socket_id* argument is the socket identifier in case of NUMA. + * The value can be *SOCKET_ID_ANY* if there is no NUMA constraint for + * the DMA memory allocated for the receive descriptors of the ring. + * @param rx_conf + * The pointer to the configuration data to be used for the receive queue. + * NULL value is allowed, in which case default RX configuration + * will be used. + * The *rx_conf* structure contains an *rx_thresh* structure with the values + * of the Prefetch, Host, and Write-Back threshold registers of the receive + * ring. + * In addition it contains the hardware offloads features to activate using + * the DEV_RX_OFFLOAD_* flags. + * If an offloading set in rx_conf->offloads + * hasn't been set in the input argument eth_conf->rxmode.offloads + * to rte_eth_dev_configure(), it is a new added offloading, it must be + * per-queue type and it is enabled for the queue. + * No need to repeat any bit in rx_conf->offloads which has already been + * enabled in rte_eth_dev_configure() at port level. An offloading enabled + * at port level can't be disabled at queue level. + * @param mb_pool + * The pointer to the memory pool from which to allocate *rte_mbuf* network + * memory buffers to populate each descriptor of the receive ring. + * @return + * - 0: Success, receive queue correctly set up. + * - -EIO: if device is removed. + * - -EINVAL: The size of network buffers which can be allocated from the + * memory pool does not fit the various buffer sizes allowed by the + * device controller. + * - -ENOMEM: Unable to allocate the receive ring descriptors or to + * allocate network memory buffers from the memory pool when + * initializing receive descriptors. + */ +int rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, + uint16_t nb_rx_desc, unsigned int socket_id, + const struct rte_eth_rxconf *rx_conf, + struct rte_mempool *mb_pool); + +/** + * Allocate and set up a transmit queue for an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param tx_queue_id + * The index of the transmit queue to set up. + * The value must be in the range [0, nb_tx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param nb_tx_desc + * The number of transmit descriptors to allocate for the transmit ring. + * @param socket_id + * The *socket_id* argument is the socket identifier in case of NUMA. + * Its value can be *SOCKET_ID_ANY* if there is no NUMA constraint for + * the DMA memory allocated for the transmit descriptors of the ring. + * @param tx_conf + * The pointer to the configuration data to be used for the transmit queue. 
+ * NULL value is allowed, in which case default TX configuration + * will be used. + * The *tx_conf* structure contains the following data: + * - The *tx_thresh* structure with the values of the Prefetch, Host, and + * Write-Back threshold registers of the transmit ring. + * When setting Write-Back threshold to the value greater then zero, + * *tx_rs_thresh* value should be explicitly set to one. + * - The *tx_free_thresh* value indicates the [minimum] number of network + * buffers that must be pending in the transmit ring to trigger their + * [implicit] freeing by the driver transmit function. + * - The *tx_rs_thresh* value indicates the [minimum] number of transmit + * descriptors that must be pending in the transmit ring before setting the + * RS bit on a descriptor by the driver transmit function. + * The *tx_rs_thresh* value should be less or equal then + * *tx_free_thresh* value, and both of them should be less then + * *nb_tx_desc* - 3. + * - The *offloads* member contains Tx offloads to be enabled. + * If an offloading set in tx_conf->offloads + * hasn't been set in the input argument eth_conf->txmode.offloads + * to rte_eth_dev_configure(), it is a new added offloading, it must be + * per-queue type and it is enabled for the queue. + * No need to repeat any bit in tx_conf->offloads which has already been + * enabled in rte_eth_dev_configure() at port level. An offloading enabled + * at port level can't be disabled at queue level. + * + * Note that setting *tx_free_thresh* or *tx_rs_thresh* value to 0 forces + * the transmit function to use default values. + * @return + * - 0: Success, the transmit queue is correctly set up. + * - -ENOMEM: Unable to allocate the transmit ring descriptors. + */ +int rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id, + uint16_t nb_tx_desc, unsigned int socket_id, + const struct rte_eth_txconf *tx_conf); + +/** + * Return the NUMA socket to which an Ethernet device is connected + * + * @param port_id + * The port identifier of the Ethernet device + * @return + * The NUMA socket id to which the Ethernet device is connected or + * a default of zero if the socket could not be determined. + * -1 is returned is the port_id value is out of range. + */ +int rte_eth_dev_socket_id(uint16_t port_id); + +/** + * Check if port_id of device is attached + * + * @param port_id + * The port identifier of the Ethernet device + * @return + * - 0 if port is out of range or not attached + * - 1 if device is attached + */ +int rte_eth_dev_is_valid_port(uint16_t port_id); + +/** + * Start specified RX queue of a port. It is used when rx_deferred_start + * flag of the specified queue is true. + * + * @param port_id + * The port identifier of the Ethernet device + * @param rx_queue_id + * The index of the rx queue to update the ring. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @return + * - 0: Success, the receive queue is started. + * - -EINVAL: The port_id or the queue_id out of range. + * - -EIO: if device is removed. + * - -ENOTSUP: The function not supported in PMD driver. + */ +int rte_eth_dev_rx_queue_start(uint16_t port_id, uint16_t rx_queue_id); + +/** + * Stop specified RX queue of a port + * + * @param port_id + * The port identifier of the Ethernet device + * @param rx_queue_id + * The index of the rx queue to update the ring. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). 
+ * @return + * - 0: Success, the receive queue is stopped. + * - -EINVAL: The port_id or the queue_id out of range. + * - -EIO: if device is removed. + * - -ENOTSUP: The function not supported in PMD driver. + */ +int rte_eth_dev_rx_queue_stop(uint16_t port_id, uint16_t rx_queue_id); + +/** + * Start TX for specified queue of a port. It is used when tx_deferred_start + * flag of the specified queue is true. + * + * @param port_id + * The port identifier of the Ethernet device + * @param tx_queue_id + * The index of the tx queue to update the ring. + * The value must be in the range [0, nb_tx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @return + * - 0: Success, the transmit queue is started. + * - -EINVAL: The port_id or the queue_id out of range. + * - -EIO: if device is removed. + * - -ENOTSUP: The function not supported in PMD driver. + */ +int rte_eth_dev_tx_queue_start(uint16_t port_id, uint16_t tx_queue_id); + +/** + * Stop specified TX queue of a port + * + * @param port_id + * The port identifier of the Ethernet device + * @param tx_queue_id + * The index of the tx queue to update the ring. + * The value must be in the range [0, nb_tx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @return + * - 0: Success, the transmit queue is stopped. + * - -EINVAL: The port_id or the queue_id out of range. + * - -EIO: if device is removed. + * - -ENOTSUP: The function not supported in PMD driver. + */ +int rte_eth_dev_tx_queue_stop(uint16_t port_id, uint16_t tx_queue_id); + +/** + * Start an Ethernet device. + * + * The device start step is the last one and consists of setting the configured + * offload features and in starting the transmit and the receive units of the + * device. + * On success, all basic functions exported by the Ethernet API (link status, + * receive/transmit, and so on) can be invoked. + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - 0: Success, Ethernet device started. + * - <0: Error code of the driver device start function. + */ +int rte_eth_dev_start(uint16_t port_id); + +/** + * Stop an Ethernet device. The device can be restarted with a call to + * rte_eth_dev_start() + * + * @param port_id + * The port identifier of the Ethernet device. + */ +void rte_eth_dev_stop(uint16_t port_id); + +/** + * Link up an Ethernet device. + * + * Set device link up will re-enable the device rx/tx + * functionality after it is previously set device linked down. + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - 0: Success, Ethernet device linked up. + * - <0: Error code of the driver device link up function. + */ +int rte_eth_dev_set_link_up(uint16_t port_id); + +/** + * Link down an Ethernet device. + * The device rx/tx functionality will be disabled if success, + * and it can be re-enabled with a call to + * rte_eth_dev_set_link_up() + * + * @param port_id + * The port identifier of the Ethernet device. + */ +int rte_eth_dev_set_link_down(uint16_t port_id); + +/** + * Close a stopped Ethernet device. The device cannot be restarted! + * The function frees all resources except for needed by the + * closed state. To free these resources, call rte_eth_dev_detach(). + * + * @param port_id + * The port identifier of the Ethernet device. + */ +void rte_eth_dev_close(uint16_t port_id); + +/** + * Reset a Ethernet device and keep its port id. + * + * When a port has to be reset passively, the DPDK application can invoke + * this function. 
For example when a PF is reset, all its VFs should also + * be reset. Normally a DPDK application can invoke this function when + * RTE_ETH_EVENT_INTR_RESET event is detected, but can also use it to start + * a port reset in other circumstances. + * + * When this function is called, it first stops the port and then calls the + * PMD specific dev_uninit( ) and dev_init( ) to return the port to initial + * state, in which no Tx and Rx queues are setup, as if the port has been + * reset and not started. The port keeps the port id it had before the + * function call. + * + * After calling rte_eth_dev_reset( ), the application should use + * rte_eth_dev_configure( ), rte_eth_rx_queue_setup( ), + * rte_eth_tx_queue_setup( ), and rte_eth_dev_start( ) + * to reconfigure the device as appropriate. + * + * Note: To avoid unexpected behavior, the application should stop calling + * Tx and Rx functions before calling rte_eth_dev_reset( ). For thread + * safety, all these controlling functions should be called from the same + * thread. + * + * @param port_id + * The port identifier of the Ethernet device. + * + * @return + * - (0) if successful. + * - (-EINVAL) if port identifier is invalid. + * - (-ENOTSUP) if hardware doesn't support this function. + * - (-EPERM) if not ran from the primary process. + * - (-EIO) if re-initialisation failed or device is removed. + * - (-ENOMEM) if the reset failed due to OOM. + * - (-EAGAIN) if the reset temporarily failed and should be retried later. + */ +int rte_eth_dev_reset(uint16_t port_id); + +/** + * Enable receipt in promiscuous mode for an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + */ +void rte_eth_promiscuous_enable(uint16_t port_id); + +/** + * Disable receipt in promiscuous mode for an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + */ +void rte_eth_promiscuous_disable(uint16_t port_id); + +/** + * Return the value of promiscuous mode for an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - (1) if promiscuous is enabled + * - (0) if promiscuous is disabled. + * - (-1) on error + */ +int rte_eth_promiscuous_get(uint16_t port_id); + +/** + * Enable the receipt of any multicast frame by an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + */ +void rte_eth_allmulticast_enable(uint16_t port_id); + +/** + * Disable the receipt of all multicast frames by an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + */ +void rte_eth_allmulticast_disable(uint16_t port_id); + +/** + * Return the value of allmulticast mode for an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - (1) if allmulticast is enabled + * - (0) if allmulticast is disabled. + * - (-1) on error + */ +int rte_eth_allmulticast_get(uint16_t port_id); + +/** + * Retrieve the status (ON/OFF), the speed (in Mbps) and the mode (HALF-DUPLEX + * or FULL-DUPLEX) of the physical link of an Ethernet device. It might need + * to wait up to 9 seconds in it. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param link + * A pointer to an *rte_eth_link* structure to be filled with + * the status, the speed and the mode of the Ethernet device link. 
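+ *
+ * For illustration only (the helper name, the single-queue layout and the
+ * ring size of 512 are arbitrary placeholders, and <string.h> is assumed to
+ * be included), a minimal bring-up sequence before querying the link could
+ * look like this:
+ *
+ * @code
+ * static int
+ * example_start_port(uint16_t port_id, struct rte_mempool *mb_pool)
+ * {
+ *     struct rte_eth_conf conf;
+ *     struct rte_eth_link link;
+ *     int ret;
+ *
+ *     memset(&conf, 0, sizeof(conf));  // default configuration, no offloads
+ *     ret = rte_eth_dev_configure(port_id, 1, 1, &conf);
+ *     if (ret < 0)
+ *         return ret;
+ *     ret = rte_eth_rx_queue_setup(port_id, 0, 512, SOCKET_ID_ANY,
+ *                                  NULL, mb_pool);
+ *     if (ret < 0)
+ *         return ret;
+ *     ret = rte_eth_tx_queue_setup(port_id, 0, 512, SOCKET_ID_ANY, NULL);
+ *     if (ret < 0)
+ *         return ret;
+ *     ret = rte_eth_dev_start(port_id);
+ *     if (ret < 0)
+ *         return ret;
+ *     rte_eth_link_get(port_id, &link);  // may wait while the link trains
+ *     return link.link_status ? 0 : -1;
+ * }
+ * @endcode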
+ */ +void rte_eth_link_get(uint16_t port_id, struct rte_eth_link *link); + +/** + * Retrieve the status (ON/OFF), the speed (in Mbps) and the mode (HALF-DUPLEX + * or FULL-DUPLEX) of the physical link of an Ethernet device. It is a no-wait + * version of rte_eth_link_get(). + * + * @param port_id + * The port identifier of the Ethernet device. + * @param link + * A pointer to an *rte_eth_link* structure to be filled with + * the status, the speed and the mode of the Ethernet device link. + */ +void rte_eth_link_get_nowait(uint16_t port_id, struct rte_eth_link *link); + +/** + * Retrieve the general I/O statistics of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param stats + * A pointer to a structure of type *rte_eth_stats* to be filled with + * the values of device counters for the following set of statistics: + * - *ipackets* with the total of successfully received packets. + * - *opackets* with the total of successfully transmitted packets. + * - *ibytes* with the total of successfully received bytes. + * - *obytes* with the total of successfully transmitted bytes. + * - *ierrors* with the total of erroneous received packets. + * - *oerrors* with the total of failed transmitted packets. + * @return + * Zero if successful. Non-zero otherwise. + */ +int rte_eth_stats_get(uint16_t port_id, struct rte_eth_stats *stats); + +/** + * Reset the general I/O statistics of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - (0) if device notified to reset stats. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + */ +int rte_eth_stats_reset(uint16_t port_id); + +/** + * Retrieve names of extended statistics of an Ethernet device. + * + * There is an assumption that 'xstat_names' and 'xstats' arrays are matched + * by array index: + * xstats_names[i].name => xstats[i].value + * + * And the array index is same with id field of 'struct rte_eth_xstat': + * xstats[i].id == i + * + * This assumption makes key-value pair matching less flexible but simpler. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param xstats_names + * An rte_eth_xstat_name array of at least *size* elements to + * be filled. If set to NULL, the function returns the required number + * of elements. + * @param size + * The size of the xstats_names array (number of elements). + * @return + * - A positive value lower or equal to size: success. The return value + * is the number of entries filled in the stats table. + * - A positive value higher than size: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. + * - A negative value on error (invalid port id). + */ +int rte_eth_xstats_get_names(uint16_t port_id, + struct rte_eth_xstat_name *xstats_names, + unsigned int size); + +/** + * Retrieve extended statistics of an Ethernet device. + * + * There is an assumption that 'xstat_names' and 'xstats' arrays are matched + * by array index: + * xstats_names[i].name => xstats[i].value + * + * And the array index is same with id field of 'struct rte_eth_xstat': + * xstats[i].id == i + * + * This assumption makes key-value pair matching less flexible but simpler. + * + * @param port_id + * The port identifier of the Ethernet device. 
+ * @param xstats + * A pointer to a table of structure of type *rte_eth_xstat* + * to be filled with device statistics ids and values. + * This parameter can be set to NULL if n is 0. + * @param n + * The size of the xstats array (number of elements). + * @return + * - A positive value lower or equal to n: success. The return value + * is the number of entries filled in the stats table. + * - A positive value higher than n: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. + * - A negative value on error (invalid port id). + */ +int rte_eth_xstats_get(uint16_t port_id, struct rte_eth_xstat *xstats, + unsigned int n); + +/** + * Retrieve names of extended statistics of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param xstats_names + * An rte_eth_xstat_name array of at least *size* elements to + * be filled. If set to NULL, the function returns the required number + * of elements. + * @param ids + * IDs array given by app to retrieve specific statistics + * @param size + * The size of the xstats_names array (number of elements). + * @return + * - A positive value lower or equal to size: success. The return value + * is the number of entries filled in the stats table. + * - A positive value higher than size: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. + * - A negative value on error (invalid port id). + */ +int +rte_eth_xstats_get_names_by_id(uint16_t port_id, + struct rte_eth_xstat_name *xstats_names, unsigned int size, + uint64_t *ids); + +/** + * Retrieve extended statistics of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param ids + * A pointer to an ids array passed by application. This tells which + * statistics values function should retrieve. This parameter + * can be set to NULL if size is 0. In this case function will retrieve + * all avalible statistics. + * @param values + * A pointer to a table to be filled with device statistics values. + * @param size + * The size of the ids array (number of elements). + * @return + * - A positive value lower or equal to size: success. The return value + * is the number of entries filled in the stats table. + * - A positive value higher than size: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. + * - A negative value on error (invalid port id). + */ +int rte_eth_xstats_get_by_id(uint16_t port_id, const uint64_t *ids, + uint64_t *values, unsigned int size); + +/** + * Gets the ID of a statistic from its name. + * + * This function searches for the statistics using string compares, and + * as such should not be used on the fast-path. For fast-path retrieval of + * specific statistics, store the ID as provided in *id* from this function, + * and pass the ID to rte_eth_xstats_get() + * + * @param port_id The port to look up statistics from + * @param xstat_name The name of the statistic to return + * @param[out] id A pointer to an app-supplied uint64_t which should be + * set to the ID of the stat if the stat exists. 
+ * @return + * 0 on success + * -ENODEV for invalid port_id, + * -EIO if device is removed, + * -EINVAL if the xstat_name doesn't exist in port_id + */ +int rte_eth_xstats_get_id_by_name(uint16_t port_id, const char *xstat_name, + uint64_t *id); + +/** + * Reset extended statistics of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + */ +void rte_eth_xstats_reset(uint16_t port_id); + +/** + * Set a mapping for the specified transmit queue to the specified per-queue + * statistics counter. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param tx_queue_id + * The index of the transmit queue for which a queue stats mapping is required. + * The value must be in the range [0, nb_tx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param stat_idx + * The per-queue packet statistics functionality number that the transmit + * queue is to be assigned. + * The value must be in the range [0, RTE_ETHDEV_QUEUE_STAT_CNTRS - 1]. + * @return + * Zero if successful. Non-zero otherwise. + */ +int rte_eth_dev_set_tx_queue_stats_mapping(uint16_t port_id, + uint16_t tx_queue_id, uint8_t stat_idx); + +/** + * Set a mapping for the specified receive queue to the specified per-queue + * statistics counter. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param rx_queue_id + * The index of the receive queue for which a queue stats mapping is required. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param stat_idx + * The per-queue packet statistics functionality number that the receive + * queue is to be assigned. + * The value must be in the range [0, RTE_ETHDEV_QUEUE_STAT_CNTRS - 1]. + * @return + * Zero if successful. Non-zero otherwise. + */ +int rte_eth_dev_set_rx_queue_stats_mapping(uint16_t port_id, + uint16_t rx_queue_id, + uint8_t stat_idx); + +/** + * Retrieve the Ethernet address of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param mac_addr + * A pointer to a structure of type *ether_addr* to be filled with + * the Ethernet address of the Ethernet device. + */ +void rte_eth_macaddr_get(uint16_t port_id, struct ether_addr *mac_addr); + +/** + * Retrieve the contextual information of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param dev_info + * A pointer to a structure of type *rte_eth_dev_info* to be filled with + * the contextual information of the Ethernet device. + */ +void rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info); + +/** + * Retrieve the firmware version of a device. + * + * @param port_id + * The port identifier of the device. + * @param fw_version + * A pointer to a string array storing the firmware version of a device, + * the string includes terminating null. This pointer is allocated by caller. + * @param fw_size + * The size of the string array pointed by fw_version, which should be + * large enough to store firmware version of the device. + * @return + * - (0) if successful. + * - (-ENOTSUP) if operation is not supported. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (>0) if *fw_size* is not enough to store firmware version, return + * the size of the non truncated string. + */ +int rte_eth_dev_fw_version_get(uint16_t port_id, + char *fw_version, size_t fw_size); + +/** + * Retrieve the supported packet types of an Ethernet device. 
+ * + * When a packet type is announced as supported, it *must* be recognized by + * the PMD. For instance, if RTE_PTYPE_L2_ETHER, RTE_PTYPE_L2_ETHER_VLAN + * and RTE_PTYPE_L3_IPV4 are announced, the PMD must return the following + * packet types for these packets: + * - Ether/IPv4 -> RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 + * - Ether/Vlan/IPv4 -> RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_L3_IPV4 + * - Ether/[anything else] -> RTE_PTYPE_L2_ETHER + * - Ether/Vlan/[anything else] -> RTE_PTYPE_L2_ETHER_VLAN + * + * When a packet is received by a PMD, the most precise type must be + * returned among the ones supported. However a PMD is allowed to set + * packet type that is not in the supported list, at the condition that it + * is more precise. Therefore, a PMD announcing no supported packet types + * can still set a matching packet type in a received packet. + * + * @note + * Better to invoke this API after the device is already started or rx burst + * function is decided, to obtain correct supported ptypes. + * @note + * if a given PMD does not report what ptypes it supports, then the supported + * ptype count is reported as 0. + * @param port_id + * The port identifier of the Ethernet device. + * @param ptype_mask + * A hint of what kind of packet type which the caller is interested in. + * @param ptypes + * An array pointer to store adequate packet types, allocated by caller. + * @param num + * Size of the array pointed by param ptypes. + * @return + * - (>=0) Number of supported ptypes. If the number of types exceeds num, + * only num entries will be filled into the ptypes array, but the full + * count of supported ptypes will be returned. + * - (-ENODEV) if *port_id* invalid. + */ +int rte_eth_dev_get_supported_ptypes(uint16_t port_id, uint32_t ptype_mask, + uint32_t *ptypes, int num); + +/** + * Retrieve the MTU of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param mtu + * A pointer to a uint16_t where the retrieved MTU is to be stored. + * @return + * - (0) if successful. + * - (-ENODEV) if *port_id* invalid. + */ +int rte_eth_dev_get_mtu(uint16_t port_id, uint16_t *mtu); + +/** + * Change the MTU of an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param mtu + * A uint16_t for the MTU to be applied. + * @return + * - (0) if successful. + * - (-ENOTSUP) if operation is not supported. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (-EINVAL) if *mtu* invalid. + * - (-EBUSY) if operation is not allowed when the port is running + */ +int rte_eth_dev_set_mtu(uint16_t port_id, uint16_t mtu); + +/** + * Enable/Disable hardware filtering by an Ethernet device of received + * VLAN packets tagged with a given VLAN Tag Identifier. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param vlan_id + * The VLAN Tag Identifier whose filtering must be enabled or disabled. + * @param on + * If > 0, enable VLAN filtering of VLAN packets tagged with *vlan_id*. + * Otherwise, disable VLAN filtering of VLAN packets tagged with *vlan_id*. + * @return + * - (0) if successful. + * - (-ENOSUP) if hardware-assisted VLAN filtering not configured. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (-ENOSYS) if VLAN filtering on *port_id* disabled. + * - (-EINVAL) if *vlan_id* > 4095. 
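+ *
+ * Minimal illustration (the helper name and the VLAN IDs are placeholders;
+ * the port is assumed to have been configured with
+ * DEV_RX_OFFLOAD_VLAN_FILTER in rxmode.offloads):
+ *
+ * @code
+ * static int
+ * example_accept_vlans(uint16_t port_id)
+ * {
+ *     int ret;
+ *
+ *     ret = rte_eth_dev_vlan_filter(port_id, 100, 1);  // accept VLAN 100
+ *     if (ret == 0)
+ *         ret = rte_eth_dev_vlan_filter(port_id, 200, 1);  // and VLAN 200
+ *     return ret;
+ * }
+ * @endcode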
+ */ +int rte_eth_dev_vlan_filter(uint16_t port_id, uint16_t vlan_id, int on); + +/** + * Enable/Disable hardware VLAN Strip by a rx queue of an Ethernet device. + * 82599/X540/X550 can support VLAN stripping at the rx queue level + * + * @param port_id + * The port identifier of the Ethernet device. + * @param rx_queue_id + * The index of the receive queue for which a queue stats mapping is required. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param on + * If 1, Enable VLAN Stripping of the receive queue of the Ethernet port. + * If 0, Disable VLAN Stripping of the receive queue of the Ethernet port. + * @return + * - (0) if successful. + * - (-ENOSUP) if hardware-assisted VLAN stripping not configured. + * - (-ENODEV) if *port_id* invalid. + * - (-EINVAL) if *rx_queue_id* invalid. + */ +int rte_eth_dev_set_vlan_strip_on_queue(uint16_t port_id, uint16_t rx_queue_id, + int on); + +/** + * Set the Outer VLAN Ether Type by an Ethernet device, it can be inserted to + * the VLAN Header. This is a register setup available on some Intel NIC, not + * but all, please check the data sheet for availability. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param vlan_type + * The vlan type. + * @param tag_type + * The Tag Protocol ID + * @return + * - (0) if successful. + * - (-ENOSUP) if hardware-assisted VLAN TPID setup is not supported. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + */ +int rte_eth_dev_set_vlan_ether_type(uint16_t port_id, + enum rte_vlan_type vlan_type, + uint16_t tag_type); + +/** + * Set VLAN offload configuration on an Ethernet device + * Enable/Disable Extended VLAN by an Ethernet device, This is a register setup + * available on some Intel NIC, not but all, please check the data sheet for + * availability. + * Enable/Disable VLAN Strip can be done on rx queue for certain NIC, but here + * the configuration is applied on the port level. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param offload_mask + * The VLAN Offload bit mask can be mixed use with "OR" + * ETH_VLAN_STRIP_OFFLOAD + * ETH_VLAN_FILTER_OFFLOAD + * ETH_VLAN_EXTEND_OFFLOAD + * @return + * - (0) if successful. + * - (-ENOSUP) if hardware-assisted VLAN filtering not configured. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + */ +int rte_eth_dev_set_vlan_offload(uint16_t port_id, int offload_mask); + +/** + * Read VLAN Offload configuration from an Ethernet device + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - (>0) if successful. Bit mask to indicate + * ETH_VLAN_STRIP_OFFLOAD + * ETH_VLAN_FILTER_OFFLOAD + * ETH_VLAN_EXTEND_OFFLOAD + * - (-ENODEV) if *port_id* invalid. + */ +int rte_eth_dev_get_vlan_offload(uint16_t port_id); + +/** + * Set port based TX VLAN insertion on or off. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param pvid + * Port based TX VLAN identifier together with user priority. + * @param on + * Turn on or off the port based TX VLAN insertion. + * + * @return + * - (0) if successful. + * - negative if failed. 
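+ *
+ * As a sketch only (the helper name is a placeholder), port based insertion
+ * is typically combined with the port-level VLAN offload accessors
+ * documented above, using a read-modify-write of the offload mask:
+ *
+ * @code
+ * static int
+ * example_strip_and_insert_pvid(uint16_t port_id, uint16_t pvid)
+ * {
+ *     int mask = rte_eth_dev_get_vlan_offload(port_id);
+ *     int ret;
+ *
+ *     if (mask < 0)
+ *         return mask;
+ *     ret = rte_eth_dev_set_vlan_offload(port_id,
+ *                                        mask | ETH_VLAN_STRIP_OFFLOAD);
+ *     if (ret != 0)
+ *         return ret;
+ *     return rte_eth_dev_set_vlan_pvid(port_id, pvid, 1);
+ * }
+ * @endcode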
+ */ +int rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int on); + +typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count, + void *userdata); + +/** + * Structure used to buffer packets for future TX + * Used by APIs rte_eth_tx_buffer and rte_eth_tx_buffer_flush + */ +struct rte_eth_dev_tx_buffer { + buffer_tx_error_fn error_callback; + void *error_userdata; + uint16_t size; /**< Size of buffer for buffered tx */ + uint16_t length; /**< Number of packets in the array */ + struct rte_mbuf *pkts[]; + /**< Pending packets to be sent on explicit flush or when full */ +}; + +/** + * Calculate the size of the tx buffer. + * + * @param sz + * Number of stored packets. + */ +#define RTE_ETH_TX_BUFFER_SIZE(sz) \ + (sizeof(struct rte_eth_dev_tx_buffer) + (sz) * sizeof(struct rte_mbuf *)) + +/** + * Initialize default values for buffered transmitting + * + * @param buffer + * Tx buffer to be initialized. + * @param size + * Buffer size + * @return + * 0 if no error + */ +int +rte_eth_tx_buffer_init(struct rte_eth_dev_tx_buffer *buffer, uint16_t size); + +/** + * Configure a callback for buffered packets which cannot be sent + * + * Register a specific callback to be called when an attempt is made to send + * all packets buffered on an ethernet port, but not all packets can + * successfully be sent. The callback registered here will be called only + * from calls to rte_eth_tx_buffer() and rte_eth_tx_buffer_flush() APIs. + * The default callback configured for each queue by default just frees the + * packets back to the calling mempool. If additional behaviour is required, + * for example, to count dropped packets, or to retry transmission of packets + * which cannot be sent, this function should be used to register a suitable + * callback function to implement the desired behaviour. + * The example callback "rte_eth_count_unsent_packet_callback()" is also + * provided as reference. + * + * @param buffer + * The port identifier of the Ethernet device. + * @param callback + * The function to be used as the callback. + * @param userdata + * Arbitrary parameter to be passed to the callback function + * @return + * 0 on success, or -1 on error with rte_errno set appropriately + */ +int +rte_eth_tx_buffer_set_err_callback(struct rte_eth_dev_tx_buffer *buffer, + buffer_tx_error_fn callback, void *userdata); + +/** + * Callback function for silently dropping unsent buffered packets. + * + * This function can be passed to rte_eth_tx_buffer_set_err_callback() to + * adjust the default behavior when buffered packets cannot be sent. This + * function drops any unsent packets silently and is used by tx buffered + * operations as default behavior. + * + * NOTE: this function should not be called directly, instead it should be used + * as a callback for packet buffering. + * + * NOTE: when configuring this function as a callback with + * rte_eth_tx_buffer_set_err_callback(), the final, userdata parameter + * should point to an uint64_t value. + * + * @param pkts + * The previously buffered packets which could not be sent + * @param unsent + * The number of unsent packets in the pkts array + * @param userdata + * Not used + */ +void +rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t unsent, + void *userdata); + +/** + * Callback function for tracking unsent buffered packets. + * + * This function can be passed to rte_eth_tx_buffer_set_err_callback() to + * adjust the default behavior when buffered packets cannot be sent. 
This + * function drops any unsent packets, but also updates a user-supplied counter + * to track the overall number of packets dropped. The counter should be an + * uint64_t variable. + * + * NOTE: this function should not be called directly, instead it should be used + * as a callback for packet buffering. + * + * NOTE: when configuring this function as a callback with + * rte_eth_tx_buffer_set_err_callback(), the final, userdata parameter + * should point to an uint64_t value. + * + * @param pkts + * The previously buffered packets which could not be sent + * @param unsent + * The number of unsent packets in the pkts array + * @param userdata + * Pointer to an uint64_t value, which will be incremented by unsent + */ +void +rte_eth_tx_buffer_count_callback(struct rte_mbuf **pkts, uint16_t unsent, + void *userdata); + +/** + * Request the driver to free mbufs currently cached by the driver. The + * driver will only free the mbuf if it is no longer in use. It is the + * application's responsibity to ensure rte_eth_tx_buffer_flush(..) is + * called if needed. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The index of the transmit queue through which output packets must be + * sent. + * The value must be in the range [0, nb_tx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param free_cnt + * Maximum number of packets to free. Use 0 to indicate all possible packets + * should be freed. Note that a packet may be using multiple mbufs. + * @return + * Failure: < 0 + * -ENODEV: Invalid interface + * -EIO: device is removed + * -ENOTSUP: Driver does not support function + * Success: >= 0 + * 0-n: Number of packets freed. More packets may still remain in ring that + * are in use. + */ +int +rte_eth_tx_done_cleanup(uint16_t port_id, uint16_t queue_id, uint32_t free_cnt); + +/** + * Subtypes for IPsec offload event(@ref RTE_ETH_EVENT_IPSEC) raised by + * eth device. + */ +enum rte_eth_event_ipsec_subtype { + RTE_ETH_EVENT_IPSEC_UNKNOWN = 0, + /**< Unknown event type */ + RTE_ETH_EVENT_IPSEC_ESN_OVERFLOW, + /**< Sequence number overflow */ + RTE_ETH_EVENT_IPSEC_SA_TIME_EXPIRY, + /**< Soft time expiry of SA */ + RTE_ETH_EVENT_IPSEC_SA_BYTE_EXPIRY, + /**< Soft byte expiry of SA */ + RTE_ETH_EVENT_IPSEC_MAX + /**< Max value of this enum */ +}; + +/** + * Descriptor for @ref RTE_ETH_EVENT_IPSEC event. Used by eth dev to send extra + * information of the IPsec offload event. + */ +struct rte_eth_event_ipsec_desc { + enum rte_eth_event_ipsec_subtype subtype; + /**< Type of RTE_ETH_EVENT_IPSEC_* event */ + uint64_t metadata; + /**< Event specific metadata + * + * For the following events, *userdata* registered + * with the *rte_security_session* would be returned + * as metadata, + * + * - @ref RTE_ETH_EVENT_IPSEC_ESN_OVERFLOW + * - @ref RTE_ETH_EVENT_IPSEC_SA_TIME_EXPIRY + * - @ref RTE_ETH_EVENT_IPSEC_SA_BYTE_EXPIRY + * + * @see struct rte_security_session_conf + * + */ +}; + +/** + * The eth device event type for interrupt, and maybe others in the future. 
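+ *
+ * A hedged sketch of a handler matching the rte_eth_dev_cb_fn prototype
+ * declared below; the handler name and the printf-based reporting are purely
+ * illustrative, and registration is done with rte_eth_dev_callback_register(),
+ * documented further below:
+ *
+ * @code
+ * static int
+ * link_event_cb(uint16_t port_id, enum rte_eth_event_type event,
+ *               void *cb_arg, void *ret_param)
+ * {
+ *         RTE_SET_USED(cb_arg);
+ *         RTE_SET_USED(ret_param);
+ *         if (event == RTE_ETH_EVENT_INTR_LSC)
+ *                 printf("port %u: link status changed\n", port_id);
+ *         return 0;
+ * }
+ *
+ * // typically registered before rte_eth_dev_start():
+ * // rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_LSC,
+ * //                               link_event_cb, NULL);
+ * @endcode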
+ */ +enum rte_eth_event_type { + RTE_ETH_EVENT_UNKNOWN, /**< unknown event type */ + RTE_ETH_EVENT_INTR_LSC, /**< lsc interrupt event */ + RTE_ETH_EVENT_QUEUE_STATE, + /**< queue state event (enabled/disabled) */ + RTE_ETH_EVENT_INTR_RESET, + /**< reset interrupt event, sent to VF on PF reset */ + RTE_ETH_EVENT_VF_MBOX, /**< message from the VF received by PF */ + RTE_ETH_EVENT_MACSEC, /**< MACsec offload related event */ + RTE_ETH_EVENT_INTR_RMV, /**< device removal event */ + RTE_ETH_EVENT_NEW, /**< port is probed */ + RTE_ETH_EVENT_DESTROY, /**< port is released */ + RTE_ETH_EVENT_IPSEC, /**< IPsec offload related event */ + RTE_ETH_EVENT_MAX /**< max value of this enum */ +}; + +typedef int (*rte_eth_dev_cb_fn)(uint16_t port_id, + enum rte_eth_event_type event, void *cb_arg, void *ret_param); +/**< user application callback to be registered for interrupts */ + +/** + * Register a callback function for port event. + * + * @param port_id + * Port id. + * RTE_ETH_ALL means register the event for all port ids. + * @param event + * Event interested. + * @param cb_fn + * User supplied callback function to be called. + * @param cb_arg + * Pointer to the parameters for the registered callback. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_eth_dev_callback_register(uint16_t port_id, + enum rte_eth_event_type event, + rte_eth_dev_cb_fn cb_fn, void *cb_arg); + +/** + * Unregister a callback function for port event. + * + * @param port_id + * Port id. + * RTE_ETH_ALL means unregister the event for all port ids. + * @param event + * Event interested. + * @param cb_fn + * User supplied callback function to be called. + * @param cb_arg + * Pointer to the parameters for the registered callback. -1 means to + * remove all for the same callback address and same event. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_eth_dev_callback_unregister(uint16_t port_id, + enum rte_eth_event_type event, + rte_eth_dev_cb_fn cb_fn, void *cb_arg); + +/** + * When there is no rx packet coming in Rx Queue for a long time, we can + * sleep lcore related to RX Queue for power saving, and enable rx interrupt + * to be triggered when Rx packet arrives. + * + * The rte_eth_dev_rx_intr_enable() function enables rx queue + * interrupt on specific rx queue of a port. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The index of the receive queue from which to retrieve input packets. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @return + * - (0) if successful. + * - (-ENOTSUP) if underlying hardware OR driver doesn't support + * that operation. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + */ +int rte_eth_dev_rx_intr_enable(uint16_t port_id, uint16_t queue_id); + +/** + * When lcore wakes up from rx interrupt indicating packet coming, disable rx + * interrupt and returns to polling mode. + * + * The rte_eth_dev_rx_intr_disable() function disables rx queue + * interrupt on specific rx queue of a port. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The index of the receive queue from which to retrieve input packets. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @return + * - (0) if successful. + * - (-ENOTSUP) if underlying hardware OR driver doesn't support + * that operation. 
+ * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + */ +int rte_eth_dev_rx_intr_disable(uint16_t port_id, uint16_t queue_id); + +/** + * RX Interrupt control per port. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param epfd + * Epoll instance fd which the intr vector associated to. + * Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance. + * @param op + * The operation be performed for the vector. + * Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}. + * @param data + * User raw data. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_eth_dev_rx_intr_ctl(uint16_t port_id, int epfd, int op, void *data); + +/** + * RX Interrupt control per queue. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The index of the receive queue from which to retrieve input packets. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param epfd + * Epoll instance fd which the intr vector associated to. + * Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance. + * @param op + * The operation be performed for the vector. + * Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}. + * @param data + * User raw data. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int rte_eth_dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id, + int epfd, int op, void *data); + +/** + * Turn on the LED on the Ethernet device. + * This function turns on the LED on the Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - (0) if successful. + * - (-ENOTSUP) if underlying hardware OR driver doesn't support + * that operation. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + */ +int rte_eth_led_on(uint16_t port_id); + +/** + * Turn off the LED on the Ethernet device. + * This function turns off the LED on the Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - (0) if successful. + * - (-ENOTSUP) if underlying hardware OR driver doesn't support + * that operation. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + */ +int rte_eth_led_off(uint16_t port_id); + +/** + * Get current status of the Ethernet link flow control for Ethernet device + * + * @param port_id + * The port identifier of the Ethernet device. + * @param fc_conf + * The pointer to the structure where to store the flow control parameters. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support flow control. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + */ +int rte_eth_dev_flow_ctrl_get(uint16_t port_id, + struct rte_eth_fc_conf *fc_conf); + +/** + * Configure the Ethernet link flow control for Ethernet device + * + * @param port_id + * The port identifier of the Ethernet device. + * @param fc_conf + * The pointer to the structure of the flow control parameters. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support flow control mode. + * - (-ENODEV) if *port_id* invalid. + * - (-EINVAL) if bad parameter + * - (-EIO) if flow control setup failure or device is removed. + */ +int rte_eth_dev_flow_ctrl_set(uint16_t port_id, + struct rte_eth_fc_conf *fc_conf); + +/** + * Configure the Ethernet priority flow control under DCB environment + * for Ethernet device. 
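+ *
+ * A read-modify-write sketch for the link-level flow control API documented
+ * above (not specific to priority flow control; the port identifier is
+ * hypothetical and the chosen mode is only an example):
+ *
+ * @code
+ * struct rte_eth_fc_conf fc_conf;
+ *
+ * memset(&fc_conf, 0, sizeof(fc_conf));
+ * if (rte_eth_dev_flow_ctrl_get(port_id, &fc_conf) == 0) {
+ *         fc_conf.mode = RTE_FC_FULL;   // generate and honour PAUSE frames
+ *         (void)rte_eth_dev_flow_ctrl_set(port_id, &fc_conf);
+ * }
+ * @endcode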
+ * + * @param port_id + * The port identifier of the Ethernet device. + * @param pfc_conf + * The pointer to the structure of the priority flow control parameters. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support priority flow control mode. + * - (-ENODEV) if *port_id* invalid. + * - (-EINVAL) if bad parameter + * - (-EIO) if flow control setup failure or device is removed. + */ +int rte_eth_dev_priority_flow_ctrl_set(uint16_t port_id, + struct rte_eth_pfc_conf *pfc_conf); + +/** + * Add a MAC address to an internal array of addresses used to enable whitelist + * filtering to accept packets only if the destination MAC address matches. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param mac_addr + * The MAC address to add. + * @param pool + * VMDq pool index to associate address with (if VMDq is enabled). If VMDq is + * not enabled, this should be set to 0. + * @return + * - (0) if successfully added or *mac_addr* was already added. + * - (-ENOTSUP) if hardware doesn't support this feature. + * - (-ENODEV) if *port* is invalid. + * - (-EIO) if device is removed. + * - (-ENOSPC) if no more MAC addresses can be added. + * - (-EINVAL) if MAC address is invalid. + */ +int rte_eth_dev_mac_addr_add(uint16_t port_id, struct ether_addr *mac_addr, + uint32_t pool); + +/** + * Remove a MAC address from the internal array of addresses. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param mac_addr + * MAC address to remove. + * @return + * - (0) if successful, or *mac_addr* didn't exist. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port* invalid. + * - (-EADDRINUSE) if attempting to remove the default MAC address + */ +int rte_eth_dev_mac_addr_remove(uint16_t port_id, struct ether_addr *mac_addr); + +/** + * Set the default MAC address. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param mac_addr + * New default MAC address. + * @return + * - (0) if successful, or *mac_addr* didn't exist. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port* invalid. + * - (-EINVAL) if MAC address is invalid. + */ +int rte_eth_dev_default_mac_addr_set(uint16_t port_id, + struct ether_addr *mac_addr); + +/** + * Update Redirection Table(RETA) of Receive Side Scaling of Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param reta_conf + * RETA to update. + * @param reta_size + * Redirection table size. The table size can be queried by + * rte_eth_dev_info_get(). + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-EINVAL) if bad parameter. + * - (-EIO) if device is removed. + */ +int rte_eth_dev_rss_reta_update(uint16_t port_id, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); + + /** + * Query Redirection Table(RETA) of Receive Side Scaling of Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param reta_conf + * RETA to query. + * @param reta_size + * Redirection table size. The table size can be queried by + * rte_eth_dev_info_get(). + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-EINVAL) if bad parameter. + * - (-EIO) if device is removed. 
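+ *
+ * A hedged sketch of a full RETA update that spreads all entries across the
+ * configured Rx queues; the queue count and table size below are illustrative,
+ * and a real application would take reta_size from rte_eth_dev_info_get():
+ *
+ * @code
+ * struct rte_eth_rss_reta_entry64 reta_conf[4];   // covers reta_size <= 256
+ * uint16_t reta_size = 128;                       // hypothetical table size
+ * uint16_t nb_rx_queues = 4;                      // hypothetical queue count
+ * uint16_t i;
+ *
+ * memset(reta_conf, 0, sizeof(reta_conf));
+ * for (i = 0; i < reta_size; i++) {
+ *         reta_conf[i / RTE_RETA_GROUP_SIZE].mask = UINT64_MAX;
+ *         reta_conf[i / RTE_RETA_GROUP_SIZE].reta[i % RTE_RETA_GROUP_SIZE] =
+ *                 i % nb_rx_queues;
+ * }
+ * (void)rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size);
+ * @endcode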
+ */ +int rte_eth_dev_rss_reta_query(uint16_t port_id, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); + + /** + * Updates unicast hash table for receiving packet with the given destination + * MAC address, and the packet is routed to all VFs for which the RX mode is + * accept packets that match the unicast hash table. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param addr + * Unicast MAC address. + * @param on + * 1 - Set an unicast hash bit for receiving packets with the MAC address. + * 0 - Clear an unicast hash bit. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (-EINVAL) if bad parameter. + */ +int rte_eth_dev_uc_hash_table_set(uint16_t port_id, struct ether_addr *addr, + uint8_t on); + + /** + * Updates all unicast hash bitmaps for receiving packet with any Unicast + * Ethernet MAC addresses,the packet is routed to all VFs for which the RX + * mode is accept packets that match the unicast hash table. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param on + * 1 - Set all unicast hash bitmaps for receiving all the Ethernet + * MAC addresses + * 0 - Clear all unicast hash bitmaps + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (-EINVAL) if bad parameter. + */ +int rte_eth_dev_uc_all_hash_table_set(uint16_t port_id, uint8_t on); + +/** + * Set a traffic mirroring rule on an Ethernet device + * + * @param port_id + * The port identifier of the Ethernet device. + * @param mirror_conf + * The pointer to the traffic mirroring structure describing the mirroring rule. + * The *rte_eth_vm_mirror_conf* structure includes the type of mirroring rule, + * destination pool and the value of rule if enable vlan or pool mirroring. + * + * @param rule_id + * The index of traffic mirroring rule, we support four separated rules. + * @param on + * 1 - Enable a mirroring rule. + * 0 - Disable a mirroring rule. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support this feature. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (-EINVAL) if the mr_conf information is not correct. + */ +int rte_eth_mirror_rule_set(uint16_t port_id, + struct rte_eth_mirror_conf *mirror_conf, + uint8_t rule_id, + uint8_t on); + +/** + * Reset a traffic mirroring rule on an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param rule_id + * The index of traffic mirroring rule, we support four separated rules. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support this feature. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (-EINVAL) if bad parameter. + */ +int rte_eth_mirror_rule_reset(uint16_t port_id, + uint8_t rule_id); + +/** + * Set the rate limitation for a queue on an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_idx + * The queue id. + * @param tx_rate + * The tx rate in Mbps. Allocated from the total port link speed. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support this feature. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (-EINVAL) if bad parameter. 
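+ *
+ * A minimal sketch (queue index and rate are hypothetical; support for
+ * per-queue rate limiting depends on the NIC and the PMD):
+ *
+ * @code
+ * int ret = rte_eth_set_queue_rate_limit(port_id, 0, 100); // cap Tx queue 0 at 100 Mbps
+ *
+ * if (ret == -ENOTSUP)
+ *         printf("port %u: per-queue rate limiting not supported\n", port_id);
+ * @endcode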
+ */ +int rte_eth_set_queue_rate_limit(uint16_t port_id, uint16_t queue_idx, + uint16_t tx_rate); + + /** + * Configuration of Receive Side Scaling hash computation of Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param rss_conf + * The new configuration to use for RSS hash computation on the port. + * @return + * - (0) if successful. + * - (-ENODEV) if port identifier is invalid. + * - (-EIO) if device is removed. + * - (-ENOTSUP) if hardware doesn't support. + * - (-EINVAL) if bad parameter. + */ +int rte_eth_dev_rss_hash_update(uint16_t port_id, + struct rte_eth_rss_conf *rss_conf); + + /** + * Retrieve current configuration of Receive Side Scaling hash computation + * of Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param rss_conf + * Where to store the current RSS hash configuration of the Ethernet device. + * @return + * - (0) if successful. + * - (-ENODEV) if port identifier is invalid. + * - (-EIO) if device is removed. + * - (-ENOTSUP) if hardware doesn't support RSS. + */ +int +rte_eth_dev_rss_hash_conf_get(uint16_t port_id, + struct rte_eth_rss_conf *rss_conf); + + /** + * Add UDP tunneling port for a specific type of tunnel. + * The packets with this UDP port will be identified as this type of tunnel. + * Before enabling any offloading function for a tunnel, users can call this API + * to change or add more UDP port for the tunnel. So the offloading function + * can take effect on the packets with the specific UDP port. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param tunnel_udp + * UDP tunneling configuration. + * + * @return + * - (0) if successful. + * - (-ENODEV) if port identifier is invalid. + * - (-EIO) if device is removed. + * - (-ENOTSUP) if hardware doesn't support tunnel type. + */ +int +rte_eth_dev_udp_tunnel_port_add(uint16_t port_id, + struct rte_eth_udp_tunnel *tunnel_udp); + + /** + * Delete UDP tunneling port a specific type of tunnel. + * The packets with this UDP port will not be identified as this type of tunnel + * any more. + * Before enabling any offloading function for a tunnel, users can call this API + * to delete a UDP port for the tunnel. So the offloading function will not take + * effect on the packets with the specific UDP port. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param tunnel_udp + * UDP tunneling configuration. + * + * @return + * - (0) if successful. + * - (-ENODEV) if port identifier is invalid. + * - (-EIO) if device is removed. + * - (-ENOTSUP) if hardware doesn't support tunnel type. + */ +int +rte_eth_dev_udp_tunnel_port_delete(uint16_t port_id, + struct rte_eth_udp_tunnel *tunnel_udp); + +/** + * Check whether the filter type is supported on an Ethernet device. + * All the supported filter types are defined in 'rte_eth_ctrl.h'. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param filter_type + * Filter type. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support this filter type. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + */ +int rte_eth_dev_filter_supported(uint16_t port_id, + enum rte_filter_type filter_type); + +/** + * Take operations to assigned filter type on an Ethernet device. + * All the supported operations and filter types are defined in 'rte_eth_ctrl.h'. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param filter_type + * Filter type. 
+ * @param filter_op + * Type of operation. + * @param arg + * A pointer to arguments defined specifically for the operation. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - others depends on the specific operations implementation. + */ +int rte_eth_dev_filter_ctrl(uint16_t port_id, enum rte_filter_type filter_type, + enum rte_filter_op filter_op, void *arg); + +/** + * Get DCB information on an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param dcb_info + * dcb information. + * @return + * - (0) if successful. + * - (-ENODEV) if port identifier is invalid. + * - (-EIO) if device is removed. + * - (-ENOTSUP) if hardware doesn't support. + */ +int rte_eth_dev_get_dcb_info(uint16_t port_id, + struct rte_eth_dcb_info *dcb_info); + +struct rte_eth_rxtx_callback; + +/** + * Add a callback to be called on packet RX on a given port and queue. + * + * This API configures a function to be called for each burst of + * packets received on a given NIC port queue. The return value is a pointer + * that can be used to later remove the callback using + * rte_eth_remove_rx_callback(). + * + * Multiple functions are called in the order that they are added. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue on the Ethernet device on which the callback is to be added. + * @param fn + * The callback function + * @param user_param + * A generic pointer parameter which will be passed to each invocation of the + * callback function on this port and queue. + * + * @return + * NULL on error. + * On success, a pointer value which can later be used to remove the callback. + */ +const struct rte_eth_rxtx_callback * +rte_eth_add_rx_callback(uint16_t port_id, uint16_t queue_id, + rte_rx_callback_fn fn, void *user_param); + +/** + * Add a callback that must be called first on packet RX on a given port + * and queue. + * + * This API configures a first function to be called for each burst of + * packets received on a given NIC port queue. The return value is a pointer + * that can be used to later remove the callback using + * rte_eth_remove_rx_callback(). + * + * Multiple functions are called in the order that they are added. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue on the Ethernet device on which the callback is to be added. + * @param fn + * The callback function + * @param user_param + * A generic pointer parameter which will be passed to each invocation of the + * callback function on this port and queue. + * + * @return + * NULL on error. + * On success, a pointer value which can later be used to remove the callback. + */ +const struct rte_eth_rxtx_callback * +rte_eth_add_first_rx_callback(uint16_t port_id, uint16_t queue_id, + rte_rx_callback_fn fn, void *user_param); + +/** + * Add a callback to be called on packet TX on a given port and queue. + * + * This API configures a function to be called for each burst of + * packets sent on a given NIC port queue. The return value is a pointer + * that can be used to later remove the callback using + * rte_eth_remove_tx_callback(). + * + * Multiple functions are called in the order that they are added. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue on the Ethernet device on which the callback is to be added. 
+ * @param fn + * The callback function + * @param user_param + * A generic pointer parameter which will be passed to each invocation of the + * callback function on this port and queue. + * + * @return + * NULL on error. + * On success, a pointer value which can later be used to remove the callback. + */ +const struct rte_eth_rxtx_callback * +rte_eth_add_tx_callback(uint16_t port_id, uint16_t queue_id, + rte_tx_callback_fn fn, void *user_param); + +/** + * Remove an RX packet callback from a given port and queue. + * + * This function is used to removed callbacks that were added to a NIC port + * queue using rte_eth_add_rx_callback(). + * + * Note: the callback is removed from the callback list but it isn't freed + * since the it may still be in use. The memory for the callback can be + * subsequently freed back by the application by calling rte_free(): + * + * - Immediately - if the port is stopped, or the user knows that no + * callbacks are in flight e.g. if called from the thread doing RX/TX + * on that queue. + * + * - After a short delay - where the delay is sufficient to allow any + * in-flight callbacks to complete. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue on the Ethernet device from which the callback is to be removed. + * @param user_cb + * User supplied callback created via rte_eth_add_rx_callback(). + * + * @return + * - 0: Success. Callback was removed. + * - -ENOTSUP: Callback support is not available. + * - -EINVAL: The port_id or the queue_id is out of range, or the callback + * is NULL or not found for the port/queue. + */ +int rte_eth_remove_rx_callback(uint16_t port_id, uint16_t queue_id, + const struct rte_eth_rxtx_callback *user_cb); + +/** + * Remove a TX packet callback from a given port and queue. + * + * This function is used to removed callbacks that were added to a NIC port + * queue using rte_eth_add_tx_callback(). + * + * Note: the callback is removed from the callback list but it isn't freed + * since the it may still be in use. The memory for the callback can be + * subsequently freed back by the application by calling rte_free(): + * + * - Immediately - if the port is stopped, or the user knows that no + * callbacks are in flight e.g. if called from the thread doing RX/TX + * on that queue. + * + * - After a short delay - where the delay is sufficient to allow any + * in-flight callbacks to complete. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue on the Ethernet device from which the callback is to be removed. + * @param user_cb + * User supplied callback created via rte_eth_add_tx_callback(). + * + * @return + * - 0: Success. Callback was removed. + * - -ENOTSUP: Callback support is not available. + * - -EINVAL: The port_id or the queue_id is out of range, or the callback + * is NULL or not found for the port/queue. + */ +int rte_eth_remove_tx_callback(uint16_t port_id, uint16_t queue_id, + const struct rte_eth_rxtx_callback *user_cb); + +/** + * Retrieve information about given port's RX queue. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The RX queue on the Ethernet device for which information + * will be retrieved. + * @param qinfo + * A pointer to a structure of type *rte_eth_rxq_info_info* to be filled with + * the information of the Ethernet device. + * + * @return + * - 0: Success + * - -ENOTSUP: routine is not supported by the device PMD. 
+ * - -EINVAL: The port_id or the queue_id is out of range. + */ +int rte_eth_rx_queue_info_get(uint16_t port_id, uint16_t queue_id, + struct rte_eth_rxq_info *qinfo); + +/** + * Retrieve information about given port's TX queue. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The TX queue on the Ethernet device for which information + * will be retrieved. + * @param qinfo + * A pointer to a structure of type *rte_eth_txq_info_info* to be filled with + * the information of the Ethernet device. + * + * @return + * - 0: Success + * - -ENOTSUP: routine is not supported by the device PMD. + * - -EINVAL: The port_id or the queue_id is out of range. + */ +int rte_eth_tx_queue_info_get(uint16_t port_id, uint16_t queue_id, + struct rte_eth_txq_info *qinfo); + +/** + * Retrieve device registers and register attributes (number of registers and + * register size) + * + * @param port_id + * The port identifier of the Ethernet device. + * @param info + * Pointer to rte_dev_reg_info structure to fill in. If info->data is + * NULL the function fills in the width and length fields. If non-NULL + * the registers are put into the buffer pointed at by the data field. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - others depends on the specific operations implementation. + */ +int rte_eth_dev_get_reg_info(uint16_t port_id, struct rte_dev_reg_info *info); + +/** + * Retrieve size of device EEPROM + * + * @param port_id + * The port identifier of the Ethernet device. + * @return + * - (>=0) EEPROM size if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - others depends on the specific operations implementation. + */ +int rte_eth_dev_get_eeprom_length(uint16_t port_id); + +/** + * Retrieve EEPROM and EEPROM attribute + * + * @param port_id + * The port identifier of the Ethernet device. + * @param info + * The template includes buffer for return EEPROM data and + * EEPROM attributes to be filled. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - others depends on the specific operations implementation. + */ +int rte_eth_dev_get_eeprom(uint16_t port_id, struct rte_dev_eeprom_info *info); + +/** + * Program EEPROM with provided data + * + * @param port_id + * The port identifier of the Ethernet device. + * @param info + * The template includes EEPROM data for programming and + * EEPROM attributes to be filled + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - others depends on the specific operations implementation. + */ +int rte_eth_dev_set_eeprom(uint16_t port_id, struct rte_dev_eeprom_info *info); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Retrieve the type and size of plugin module EEPROM + * + * @param port_id + * The port identifier of the Ethernet device. + * @param modinfo + * The type and size of plugin module EEPROM. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - others depends on the specific operations implementation. 
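+ *
+ * A hedged sketch combining this call with rte_eth_dev_get_module_eeprom()
+ * (declared below); the buffer size is an assumed upper bound and error
+ * handling is trimmed for brevity:
+ *
+ * @code
+ * struct rte_eth_dev_module_info modinfo;
+ * struct rte_dev_eeprom_info eeprom_info;
+ * uint8_t buf[512];                       // assumed to be large enough
+ *
+ * if (rte_eth_dev_get_module_info(port_id, &modinfo) == 0 &&
+ *                 modinfo.eeprom_len <= sizeof(buf)) {
+ *         memset(&eeprom_info, 0, sizeof(eeprom_info));
+ *         eeprom_info.data = buf;
+ *         eeprom_info.offset = 0;
+ *         eeprom_info.length = modinfo.eeprom_len;
+ *         (void)rte_eth_dev_get_module_eeprom(port_id, &eeprom_info);
+ * }
+ * @endcode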
+ */ +int __rte_experimental +rte_eth_dev_get_module_info(uint16_t port_id, + struct rte_eth_dev_module_info *modinfo); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Retrieve the data of plugin module EEPROM + * + * @param port_id + * The port identifier of the Ethernet device. + * @param info + * The template includes the plugin module EEPROM attributes, and the + * buffer for return plugin module EEPROM data. + * @return + * - (0) if successful. + * - (-ENOTSUP) if hardware doesn't support. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - others depends on the specific operations implementation. + */ +int __rte_experimental +rte_eth_dev_get_module_eeprom(uint16_t port_id, + struct rte_dev_eeprom_info *info); + +/** + * Set the list of multicast addresses to filter on an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param mc_addr_set + * The array of multicast addresses to set. Equal to NULL when the function + * is invoked to flush the set of filtered addresses. + * @param nb_mc_addr + * The number of multicast addresses in the *mc_addr_set* array. Equal to 0 + * when the function is invoked to flush the set of filtered addresses. + * @return + * - (0) if successful. + * - (-ENODEV) if *port_id* invalid. + * - (-EIO) if device is removed. + * - (-ENOTSUP) if PMD of *port_id* doesn't support multicast filtering. + * - (-ENOSPC) if *port_id* has not enough multicast filtering resources. + */ +int rte_eth_dev_set_mc_addr_list(uint16_t port_id, + struct ether_addr *mc_addr_set, + uint32_t nb_mc_addr); + +/** + * Enable IEEE1588/802.1AS timestamping for an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * + * @return + * - 0: Success. + * - -ENODEV: The port ID is invalid. + * - -EIO: if device is removed. + * - -ENOTSUP: The function is not supported by the Ethernet driver. + */ +int rte_eth_timesync_enable(uint16_t port_id); + +/** + * Disable IEEE1588/802.1AS timestamping for an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * + * @return + * - 0: Success. + * - -ENODEV: The port ID is invalid. + * - -EIO: if device is removed. + * - -ENOTSUP: The function is not supported by the Ethernet driver. + */ +int rte_eth_timesync_disable(uint16_t port_id); + +/** + * Read an IEEE1588/802.1AS RX timestamp from an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param timestamp + * Pointer to the timestamp struct. + * @param flags + * Device specific flags. Used to pass the RX timesync register index to + * i40e. Unused in igb/ixgbe, pass 0 instead. + * + * @return + * - 0: Success. + * - -EINVAL: No timestamp is available. + * - -ENODEV: The port ID is invalid. + * - -EIO: if device is removed. + * - -ENOTSUP: The function is not supported by the Ethernet driver. + */ +int rte_eth_timesync_read_rx_timestamp(uint16_t port_id, + struct timespec *timestamp, uint32_t flags); + +/** + * Read an IEEE1588/802.1AS TX timestamp from an Ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param timestamp + * Pointer to the timestamp struct. + * + * @return + * - 0: Success. + * - -EINVAL: No timestamp is available. + * - -ENODEV: The port ID is invalid. + * - -EIO: if device is removed. + * - -ENOTSUP: The function is not supported by the Ethernet driver. 
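+ *
+ * A hedged polling sketch (the retry count and delay are arbitrary; the port
+ * is assumed to have timestamping enabled and to have just sent a PTP event
+ * frame):
+ *
+ * @code
+ * struct timespec ts;
+ * int retry = 100;
+ *
+ * (void)rte_eth_timesync_enable(port_id);
+ * // ... transmit a PTP event frame on port_id ...
+ * while (rte_eth_timesync_read_tx_timestamp(port_id, &ts) < 0 && --retry > 0)
+ *         rte_delay_us(10);
+ * @endcode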
+ */ +int rte_eth_timesync_read_tx_timestamp(uint16_t port_id, + struct timespec *timestamp); + +/** + * Adjust the timesync clock on an Ethernet device. + * + * This is usually used in conjunction with other Ethdev timesync functions to + * synchronize the device time using the IEEE1588/802.1AS protocol. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param delta + * The adjustment in nanoseconds. + * + * @return + * - 0: Success. + * - -ENODEV: The port ID is invalid. + * - -EIO: if device is removed. + * - -ENOTSUP: The function is not supported by the Ethernet driver. + */ +int rte_eth_timesync_adjust_time(uint16_t port_id, int64_t delta); + +/** + * Read the time from the timesync clock on an Ethernet device. + * + * This is usually used in conjunction with other Ethdev timesync functions to + * synchronize the device time using the IEEE1588/802.1AS protocol. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param time + * Pointer to the timespec struct that holds the time. + * + * @return + * - 0: Success. + */ +int rte_eth_timesync_read_time(uint16_t port_id, struct timespec *time); + +/** + * Set the time of the timesync clock on an Ethernet device. + * + * This is usually used in conjunction with other Ethdev timesync functions to + * synchronize the device time using the IEEE1588/802.1AS protocol. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param time + * Pointer to the timespec struct that holds the time. + * + * @return + * - 0: Success. + * - -EINVAL: No timestamp is available. + * - -ENODEV: The port ID is invalid. + * - -EIO: if device is removed. + * - -ENOTSUP: The function is not supported by the Ethernet driver. + */ +int rte_eth_timesync_write_time(uint16_t port_id, const struct timespec *time); + +/** + * Config l2 tunnel ether type of an Ethernet device for filtering specific + * tunnel packets by ether type. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param l2_tunnel + * l2 tunnel configuration. + * + * @return + * - (0) if successful. + * - (-ENODEV) if port identifier is invalid. + * - (-EIO) if device is removed. + * - (-ENOTSUP) if hardware doesn't support tunnel type. + */ +int +rte_eth_dev_l2_tunnel_eth_type_conf(uint16_t port_id, + struct rte_eth_l2_tunnel_conf *l2_tunnel); + +/** + * Enable/disable l2 tunnel offload functions. Include, + * 1, The ability of parsing a type of l2 tunnel of an Ethernet device. + * Filtering, forwarding and offloading this type of tunnel packets depend on + * this ability. + * 2, Stripping the l2 tunnel tag. + * 3, Insertion of the l2 tunnel tag. + * 4, Forwarding the packets based on the l2 tunnel tag. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param l2_tunnel + * l2 tunnel parameters. + * @param mask + * Indicate the offload function. + * @param en + * Enable or disable this function. + * + * @return + * - (0) if successful. + * - (-ENODEV) if port identifier is invalid. + * - (-EIO) if device is removed. + * - (-ENOTSUP) if hardware doesn't support tunnel type. + */ +int +rte_eth_dev_l2_tunnel_offload_set(uint16_t port_id, + struct rte_eth_l2_tunnel_conf *l2_tunnel, + uint32_t mask, + uint8_t en); + +/** +* Get the port id from device name. 
The device name should be specified +* as below: +* - PCIe address (Domain:Bus:Device.Function), for example- 0000:2:00.0 +* - SoC device name, for example- fsl-gmac0 +* - vdev dpdk name, for example- net_[pcap0|null0|tap0] +* +* @param name +* pci address or name of the device +* @param port_id +* pointer to port identifier of the device +* @return +* - (0) if successful and port_id is filled. +* - (-ENODEV or -EINVAL) on failure. +*/ +int +rte_eth_dev_get_port_by_name(const char *name, uint16_t *port_id); + +/** +* Get the device name from port id. The device name is specified as below: +* - PCIe address (Domain:Bus:Device.Function), for example- 0000:02:00.0 +* - SoC device name, for example- fsl-gmac0 +* - vdev dpdk name, for example- net_[pcap0|null0|tun0|tap0] +* +* @param port_id +* Port identifier of the device. +* @param name +* Buffer of size RTE_ETH_NAME_MAX_LEN to store the name. +* @return +* - (0) if successful. +* - (-EINVAL) on failure. +*/ +int +rte_eth_dev_get_name_by_port(uint16_t port_id, char *name); + +/** + * Check that numbers of Rx and Tx descriptors satisfy descriptors limits from + * the ethernet device information, otherwise adjust them to boundaries. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param nb_rx_desc + * A pointer to a uint16_t where the number of receive + * descriptors stored. + * @param nb_tx_desc + * A pointer to a uint16_t where the number of transmit + * descriptors stored. + * @return + * - (0) if successful. + * - (-ENOTSUP, -ENODEV or -EINVAL) on failure. + */ +int rte_eth_dev_adjust_nb_rx_tx_desc(uint16_t port_id, + uint16_t *nb_rx_desc, + uint16_t *nb_tx_desc); + +/** + * Test if a port supports specific mempool ops. + * + * @param port_id + * Port identifier of the Ethernet device. + * @param [in] pool + * The name of the pool operations to test. + * @return + * - 0: best mempool ops choice for this port. + * - 1: mempool ops are supported for this port. + * - -ENOTSUP: mempool ops not supported for this port. + * - -ENODEV: Invalid port Identifier. + * - -EINVAL: Pool param is null. + */ +int +rte_eth_dev_pool_ops_supported(uint16_t port_id, const char *pool); + +/** + * Get the security context for the Ethernet device. + * + * @param port_id + * Port identifier of the Ethernet device + * @return + * - NULL on error. + * - pointer to security context on success. + */ +void * +rte_eth_dev_get_sec_ctx(uint16_t port_id); + + +#include + +/** + * + * Retrieve a burst of input packets from a receive queue of an Ethernet + * device. The retrieved packets are stored in *rte_mbuf* structures whose + * pointers are supplied in the *rx_pkts* array. + * + * The rte_eth_rx_burst() function loops, parsing the RX ring of the + * receive queue, up to *nb_pkts* packets, and for each completed RX + * descriptor in the ring, it performs the following operations: + * + * - Initialize the *rte_mbuf* data structure associated with the + * RX descriptor according to the information provided by the NIC into + * that RX descriptor. + * + * - Store the *rte_mbuf* data structure into the next entry of the + * *rx_pkts* array. + * + * - Replenish the RX descriptor with a new *rte_mbuf* buffer + * allocated from the memory pool associated with the receive queue at + * initialization time. + * + * When retrieving an input packet that was scattered by the controller + * into multiple receive descriptors, the rte_eth_rx_burst() function + * appends the associated *rte_mbuf* buffers to the first buffer of the + * packet. 
+ * + * The rte_eth_rx_burst() function returns the number of packets + * actually retrieved, which is the number of *rte_mbuf* data structures + * effectively supplied into the *rx_pkts* array. + * A return value equal to *nb_pkts* indicates that the RX queue contained + * at least *rx_pkts* packets, and this is likely to signify that other + * received packets remain in the input queue. Applications implementing + * a "retrieve as much received packets as possible" policy can check this + * specific case and keep invoking the rte_eth_rx_burst() function until + * a value less than *nb_pkts* is returned. + * + * This receive method has the following advantages: + * + * - It allows a run-to-completion network stack engine to retrieve and + * to immediately process received packets in a fast burst-oriented + * approach, avoiding the overhead of unnecessary intermediate packet + * queue/dequeue operations. + * + * - Conversely, it also allows an asynchronous-oriented processing + * method to retrieve bursts of received packets and to immediately + * queue them for further parallel processing by another logical core, + * for instance. However, instead of having received packets being + * individually queued by the driver, this approach allows the caller + * of the rte_eth_rx_burst() function to queue a burst of retrieved + * packets at a time and therefore dramatically reduce the cost of + * enqueue/dequeue operations per packet. + * + * - It allows the rte_eth_rx_burst() function of the driver to take + * advantage of burst-oriented hardware features (CPU cache, + * prefetch instructions, and so on) to minimize the number of CPU + * cycles per packet. + * + * To summarize, the proposed receive API enables many + * burst-oriented optimizations in both synchronous and asynchronous + * packet processing environments with no overhead in both cases. + * + * The rte_eth_rx_burst() function does not provide any error + * notification to avoid the corresponding overhead. As a hint, the + * upper-level application might check the status of the device link once + * being systematically returned a 0 value for a given number of tries. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The index of the receive queue from which to retrieve input packets. + * The value must be in the range [0, nb_rx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param rx_pkts + * The address of an array of pointers to *rte_mbuf* structures that + * must be large enough to store *nb_pkts* pointers in it. + * @param nb_pkts + * The maximum number of packets to retrieve. + * @return + * The number of packets actually retrieved, which is the number + * of pointers to *rte_mbuf* structures effectively supplied to the + * *rx_pkts* array. 
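+ *
+ * A typical polling-loop sketch (burst size, port and queue are illustrative;
+ * the port is assumed to be started with at least one Rx queue):
+ *
+ * @code
+ * struct rte_mbuf *pkts[32];
+ * uint16_t i, nb_rx;
+ *
+ * for (;;) {
+ *         nb_rx = rte_eth_rx_burst(port_id, 0, pkts, 32);
+ *         for (i = 0; i < nb_rx; i++) {
+ *                 // ... process pkts[i] ...
+ *                 rte_pktmbuf_free(pkts[i]);
+ *         }
+ * }
+ * @endcode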
+ */ +static inline uint16_t +rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, + struct rte_mbuf **rx_pkts, const uint16_t nb_pkts) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + uint16_t nb_rx; + +#ifdef RTE_LIBRTE_ETHDEV_DEBUG + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0); + RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0); + + if (queue_id >= dev->data->nb_rx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id); + return 0; + } +#endif + nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id], + rx_pkts, nb_pkts); + +#ifdef RTE_ETHDEV_RXTX_CALLBACKS + if (unlikely(dev->post_rx_burst_cbs[queue_id] != NULL)) { + struct rte_eth_rxtx_callback *cb = + dev->post_rx_burst_cbs[queue_id]; + + do { + nb_rx = cb->fn.rx(port_id, queue_id, rx_pkts, nb_rx, + nb_pkts, cb->param); + cb = cb->next; + } while (cb != NULL); + } +#endif + + return nb_rx; +} + +/** + * Get the number of used descriptors of a rx queue + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue id on the specific port. + * @return + * The number of used descriptors in the specific queue, or: + * (-EINVAL) if *port_id* or *queue_id* is invalid + * (-ENOTSUP) if the device does not support this function + */ +static inline int +rte_eth_rx_queue_count(uint16_t port_id, uint16_t queue_id) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_count, -ENOTSUP); + if (queue_id >= dev->data->nb_rx_queues) + return -EINVAL; + + return (int)(*dev->dev_ops->rx_queue_count)(dev, queue_id); +} + +/** + * Check if the DD bit of the specific RX descriptor in the queue has been set + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue id on the specific port. + * @param offset + * The offset of the descriptor ID from tail. + * @return + * - (1) if the specific DD bit is set. + * - (0) if the specific DD bit is not set. + * - (-ENODEV) if *port_id* invalid. + * - (-ENOTSUP) if the device does not support this function + */ +static inline int +rte_eth_rx_descriptor_done(uint16_t port_id, uint16_t queue_id, uint16_t offset) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_descriptor_done, -ENOTSUP); + return (*dev->dev_ops->rx_descriptor_done)( \ + dev->data->rx_queues[queue_id], offset); +} + +#define RTE_ETH_RX_DESC_AVAIL 0 /**< Desc available for hw. */ +#define RTE_ETH_RX_DESC_DONE 1 /**< Desc done, filled by hw. */ +#define RTE_ETH_RX_DESC_UNAVAIL 2 /**< Desc used by driver or hw. */ + +/** + * Check the status of a Rx descriptor in the queue + * + * It should be called in a similar context than the Rx function: + * - on a dataplane core + * - not concurrently on the same queue + * + * Since it's a dataplane function, no check is performed on port_id and + * queue_id. The caller must therefore ensure that the port is enabled + * and the queue is configured and running. + * + * Note: accessing to a random descriptor in the ring may trigger cache + * misses and have a performance impact. + * + * @param port_id + * A valid port identifier of the Ethernet device which. + * @param queue_id + * A valid Rx queue identifier on this port. + * @param offset + * The offset of the descriptor starting from tail (0 is the next + * packet to be received by the driver). 
+ * + * @return + * - (RTE_ETH_RX_DESC_AVAIL): Descriptor is available for the hardware to + * receive a packet. + * - (RTE_ETH_RX_DESC_DONE): Descriptor is done, it is filled by hw, but + * not yet processed by the driver (i.e. in the receive queue). + * - (RTE_ETH_RX_DESC_UNAVAIL): Descriptor is unavailable, either hold by + * the driver and not yet returned to hw, or reserved by the hw. + * - (-EINVAL) bad descriptor offset. + * - (-ENOTSUP) if the device does not support this function. + * - (-ENODEV) bad port or queue (only if compiled with debug). + */ +static inline int +rte_eth_rx_descriptor_status(uint16_t port_id, uint16_t queue_id, + uint16_t offset) +{ + struct rte_eth_dev *dev; + void *rxq; + +#ifdef RTE_LIBRTE_ETHDEV_DEBUG + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); +#endif + dev = &rte_eth_devices[port_id]; +#ifdef RTE_LIBRTE_ETHDEV_DEBUG + if (queue_id >= dev->data->nb_rx_queues) + return -ENODEV; +#endif + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_descriptor_status, -ENOTSUP); + rxq = dev->data->rx_queues[queue_id]; + + return (*dev->dev_ops->rx_descriptor_status)(rxq, offset); +} + +#define RTE_ETH_TX_DESC_FULL 0 /**< Desc filled for hw, waiting xmit. */ +#define RTE_ETH_TX_DESC_DONE 1 /**< Desc done, packet is transmitted. */ +#define RTE_ETH_TX_DESC_UNAVAIL 2 /**< Desc used by driver or hw. */ + +/** + * Check the status of a Tx descriptor in the queue. + * + * It should be called in a similar context than the Tx function: + * - on a dataplane core + * - not concurrently on the same queue + * + * Since it's a dataplane function, no check is performed on port_id and + * queue_id. The caller must therefore ensure that the port is enabled + * and the queue is configured and running. + * + * Note: accessing to a random descriptor in the ring may trigger cache + * misses and have a performance impact. + * + * @param port_id + * A valid port identifier of the Ethernet device which. + * @param queue_id + * A valid Tx queue identifier on this port. + * @param offset + * The offset of the descriptor starting from tail (0 is the place where + * the next packet will be send). + * + * @return + * - (RTE_ETH_TX_DESC_FULL) Descriptor is being processed by the hw, i.e. + * in the transmit queue. + * - (RTE_ETH_TX_DESC_DONE) Hardware is done with this descriptor, it can + * be reused by the driver. + * - (RTE_ETH_TX_DESC_UNAVAIL): Descriptor is unavailable, reserved by the + * driver or the hardware. + * - (-EINVAL) bad descriptor offset. + * - (-ENOTSUP) if the device does not support this function. + * - (-ENODEV) bad port or queue (only if compiled with debug). + */ +static inline int rte_eth_tx_descriptor_status(uint16_t port_id, + uint16_t queue_id, uint16_t offset) +{ + struct rte_eth_dev *dev; + void *txq; + +#ifdef RTE_LIBRTE_ETHDEV_DEBUG + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); +#endif + dev = &rte_eth_devices[port_id]; +#ifdef RTE_LIBRTE_ETHDEV_DEBUG + if (queue_id >= dev->data->nb_tx_queues) + return -ENODEV; +#endif + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_descriptor_status, -ENOTSUP); + txq = dev->data->tx_queues[queue_id]; + + return (*dev->dev_ops->tx_descriptor_status)(txq, offset); +} + +/** + * Send a burst of output packets on a transmit queue of an Ethernet device. + * + * The rte_eth_tx_burst() function is invoked to transmit output packets + * on the output queue *queue_id* of the Ethernet device designated by its + * *port_id*. 
+ * The *nb_pkts* parameter is the number of packets to send which are + * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them + * allocated from a pool created with rte_pktmbuf_pool_create(). + * The rte_eth_tx_burst() function loops, sending *nb_pkts* packets, + * up to the number of transmit descriptors available in the TX ring of the + * transmit queue. + * For each packet to send, the rte_eth_tx_burst() function performs + * the following operations: + * + * - Pick up the next available descriptor in the transmit ring. + * + * - Free the network buffer previously sent with that descriptor, if any. + * + * - Initialize the transmit descriptor with the information provided + * in the *rte_mbuf data structure. + * + * In the case of a segmented packet composed of a list of *rte_mbuf* buffers, + * the rte_eth_tx_burst() function uses several transmit descriptors + * of the ring. + * + * The rte_eth_tx_burst() function returns the number of packets it + * actually sent. A return value equal to *nb_pkts* means that all packets + * have been sent, and this is likely to signify that other output packets + * could be immediately transmitted again. Applications that implement a + * "send as many packets to transmit as possible" policy can check this + * specific case and keep invoking the rte_eth_tx_burst() function until + * a value less than *nb_pkts* is returned. + * + * It is the responsibility of the rte_eth_tx_burst() function to + * transparently free the memory buffers of packets previously sent. + * This feature is driven by the *tx_free_thresh* value supplied to the + * rte_eth_dev_configure() function at device configuration time. + * When the number of free TX descriptors drops below this threshold, the + * rte_eth_tx_burst() function must [attempt to] free the *rte_mbuf* buffers + * of those packets whose transmission was effectively completed. + * + * If the PMD is DEV_TX_OFFLOAD_MT_LOCKFREE capable, multiple threads can + * invoke this function concurrently on the same tx queue without SW lock. + * @see rte_eth_dev_info_get, struct rte_eth_txconf::offloads + * + * @see rte_eth_tx_prepare to perform some prior checks or adjustments + * for offloads. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The index of the transmit queue through which output packets must be + * sent. + * The value must be in the range [0, nb_tx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param tx_pkts + * The address of an array of *nb_pkts* pointers to *rte_mbuf* structures + * which contain the output packets. + * @param nb_pkts + * The maximum number of packets to transmit. + * @return + * The number of output packets actually stored in transmit descriptors of + * the transmit ring. The return value can be less than the value of the + * *tx_pkts* parameter when the transmit ring is full or has been filled up. 
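+ *
+ * A common transmit sketch that drops whatever the driver did not accept
+ * (the port, queue and the pkts/nb_pkts names are illustrative):
+ *
+ * @code
+ * uint16_t i, nb_tx;
+ *
+ * nb_tx = rte_eth_tx_burst(port_id, 0, pkts, nb_pkts);
+ * // the driver now owns the first nb_tx mbufs; free the rest ourselves
+ * for (i = nb_tx; i < nb_pkts; i++)
+ *         rte_pktmbuf_free(pkts[i]);
+ * @endcode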
+ */ +static inline uint16_t +rte_eth_tx_burst(uint16_t port_id, uint16_t queue_id, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + +#ifdef RTE_LIBRTE_ETHDEV_DEBUG + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0); + RTE_FUNC_PTR_OR_ERR_RET(*dev->tx_pkt_burst, 0); + + if (queue_id >= dev->data->nb_tx_queues) { + RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id); + return 0; + } +#endif + +#ifdef RTE_ETHDEV_RXTX_CALLBACKS + struct rte_eth_rxtx_callback *cb = dev->pre_tx_burst_cbs[queue_id]; + + if (unlikely(cb != NULL)) { + do { + nb_pkts = cb->fn.tx(port_id, queue_id, tx_pkts, nb_pkts, + cb->param); + cb = cb->next; + } while (cb != NULL); + } +#endif + + return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts); +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Process a burst of output packets on a transmit queue of an Ethernet device. + * + * The rte_eth_tx_prepare() function is invoked to prepare output packets to be + * transmitted on the output queue *queue_id* of the Ethernet device designated + * by its *port_id*. + * The *nb_pkts* parameter is the number of packets to be prepared which are + * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them + * allocated from a pool created with rte_pktmbuf_pool_create(). + * For each packet to send, the rte_eth_tx_prepare() function performs + * the following operations: + * + * - Check if packet meets devices requirements for tx offloads. + * + * - Check limitations about number of segments. + * + * - Check additional requirements when debug is enabled. + * + * - Update and/or reset required checksums when tx offload is set for packet. + * + * Since this function can modify packet data, provided mbufs must be safely + * writable (e.g. modified data cannot be in shared segment). + * + * The rte_eth_tx_prepare() function returns the number of packets ready to be + * sent. A return value equal to *nb_pkts* means that all packets are valid and + * ready to be sent, otherwise stops processing on the first invalid packet and + * leaves the rest packets untouched. + * + * When this functionality is not implemented in the driver, all packets are + * are returned untouched. + * + * @param port_id + * The port identifier of the Ethernet device. + * The value must be a valid port id. + * @param queue_id + * The index of the transmit queue through which output packets must be + * sent. + * The value must be in the range [0, nb_tx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param tx_pkts + * The address of an array of *nb_pkts* pointers to *rte_mbuf* structures + * which contain the output packets. + * @param nb_pkts + * The maximum number of packets to process. + * @return + * The number of packets correct and ready to be sent. 
The return value can be
+ * less than the value of the *tx_pkts* parameter when a packet does not
+ * meet the device's requirements; rte_errno is set appropriately:
+ * - -EINVAL: offload flags are not correctly set
+ * - -ENOTSUP: the offload feature is not supported by the hardware
+ *
+ */
+
+#ifndef RTE_ETHDEV_TX_PREPARE_NOOP
+
+static inline uint16_t
+rte_eth_tx_prepare(uint16_t port_id, uint16_t queue_id,
+        struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+        struct rte_eth_dev *dev;
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+        if (!rte_eth_dev_is_valid_port(port_id)) {
+                RTE_ETHDEV_LOG(ERR, "Invalid TX port_id=%u\n", port_id);
+                rte_errno = -EINVAL;
+                return 0;
+        }
+#endif
+
+        dev = &rte_eth_devices[port_id];
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+        if (queue_id >= dev->data->nb_tx_queues) {
+                RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id);
+                rte_errno = -EINVAL;
+                return 0;
+        }
+#endif
+
+        if (!dev->tx_pkt_prepare)
+                return nb_pkts;
+
+        return (*dev->tx_pkt_prepare)(dev->data->tx_queues[queue_id],
+                        tx_pkts, nb_pkts);
+}
+
+#else
+
+/*
+ * Native NOOP operation for compilation targets which do not require any
+ * preparation steps, and where a functional NOOP would only introduce an
+ * unnecessary performance drop.
+ *
+ * Generally it is not a good idea to turn this on globally, and it should
+ * not be used if the behavior of tx_prepare can change.
+ */
+
+static inline uint16_t
+rte_eth_tx_prepare(__rte_unused uint16_t port_id,
+                __rte_unused uint16_t queue_id,
+                __rte_unused struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+        return nb_pkts;
+}
+
+#endif
+
+/**
+ * Send any packets queued up for transmission on a port and HW queue
+ *
+ * This causes an explicit flush of packets previously buffered via the
+ * rte_eth_tx_buffer() function. It returns the number of packets successfully
+ * sent to the NIC, and calls the error callback for any unsent packets. Unless
+ * explicitly set up otherwise, the default callback simply frees the unsent
+ * packets back to the owning mempool.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the transmit queue through which output packets must be
+ *   sent.
+ *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param buffer
+ *   Buffer of packets to be transmitted.
+ * @return
+ *   The number of packets successfully sent to the Ethernet device. The error
+ *   callback is called for any packets which could not be sent.
+ */
+static inline uint16_t
+rte_eth_tx_buffer_flush(uint16_t port_id, uint16_t queue_id,
+        struct rte_eth_dev_tx_buffer *buffer)
+{
+        uint16_t sent;
+        uint16_t to_send = buffer->length;
+
+        if (to_send == 0)
+                return 0;
+
+        sent = rte_eth_tx_burst(port_id, queue_id, buffer->pkts, to_send);
+
+        buffer->length = 0;
+
+        /* All packets sent, or to be dealt with by callback below */
+        if (unlikely(sent != to_send))
+                buffer->error_callback(&buffer->pkts[sent],
+                                (uint16_t)(to_send - sent),
+                                buffer->error_userdata);
+
+        return sent;
+}
+
+/**
+ * Buffer a single packet for future transmission on a port and queue
+ *
+ * This function takes a single mbuf/packet and buffers it for later
+ * transmission on the particular port and queue specified. Once the buffer is
+ * full of packets, an attempt will be made to transmit all the buffered
+ * packets. In case of error, where not all packets can be transmitted, a
+ * callback is called with the unsent packets as a parameter.
If no callback + * is explicitly set up, the unsent packets are just freed back to the owning + * mempool. The function returns the number of packets actually sent i.e. + * 0 if no buffer flush occurred, otherwise the number of packets successfully + * flushed + * + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The index of the transmit queue through which output packets must be + * sent. + * The value must be in the range [0, nb_tx_queue - 1] previously supplied + * to rte_eth_dev_configure(). + * @param buffer + * Buffer used to collect packets to be sent. + * @param tx_pkt + * Pointer to the packet mbuf to be sent. + * @return + * 0 = packet has been buffered for later transmission + * N > 0 = packet has been buffered, and the buffer was subsequently flushed, + * causing N packets to be sent, and the error callback to be called for + * the rest. + */ +static __rte_always_inline uint16_t +rte_eth_tx_buffer(uint16_t port_id, uint16_t queue_id, + struct rte_eth_dev_tx_buffer *buffer, struct rte_mbuf *tx_pkt) +{ + buffer->pkts[buffer->length++] = tx_pkt; + if (buffer->length < buffer->size) + return 0; + + return rte_eth_tx_buffer_flush(port_id, queue_id, buffer); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ETHDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_core.h b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_core.h new file mode 100644 index 00000000..33d12b3a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_core.h @@ -0,0 +1,625 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_ETHDEV_CORE_H_ +#define _RTE_ETHDEV_CORE_H_ + +/** + * @file + * + * RTE Ethernet Device internal header. + * + * This header contains internal data types. But they are still part of the + * public API because they are used by inline functions in the published API. + * + * Applications should not use these directly. + * + */ + +struct rte_eth_dev_callback; +/** @internal Structure to keep track of registered callbacks */ +TAILQ_HEAD(rte_eth_dev_cb_list, rte_eth_dev_callback); + +/* + * Definitions of all functions exported by an Ethernet driver through the + * the generic structure of type *eth_dev_ops* supplied in the *rte_eth_dev* + * structure associated with an Ethernet device. + */ +struct rte_eth_dev; + +typedef int (*eth_dev_configure_t)(struct rte_eth_dev *dev); +/**< @internal Ethernet device configuration. */ + +typedef int (*eth_dev_start_t)(struct rte_eth_dev *dev); +/**< @internal Function used to start a configured Ethernet device. */ + +typedef void (*eth_dev_stop_t)(struct rte_eth_dev *dev); +/**< @internal Function used to stop a configured Ethernet device. */ + +typedef int (*eth_dev_set_link_up_t)(struct rte_eth_dev *dev); +/**< @internal Function used to link up a configured Ethernet device. */ + +typedef int (*eth_dev_set_link_down_t)(struct rte_eth_dev *dev); +/**< @internal Function used to link down a configured Ethernet device. */ + +typedef void (*eth_dev_close_t)(struct rte_eth_dev *dev); +/**< @internal Function used to close a configured Ethernet device. */ + +typedef int (*eth_dev_reset_t)(struct rte_eth_dev *dev); +/** <@internal Function used to reset a configured Ethernet device. */ + +typedef int (*eth_is_removed_t)(struct rte_eth_dev *dev); +/**< @internal Function used to detect an Ethernet device removal. 
*/ + +typedef void (*eth_promiscuous_enable_t)(struct rte_eth_dev *dev); +/**< @internal Function used to enable the RX promiscuous mode of an Ethernet device. */ + +typedef void (*eth_promiscuous_disable_t)(struct rte_eth_dev *dev); +/**< @internal Function used to disable the RX promiscuous mode of an Ethernet device. */ + +typedef void (*eth_allmulticast_enable_t)(struct rte_eth_dev *dev); +/**< @internal Enable the receipt of all multicast packets by an Ethernet device. */ + +typedef void (*eth_allmulticast_disable_t)(struct rte_eth_dev *dev); +/**< @internal Disable the receipt of all multicast packets by an Ethernet device. */ + +typedef int (*eth_link_update_t)(struct rte_eth_dev *dev, + int wait_to_complete); +/**< @internal Get link speed, duplex mode and state (up/down) of an Ethernet device. */ + +typedef int (*eth_stats_get_t)(struct rte_eth_dev *dev, + struct rte_eth_stats *igb_stats); +/**< @internal Get global I/O statistics of an Ethernet device. */ + +typedef void (*eth_stats_reset_t)(struct rte_eth_dev *dev); +/**< @internal Reset global I/O statistics of an Ethernet device to 0. */ + +typedef int (*eth_xstats_get_t)(struct rte_eth_dev *dev, + struct rte_eth_xstat *stats, unsigned n); +/**< @internal Get extended stats of an Ethernet device. */ + +typedef int (*eth_xstats_get_by_id_t)(struct rte_eth_dev *dev, + const uint64_t *ids, + uint64_t *values, + unsigned int n); +/**< @internal Get extended stats of an Ethernet device. */ + +typedef void (*eth_xstats_reset_t)(struct rte_eth_dev *dev); +/**< @internal Reset extended stats of an Ethernet device. */ + +typedef int (*eth_xstats_get_names_t)(struct rte_eth_dev *dev, + struct rte_eth_xstat_name *xstats_names, unsigned size); +/**< @internal Get names of extended stats of an Ethernet device. */ + +typedef int (*eth_xstats_get_names_by_id_t)(struct rte_eth_dev *dev, + struct rte_eth_xstat_name *xstats_names, const uint64_t *ids, + unsigned int size); +/**< @internal Get names of extended stats of an Ethernet device. */ + +typedef int (*eth_queue_stats_mapping_set_t)(struct rte_eth_dev *dev, + uint16_t queue_id, + uint8_t stat_idx, + uint8_t is_rx); +/**< @internal Set a queue statistics mapping for a tx/rx queue of an Ethernet device. */ + +typedef void (*eth_dev_infos_get_t)(struct rte_eth_dev *dev, + struct rte_eth_dev_info *dev_info); +/**< @internal Get specific informations of an Ethernet device. */ + +typedef const uint32_t *(*eth_dev_supported_ptypes_get_t)(struct rte_eth_dev *dev); +/**< @internal Get supported ptypes of an Ethernet device. */ + +typedef int (*eth_queue_start_t)(struct rte_eth_dev *dev, + uint16_t queue_id); +/**< @internal Start rx and tx of a queue of an Ethernet device. */ + +typedef int (*eth_queue_stop_t)(struct rte_eth_dev *dev, + uint16_t queue_id); +/**< @internal Stop rx and tx of a queue of an Ethernet device. */ + +typedef int (*eth_rx_queue_setup_t)(struct rte_eth_dev *dev, + uint16_t rx_queue_id, + uint16_t nb_rx_desc, + unsigned int socket_id, + const struct rte_eth_rxconf *rx_conf, + struct rte_mempool *mb_pool); +/**< @internal Set up a receive queue of an Ethernet device. */ + +typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev, + uint16_t tx_queue_id, + uint16_t nb_tx_desc, + unsigned int socket_id, + const struct rte_eth_txconf *tx_conf); +/**< @internal Setup a transmit queue of an Ethernet device. */ + +typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev, + uint16_t rx_queue_id); +/**< @internal Enable interrupt of a receive queue of an Ethernet device. 
*/ + +typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev, + uint16_t rx_queue_id); +/**< @internal Disable interrupt of a receive queue of an Ethernet device. */ + +typedef void (*eth_queue_release_t)(void *queue); +/**< @internal Release memory resources allocated by given RX/TX queue. */ + +typedef uint32_t (*eth_rx_queue_count_t)(struct rte_eth_dev *dev, + uint16_t rx_queue_id); +/**< @internal Get number of used descriptors on a receive queue. */ + +typedef int (*eth_rx_descriptor_done_t)(void *rxq, uint16_t offset); +/**< @internal Check DD bit of specific RX descriptor */ + +typedef int (*eth_rx_descriptor_status_t)(void *rxq, uint16_t offset); +/**< @internal Check the status of a Rx descriptor */ + +typedef int (*eth_tx_descriptor_status_t)(void *txq, uint16_t offset); +/**< @internal Check the status of a Tx descriptor */ + +typedef int (*eth_fw_version_get_t)(struct rte_eth_dev *dev, + char *fw_version, size_t fw_size); +/**< @internal Get firmware information of an Ethernet device. */ + +typedef int (*eth_tx_done_cleanup_t)(void *txq, uint32_t free_cnt); +/**< @internal Force mbufs to be from TX ring. */ + +typedef void (*eth_rxq_info_get_t)(struct rte_eth_dev *dev, + uint16_t rx_queue_id, struct rte_eth_rxq_info *qinfo); + +typedef void (*eth_txq_info_get_t)(struct rte_eth_dev *dev, + uint16_t tx_queue_id, struct rte_eth_txq_info *qinfo); + +typedef int (*mtu_set_t)(struct rte_eth_dev *dev, uint16_t mtu); +/**< @internal Set MTU. */ + +typedef int (*vlan_filter_set_t)(struct rte_eth_dev *dev, + uint16_t vlan_id, + int on); +/**< @internal filtering of a VLAN Tag Identifier by an Ethernet device. */ + +typedef int (*vlan_tpid_set_t)(struct rte_eth_dev *dev, + enum rte_vlan_type type, uint16_t tpid); +/**< @internal set the outer/inner VLAN-TPID by an Ethernet device. */ + +typedef int (*vlan_offload_set_t)(struct rte_eth_dev *dev, int mask); +/**< @internal set VLAN offload function by an Ethernet device. */ + +typedef int (*vlan_pvid_set_t)(struct rte_eth_dev *dev, + uint16_t vlan_id, + int on); +/**< @internal set port based TX VLAN insertion by an Ethernet device. */ + +typedef void (*vlan_strip_queue_set_t)(struct rte_eth_dev *dev, + uint16_t rx_queue_id, + int on); +/**< @internal VLAN stripping enable/disable by an queue of Ethernet device. */ + +typedef uint16_t (*eth_rx_burst_t)(void *rxq, + struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); +/**< @internal Retrieve input packets from a receive queue of an Ethernet device. */ + +typedef uint16_t (*eth_tx_burst_t)(void *txq, + struct rte_mbuf **tx_pkts, + uint16_t nb_pkts); +/**< @internal Send output packets on a transmit queue of an Ethernet device. */ + +typedef uint16_t (*eth_tx_prep_t)(void *txq, + struct rte_mbuf **tx_pkts, + uint16_t nb_pkts); +/**< @internal Prepare output packets on a transmit queue of an Ethernet device. 
*/ + +typedef int (*flow_ctrl_get_t)(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +/**< @internal Get current flow control parameter on an Ethernet device */ + +typedef int (*flow_ctrl_set_t)(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +/**< @internal Setup flow control parameter on an Ethernet device */ + +typedef int (*priority_flow_ctrl_set_t)(struct rte_eth_dev *dev, + struct rte_eth_pfc_conf *pfc_conf); +/**< @internal Setup priority flow control parameter on an Ethernet device */ + +typedef int (*reta_update_t)(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); +/**< @internal Update RSS redirection table on an Ethernet device */ + +typedef int (*reta_query_t)(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); +/**< @internal Query RSS redirection table on an Ethernet device */ + +typedef int (*rss_hash_update_t)(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +/**< @internal Update RSS hash configuration of an Ethernet device */ + +typedef int (*rss_hash_conf_get_t)(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +/**< @internal Get current RSS hash configuration of an Ethernet device */ + +typedef int (*eth_dev_led_on_t)(struct rte_eth_dev *dev); +/**< @internal Turn on SW controllable LED on an Ethernet device */ + +typedef int (*eth_dev_led_off_t)(struct rte_eth_dev *dev); +/**< @internal Turn off SW controllable LED on an Ethernet device */ + +typedef void (*eth_mac_addr_remove_t)(struct rte_eth_dev *dev, uint32_t index); +/**< @internal Remove MAC address from receive address register */ + +typedef int (*eth_mac_addr_add_t)(struct rte_eth_dev *dev, + struct ether_addr *mac_addr, + uint32_t index, + uint32_t vmdq); +/**< @internal Set a MAC address into Receive Address Address Register */ + +typedef int (*eth_mac_addr_set_t)(struct rte_eth_dev *dev, + struct ether_addr *mac_addr); +/**< @internal Set a MAC address into Receive Address Address Register */ + +typedef int (*eth_uc_hash_table_set_t)(struct rte_eth_dev *dev, + struct ether_addr *mac_addr, + uint8_t on); +/**< @internal Set a Unicast Hash bitmap */ + +typedef int (*eth_uc_all_hash_table_set_t)(struct rte_eth_dev *dev, + uint8_t on); +/**< @internal Set all Unicast Hash bitmap */ + +typedef int (*eth_set_queue_rate_limit_t)(struct rte_eth_dev *dev, + uint16_t queue_idx, + uint16_t tx_rate); +/**< @internal Set queue TX rate */ + +typedef int (*eth_mirror_rule_set_t)(struct rte_eth_dev *dev, + struct rte_eth_mirror_conf *mirror_conf, + uint8_t rule_id, + uint8_t on); +/**< @internal Add a traffic mirroring rule on an Ethernet device */ + +typedef int (*eth_mirror_rule_reset_t)(struct rte_eth_dev *dev, + uint8_t rule_id); +/**< @internal Remove a traffic mirroring rule on an Ethernet device */ + +typedef int (*eth_udp_tunnel_port_add_t)(struct rte_eth_dev *dev, + struct rte_eth_udp_tunnel *tunnel_udp); +/**< @internal Add tunneling UDP port */ + +typedef int (*eth_udp_tunnel_port_del_t)(struct rte_eth_dev *dev, + struct rte_eth_udp_tunnel *tunnel_udp); +/**< @internal Delete tunneling UDP port */ + +typedef int (*eth_set_mc_addr_list_t)(struct rte_eth_dev *dev, + struct ether_addr *mc_addr_set, + uint32_t nb_mc_addr); +/**< @internal set the list of multicast addresses on an Ethernet device */ + +typedef int (*eth_timesync_enable_t)(struct rte_eth_dev *dev); +/**< @internal Function used to enable IEEE1588/802.1AS timestamping. 
*/ + +typedef int (*eth_timesync_disable_t)(struct rte_eth_dev *dev); +/**< @internal Function used to disable IEEE1588/802.1AS timestamping. */ + +typedef int (*eth_timesync_read_rx_timestamp_t)(struct rte_eth_dev *dev, + struct timespec *timestamp, + uint32_t flags); +/**< @internal Function used to read an RX IEEE1588/802.1AS timestamp. */ + +typedef int (*eth_timesync_read_tx_timestamp_t)(struct rte_eth_dev *dev, + struct timespec *timestamp); +/**< @internal Function used to read a TX IEEE1588/802.1AS timestamp. */ + +typedef int (*eth_timesync_adjust_time)(struct rte_eth_dev *dev, int64_t); +/**< @internal Function used to adjust the device clock */ + +typedef int (*eth_timesync_read_time)(struct rte_eth_dev *dev, + struct timespec *timestamp); +/**< @internal Function used to get time from the device clock. */ + +typedef int (*eth_timesync_write_time)(struct rte_eth_dev *dev, + const struct timespec *timestamp); +/**< @internal Function used to get time from the device clock */ + +typedef int (*eth_get_reg_t)(struct rte_eth_dev *dev, + struct rte_dev_reg_info *info); +/**< @internal Retrieve registers */ + +typedef int (*eth_get_eeprom_length_t)(struct rte_eth_dev *dev); +/**< @internal Retrieve eeprom size */ + +typedef int (*eth_get_eeprom_t)(struct rte_eth_dev *dev, + struct rte_dev_eeprom_info *info); +/**< @internal Retrieve eeprom data */ + +typedef int (*eth_set_eeprom_t)(struct rte_eth_dev *dev, + struct rte_dev_eeprom_info *info); +/**< @internal Program eeprom data */ + +typedef int (*eth_get_module_info_t)(struct rte_eth_dev *dev, + struct rte_eth_dev_module_info *modinfo); +/**< @internal Retrieve type and size of plugin module eeprom */ + +typedef int (*eth_get_module_eeprom_t)(struct rte_eth_dev *dev, + struct rte_dev_eeprom_info *info); +/**< @internal Retrieve plugin module eeprom data */ + +typedef int (*eth_l2_tunnel_eth_type_conf_t) + (struct rte_eth_dev *dev, struct rte_eth_l2_tunnel_conf *l2_tunnel); +/**< @internal config l2 tunnel ether type */ + +typedef int (*eth_l2_tunnel_offload_set_t) + (struct rte_eth_dev *dev, + struct rte_eth_l2_tunnel_conf *l2_tunnel, + uint32_t mask, + uint8_t en); +/**< @internal enable/disable the l2 tunnel offload functions */ + + +typedef int (*eth_filter_ctrl_t)(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg); +/**< @internal Take operations to assigned filter type on an Ethernet device */ + +typedef int (*eth_tm_ops_get_t)(struct rte_eth_dev *dev, void *ops); +/**< @internal Get Traffic Management (TM) operations on an Ethernet device */ + +typedef int (*eth_mtr_ops_get_t)(struct rte_eth_dev *dev, void *ops); +/**< @internal Get Trafffic Metering and Policing (MTR) operations */ + +typedef int (*eth_get_dcb_info)(struct rte_eth_dev *dev, + struct rte_eth_dcb_info *dcb_info); +/**< @internal Get dcb information on an Ethernet device */ + +typedef int (*eth_pool_ops_supported_t)(struct rte_eth_dev *dev, + const char *pool); +/**< @internal Test if a port supports specific mempool ops */ + +/** + * @internal A structure containing the functions exported by an Ethernet driver. + */ +struct eth_dev_ops { + eth_dev_configure_t dev_configure; /**< Configure device. */ + eth_dev_start_t dev_start; /**< Start device. */ + eth_dev_stop_t dev_stop; /**< Stop device. */ + eth_dev_set_link_up_t dev_set_link_up; /**< Device link up. */ + eth_dev_set_link_down_t dev_set_link_down; /**< Device link down. */ + eth_dev_close_t dev_close; /**< Close device. 
*/ + eth_dev_reset_t dev_reset; /**< Reset device. */ + eth_link_update_t link_update; /**< Get device link state. */ + eth_is_removed_t is_removed; + /**< Check if the device was physically removed. */ + + eth_promiscuous_enable_t promiscuous_enable; /**< Promiscuous ON. */ + eth_promiscuous_disable_t promiscuous_disable;/**< Promiscuous OFF. */ + eth_allmulticast_enable_t allmulticast_enable;/**< RX multicast ON. */ + eth_allmulticast_disable_t allmulticast_disable;/**< RX multicast OFF. */ + eth_mac_addr_remove_t mac_addr_remove; /**< Remove MAC address. */ + eth_mac_addr_add_t mac_addr_add; /**< Add a MAC address. */ + eth_mac_addr_set_t mac_addr_set; /**< Set a MAC address. */ + eth_set_mc_addr_list_t set_mc_addr_list; /**< set list of mcast addrs. */ + mtu_set_t mtu_set; /**< Set MTU. */ + + eth_stats_get_t stats_get; /**< Get generic device statistics. */ + eth_stats_reset_t stats_reset; /**< Reset generic device statistics. */ + eth_xstats_get_t xstats_get; /**< Get extended device statistics. */ + eth_xstats_reset_t xstats_reset; /**< Reset extended device statistics. */ + eth_xstats_get_names_t xstats_get_names; + /**< Get names of extended statistics. */ + eth_queue_stats_mapping_set_t queue_stats_mapping_set; + /**< Configure per queue stat counter mapping. */ + + eth_dev_infos_get_t dev_infos_get; /**< Get device info. */ + eth_rxq_info_get_t rxq_info_get; /**< retrieve RX queue information. */ + eth_txq_info_get_t txq_info_get; /**< retrieve TX queue information. */ + eth_fw_version_get_t fw_version_get; /**< Get firmware version. */ + eth_dev_supported_ptypes_get_t dev_supported_ptypes_get; + /**< Get packet types supported and identified by device. */ + + vlan_filter_set_t vlan_filter_set; /**< Filter VLAN Setup. */ + vlan_tpid_set_t vlan_tpid_set; /**< Outer/Inner VLAN TPID Setup. */ + vlan_strip_queue_set_t vlan_strip_queue_set; /**< VLAN Stripping on queue. */ + vlan_offload_set_t vlan_offload_set; /**< Set VLAN Offload. */ + vlan_pvid_set_t vlan_pvid_set; /**< Set port based TX VLAN insertion. */ + + eth_queue_start_t rx_queue_start;/**< Start RX for a queue. */ + eth_queue_stop_t rx_queue_stop; /**< Stop RX for a queue. */ + eth_queue_start_t tx_queue_start;/**< Start TX for a queue. */ + eth_queue_stop_t tx_queue_stop; /**< Stop TX for a queue. */ + eth_rx_queue_setup_t rx_queue_setup;/**< Set up device RX queue. */ + eth_queue_release_t rx_queue_release; /**< Release RX queue. */ + eth_rx_queue_count_t rx_queue_count; + /**< Get the number of used RX descriptors. */ + eth_rx_descriptor_done_t rx_descriptor_done; /**< Check rxd DD bit. */ + eth_rx_descriptor_status_t rx_descriptor_status; + /**< Check the status of a Rx descriptor. */ + eth_tx_descriptor_status_t tx_descriptor_status; + /**< Check the status of a Tx descriptor. */ + eth_rx_enable_intr_t rx_queue_intr_enable; /**< Enable Rx queue interrupt. */ + eth_rx_disable_intr_t rx_queue_intr_disable; /**< Disable Rx queue interrupt. */ + eth_tx_queue_setup_t tx_queue_setup;/**< Set up device TX queue. */ + eth_queue_release_t tx_queue_release; /**< Release TX queue. */ + eth_tx_done_cleanup_t tx_done_cleanup;/**< Free tx ring mbufs */ + + eth_dev_led_on_t dev_led_on; /**< Turn on LED. */ + eth_dev_led_off_t dev_led_off; /**< Turn off LED. */ + + flow_ctrl_get_t flow_ctrl_get; /**< Get flow control. */ + flow_ctrl_set_t flow_ctrl_set; /**< Setup flow control. */ + priority_flow_ctrl_set_t priority_flow_ctrl_set; /**< Setup priority flow control. 
*/ + + eth_uc_hash_table_set_t uc_hash_table_set; /**< Set Unicast Table Array. */ + eth_uc_all_hash_table_set_t uc_all_hash_table_set; /**< Set Unicast hash bitmap. */ + + eth_mirror_rule_set_t mirror_rule_set; /**< Add a traffic mirror rule. */ + eth_mirror_rule_reset_t mirror_rule_reset; /**< reset a traffic mirror rule. */ + + eth_udp_tunnel_port_add_t udp_tunnel_port_add; /** Add UDP tunnel port. */ + eth_udp_tunnel_port_del_t udp_tunnel_port_del; /** Del UDP tunnel port. */ + eth_l2_tunnel_eth_type_conf_t l2_tunnel_eth_type_conf; + /** Config ether type of l2 tunnel. */ + eth_l2_tunnel_offload_set_t l2_tunnel_offload_set; + /** Enable/disable l2 tunnel offload functions. */ + + eth_set_queue_rate_limit_t set_queue_rate_limit; /**< Set queue rate limit. */ + + rss_hash_update_t rss_hash_update; /** Configure RSS hash protocols. */ + rss_hash_conf_get_t rss_hash_conf_get; /** Get current RSS hash configuration. */ + reta_update_t reta_update; /** Update redirection table. */ + reta_query_t reta_query; /** Query redirection table. */ + + eth_get_reg_t get_reg; /**< Get registers. */ + eth_get_eeprom_length_t get_eeprom_length; /**< Get eeprom length. */ + eth_get_eeprom_t get_eeprom; /**< Get eeprom data. */ + eth_set_eeprom_t set_eeprom; /**< Set eeprom. */ + + eth_get_module_info_t get_module_info; + /** Get plugin module eeprom attribute. */ + eth_get_module_eeprom_t get_module_eeprom; + /** Get plugin module eeprom data. */ + + eth_filter_ctrl_t filter_ctrl; /**< common filter control. */ + + eth_get_dcb_info get_dcb_info; /** Get DCB information. */ + + eth_timesync_enable_t timesync_enable; + /** Turn IEEE1588/802.1AS timestamping on. */ + eth_timesync_disable_t timesync_disable; + /** Turn IEEE1588/802.1AS timestamping off. */ + eth_timesync_read_rx_timestamp_t timesync_read_rx_timestamp; + /** Read the IEEE1588/802.1AS RX timestamp. */ + eth_timesync_read_tx_timestamp_t timesync_read_tx_timestamp; + /** Read the IEEE1588/802.1AS TX timestamp. */ + eth_timesync_adjust_time timesync_adjust_time; /** Adjust the device clock. */ + eth_timesync_read_time timesync_read_time; /** Get the device clock time. */ + eth_timesync_write_time timesync_write_time; /** Set the device clock time. */ + + eth_xstats_get_by_id_t xstats_get_by_id; + /**< Get extended device statistic values by ID. */ + eth_xstats_get_names_by_id_t xstats_get_names_by_id; + /**< Get name of extended device statistics by ID. */ + + eth_tm_ops_get_t tm_ops_get; + /**< Get Traffic Management (TM) operations. */ + + eth_mtr_ops_get_t mtr_ops_get; + /**< Get Traffic Metering and Policing (MTR) operations. */ + + eth_pool_ops_supported_t pool_ops_supported; + /**< Test if a port supports specific mempool ops */ +}; + +/** + * @internal + * Structure used to hold information about the callbacks to be called for a + * queue on RX and TX. + */ +struct rte_eth_rxtx_callback { + struct rte_eth_rxtx_callback *next; + union{ + rte_rx_callback_fn rx; + rte_tx_callback_fn tx; + } fn; + void *param; +}; + +/** + * @internal + * The generic data structure associated with each ethernet device. + * + * Pointers to burst-oriented packet receive and transmit functions are + * located at the beginning of the structure, along with the pointer to + * where all the data elements for the particular device are stored in shared + * memory. This split allows the function pointer and driver data to be per- + * process, while the actual configuration data for the device is shared. 
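+ *
+ * As an illustration of that split, the inline burst helpers in
+ * rte_ethdev.h resolve the per-process function pointer and the shared
+ * queue array on every call, following roughly this pattern (simplified
+ * from rte_eth_tx_burst() above):
+ *
+ * @code
+ * struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+ *
+ * nb_pkts = (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id],
+ *                                tx_pkts, nb_pkts);
+ * @endcode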
+ */ +struct rte_eth_dev { + eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */ + eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */ + eth_tx_prep_t tx_pkt_prepare; /**< Pointer to PMD transmit prepare function. */ + struct rte_eth_dev_data *data; /**< Pointer to device data */ + const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */ + struct rte_device *device; /**< Backing device */ + struct rte_intr_handle *intr_handle; /**< Device interrupt handle */ + /** User application callbacks for NIC interrupts */ + struct rte_eth_dev_cb_list link_intr_cbs; + /** + * User-supplied functions called from rx_burst to post-process + * received packets before passing them to the user + */ + struct rte_eth_rxtx_callback *post_rx_burst_cbs[RTE_MAX_QUEUES_PER_PORT]; + /** + * User-supplied functions called from tx_burst to pre-process + * received packets before passing them to the driver for transmission. + */ + struct rte_eth_rxtx_callback *pre_tx_burst_cbs[RTE_MAX_QUEUES_PER_PORT]; + enum rte_eth_dev_state state; /**< Flag indicating the port state */ + void *security_ctx; /**< Context for security ops */ +} __rte_cache_aligned; + +struct rte_eth_dev_sriov; +struct rte_eth_dev_owner; + +/** + * @internal + * The data part, with no function pointers, associated with each ethernet device. + * + * This structure is safe to place in shared memory to be common among different + * processes in a multi-process configuration. + */ +struct rte_eth_dev_data { + char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */ + + void **rx_queues; /**< Array of pointers to RX queues. */ + void **tx_queues; /**< Array of pointers to TX queues. */ + uint16_t nb_rx_queues; /**< Number of RX queues. */ + uint16_t nb_tx_queues; /**< Number of TX queues. */ + + struct rte_eth_dev_sriov sriov; /**< SRIOV data */ + + void *dev_private; /**< PMD-specific private data */ + + struct rte_eth_link dev_link; + /**< Link-level information & status */ + + struct rte_eth_conf dev_conf; /**< Configuration applied to device. */ + uint16_t mtu; /**< Maximum Transmission Unit. */ + + uint32_t min_rx_buf_size; + /**< Common rx buffer size handled by all queues */ + + uint64_t rx_mbuf_alloc_failed; /**< RX ring mbuf allocation failures. */ + struct ether_addr* mac_addrs;/**< Device Ethernet Link address. */ + uint64_t mac_pool_sel[ETH_NUM_RECEIVE_MAC_ADDR]; + /** bitmap array of associating Ethernet MAC addresses to pools */ + struct ether_addr* hash_mac_addrs; + /** Device Ethernet MAC addresses of hash filtering. */ + uint16_t port_id; /**< Device [external] port identifier. */ + __extension__ + uint8_t promiscuous : 1, /**< RX promiscuous mode ON(1) / OFF(0). */ + scattered_rx : 1, /**< RX of scattered packets is ON(1) / OFF(0) */ + all_multicast : 1, /**< RX all multicast mode ON(1) / OFF(0). */ + dev_started : 1, /**< Device state: STARTED(1) / STOPPED(0). */ + lro : 1; /**< RX LRO is ON(1) / OFF(0) */ + uint8_t rx_queue_state[RTE_MAX_QUEUES_PER_PORT]; + /** Queues state: STARTED(1) / STOPPED(0) */ + uint8_t tx_queue_state[RTE_MAX_QUEUES_PER_PORT]; + /** Queues state: STARTED(1) / STOPPED(0) */ + uint32_t dev_flags; /**< Capabilities */ + enum rte_kernel_driver kdrv; /**< Kernel driver passthrough */ + int numa_node; /**< NUMA node connection */ + struct rte_vlan_filter_conf vlan_filter_conf; + /**< VLAN filter configuration. */ + struct rte_eth_dev_owner owner; /**< The port owner. */ +} __rte_cache_aligned; + +/** + * @internal + * The pool of *rte_eth_dev* structures. 
The size of the pool + * is configured at compile-time in the file. + */ +extern struct rte_eth_dev rte_eth_devices[]; + +#endif /* _RTE_ETHDEV_CORE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_driver.h b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_driver.h new file mode 100644 index 00000000..c6d9bc1a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_driver.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_ETHDEV_DRIVER_H_ +#define _RTE_ETHDEV_DRIVER_H_ + +/** + * @file + * + * RTE Ethernet Device PMD API + * + * These APIs for the use from Ethernet drivers, user applications shouldn't + * use them. + * + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @internal + * Returns a ethdev slot specified by the unique identifier name. + * + * @param name + * The pointer to the Unique identifier name for each Ethernet device + * @return + * - The pointer to the ethdev slot, on success. NULL on error + */ +struct rte_eth_dev *rte_eth_dev_allocated(const char *name); + +/** + * @internal + * Allocates a new ethdev slot for an ethernet device and returns the pointer + * to that slot for the driver to use. + * + * @param name Unique identifier name for each Ethernet device + * @return + * - Slot in the rte_dev_devices array for a new device; + */ +struct rte_eth_dev *rte_eth_dev_allocate(const char *name); + +/** + * @internal + * Attach to the ethdev already initialized by the primary + * process. + * + * @param name Ethernet device's name. + * @return + * - Success: Slot in the rte_dev_devices array for attached + * device. + * - Error: Null pointer. + */ +struct rte_eth_dev *rte_eth_dev_attach_secondary(const char *name); + +/** + * @internal + * Release the specified ethdev port. + * + * @param eth_dev + * The *eth_dev* pointer is the address of the *rte_eth_dev* structure. + * @return + * - 0 on success, negative on error + */ +int rte_eth_dev_release_port(struct rte_eth_dev *eth_dev); + +/** + * @internal + * Release device queues and clear its configuration to force the user + * application to reconfigure it. It is for internal use only. + * + * @param dev + * Pointer to struct rte_eth_dev. + * + * @return + * void + */ +void _rte_eth_dev_reset(struct rte_eth_dev *dev); + +/** + * @internal Executes all the user application registered callbacks for + * the specific device. It is for DPDK internal user only. User + * application should not call it directly. + * + * @param dev + * Pointer to struct rte_eth_dev. + * @param event + * Eth device interrupt event type. + * @param ret_param + * To pass data back to user application. + * This allows the user application to decide if a particular function + * is permitted or not. + * + * @return + * int + */ +int _rte_eth_dev_callback_process(struct rte_eth_dev *dev, + enum rte_eth_event_type event, void *ret_param); + +/** + * @internal + * This is the last step of device probing. + * It must be called after a port is allocated and initialized successfully. + * + * The notification RTE_ETH_EVENT_NEW is sent to other entities + * (libraries and applications). + * The state is set as RTE_ETH_DEV_ATTACHED. + * + * @param dev + * New ethdev port. + */ +void rte_eth_dev_probing_finish(struct rte_eth_dev *dev); + +/** + * Create memzone for HW rings. + * malloc can't be used as the physical address is needed. + * If the memzone is already created, then this function returns a ptr + * to the old one. 
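+ *
+ * A hypothetical PMD rx_queue_setup handler could reserve its descriptor
+ * ring as sketched below; "rx_ring" and the descriptor type are
+ * illustrative placeholders, and the other arguments come straight from
+ * the rx_queue_setup callback parameters:
+ *
+ * @code
+ * const struct rte_memzone *mz;
+ *
+ * mz = rte_eth_dma_zone_reserve(dev, "rx_ring", rx_queue_id,
+ *                               nb_rx_desc * sizeof(union my_hw_rx_desc),
+ *                               RTE_CACHE_LINE_SIZE, socket_id);
+ * if (mz == NULL)
+ *     return -ENOMEM;
+ * @endcode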
+ * + * @param eth_dev + * The *eth_dev* pointer is the address of the *rte_eth_dev* structure + * @param name + * The name of the memory zone + * @param queue_id + * The index of the queue to add to name + * @param size + * The sizeof of the memory area + * @param align + * Alignment for resulting memzone. Must be a power of 2. + * @param socket_id + * The *socket_id* argument is the socket identifier in case of NUMA. + */ +const struct rte_memzone * +rte_eth_dma_zone_reserve(const struct rte_eth_dev *eth_dev, const char *name, + uint16_t queue_id, size_t size, + unsigned align, int socket_id); + +/** + * @internal + * Atomically set the link status for the specific device. + * It is for use by DPDK device driver use only. + * User applications should not call it + * + * @param dev + * Pointer to struct rte_eth_dev. + * @param link + * New link status value. + * @return + * Same convention as eth_link_update operation. + * 0 if link up status has changed + * -1 if link up status was unchanged + */ +static inline int +rte_eth_linkstatus_set(struct rte_eth_dev *dev, + const struct rte_eth_link *new_link) +{ + volatile uint64_t *dev_link + = (volatile uint64_t *)&(dev->data->dev_link); + union { + uint64_t val64; + struct rte_eth_link link; + } orig; + + RTE_BUILD_BUG_ON(sizeof(*new_link) != sizeof(uint64_t)); + + orig.val64 = rte_atomic64_exchange(dev_link, + *(const uint64_t *)new_link); + + return (orig.link.link_status == new_link->link_status) ? -1 : 0; +} + +/** + * @internal + * Atomically get the link speed and status. + * + * @param dev + * Pointer to struct rte_eth_dev. + * @param link + * link status value. + */ +static inline void +rte_eth_linkstatus_get(const struct rte_eth_dev *dev, + struct rte_eth_link *link) +{ + volatile uint64_t *src = (uint64_t *)&(dev->data->dev_link); + uint64_t *dst = (uint64_t *)link; + + RTE_BUILD_BUG_ON(sizeof(*link) != sizeof(uint64_t)); + +#ifdef __LP64__ + /* if cpu arch has 64 bit unsigned lon then implicitly atomic */ + *dst = *src; +#else + /* can't use rte_atomic64_read because it returns signed int */ + do { + *dst = *src; + } while (!rte_atomic64_cmpset(src, *dst, *dst)); +#endif +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Allocate an unique switch domain identifier. + * + * A pool of switch domain identifiers which can be allocated on request. This + * will enabled devices which support the concept of switch domains to request + * a switch domain id which is guaranteed to be unique from other devices + * running in the same process. + * + * @param domain_id + * switch domain identifier parameter to pass back to application + * + * @return + * Negative errno value on error, 0 on success. + */ +int __rte_experimental +rte_eth_switch_domain_alloc(uint16_t *domain_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Free switch domain. + * + * Return a switch domain identifier to the pool of free identifiers after it is + * no longer in use by device. + * + * @param domain_id + * switch domain identifier to free + * + * @return + * Negative errno value on error, 0 on success. 
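+ *
+ * The identifier is typically allocated once per physical device at probe
+ * time and released again on removal; a minimal, hedged sketch (where the
+ * driver keeps it in a hypothetical private structure "priv"):
+ *
+ * @code
+ * uint16_t domain_id;
+ * int ret;
+ *
+ * ret = rte_eth_switch_domain_alloc(&domain_id);
+ * if (ret != 0)
+ *     return ret;
+ * priv->switch_domain_id = domain_id;
+ * @endcode
+ *
+ * A matching rte_eth_switch_domain_free(priv->switch_domain_id) is then
+ * expected when the device is torn down.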
+ */ +int __rte_experimental +rte_eth_switch_domain_free(uint16_t domain_id); + +/** Generic Ethernet device arguments */ +struct rte_eth_devargs { + uint16_t ports[RTE_MAX_ETHPORTS]; + /** port/s number to enable on a multi-port single function */ + uint16_t nb_ports; + /** number of ports in ports field */ + uint16_t representor_ports[RTE_MAX_ETHPORTS]; + /** representor port/s identifier to enable on device */ + uint16_t nb_representor_ports; + /** number of ports in representor port field */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * PMD helper function to parse ethdev arguments + * + * @param devargs + * device arguments + * @param eth_devargs + * parsed ethdev specific arguments. + * + * @return + * Negative errno value on error, 0 on success. + */ +int __rte_experimental +rte_eth_devargs_parse(const char *devargs, struct rte_eth_devargs *eth_devargs); + + +typedef int (*ethdev_init_t)(struct rte_eth_dev *ethdev, void *init_params); +typedef int (*ethdev_bus_specific_init)(struct rte_eth_dev *ethdev, + void *bus_specific_init_params); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * PMD helper function for the creation of a new ethdev ports. + * + * @param device + * rte_device handle. + * @param name + * port name. + * @param priv_data_size + * size of private data required for port. + * @param bus_specific_init + * port bus specific initialisation callback function + * @param bus_init_params + * port bus specific initialisation parameters + * @param ethdev_init + * device specific port initialization callback function + * @param init_params + * port initialisation parameters + * + * @return + * Negative errno value on error, 0 on success. + */ +int __rte_experimental +rte_eth_dev_create(struct rte_device *device, const char *name, + size_t priv_data_size, + ethdev_bus_specific_init bus_specific_init, void *bus_init_params, + ethdev_init_t ethdev_init, void *init_params); + + +typedef int (*ethdev_uninit_t)(struct rte_eth_dev *ethdev); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * PMD helper function for cleaing up the resources of a ethdev port on it's + * destruction. + * + * @param ethdev + * ethdev handle of port. + * @param ethdev_uninit + * device specific port un-initialise callback function + * + * @return + * Negative errno value on error, 0 on success. + */ +int __rte_experimental +rte_eth_dev_destroy(struct rte_eth_dev *ethdev, ethdev_uninit_t ethdev_uninit); + +/** + * PMD helper function to check if keeping CRC is requested + * + * @note + * When CRC_STRIP offload flag is removed and default behavior switch to + * strip CRC, as planned, this helper function is not that useful and will be + * removed. 
In PMDs this function will be replaced with check: + * if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) + * + * @param rx_offloads + * offload bits to be applied + * + * @return + * Return positive if keeping CRC is requested, + * zero if stripping CRC is requested + */ +static inline int +rte_eth_dev_must_keep_crc(uint64_t rx_offloads) +{ + if (rx_offloads & DEV_RX_OFFLOAD_CRC_STRIP) + return 0; + + /* no KEEP_CRC or CRC_STRIP offload flags means keep CRC */ + return 1; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ETHDEV_DRIVER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_pci.h b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_pci.h new file mode 100644 index 00000000..f652596f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_pci.h @@ -0,0 +1,210 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2017 Brocade Communications Systems, Inc. + * Author: Jan Blunck + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_ETHDEV_PCI_H_ +#define _RTE_ETHDEV_PCI_H_ + +#include +#include +#include +#include +#include + +/** + * Copy pci device info to the Ethernet device data. + * + * @param eth_dev + * The *eth_dev* pointer is the address of the *rte_eth_dev* structure. + * @param pci_dev + * The *pci_dev* pointer is the address of the *rte_pci_device* structure. 
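+ *
+ * Drivers that allocate the ethdev themselves, rather than through
+ * rte_eth_dev_pci_allocate() below (which already calls this helper), can
+ * invoke it right after allocation, e.g. (sketch):
+ *
+ * @code
+ * eth_dev = rte_eth_dev_allocate(name);
+ * if (eth_dev != NULL)
+ *     rte_eth_copy_pci_info(eth_dev, pci_dev);
+ * @endcode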
+ */ +static inline void +rte_eth_copy_pci_info(struct rte_eth_dev *eth_dev, + struct rte_pci_device *pci_dev) +{ + if ((eth_dev == NULL) || (pci_dev == NULL)) { + RTE_ETHDEV_LOG(ERR, "NULL pointer eth_dev=%p pci_dev=%p", + (void *)eth_dev, (void *)pci_dev); + return; + } + + eth_dev->intr_handle = &pci_dev->intr_handle; + + eth_dev->data->dev_flags = 0; + if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC) + eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC; + if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_RMV) + eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV; + + eth_dev->data->kdrv = pci_dev->kdrv; + eth_dev->data->numa_node = pci_dev->device.numa_node; +} + +static inline int +eth_dev_pci_specific_init(struct rte_eth_dev *eth_dev, void *bus_device) { + struct rte_pci_device *pci_dev = bus_device; + + if (!pci_dev) + return -ENODEV; + + rte_eth_copy_pci_info(eth_dev, pci_dev); + + return 0; +} + +/** + * @internal + * Allocates a new ethdev slot for an ethernet device and returns the pointer + * to that slot for the driver to use. + * + * @param dev + * Pointer to the PCI device + * + * @param private_data_size + * Size of private data structure + * + * @return + * A pointer to a rte_eth_dev or NULL if allocation failed. + */ +static inline struct rte_eth_dev * +rte_eth_dev_pci_allocate(struct rte_pci_device *dev, size_t private_data_size) +{ + struct rte_eth_dev *eth_dev; + const char *name; + + if (!dev) + return NULL; + + name = dev->device.name; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + eth_dev = rte_eth_dev_allocate(name); + if (!eth_dev) + return NULL; + + if (private_data_size) { + eth_dev->data->dev_private = rte_zmalloc_socket(name, + private_data_size, RTE_CACHE_LINE_SIZE, + dev->device.numa_node); + if (!eth_dev->data->dev_private) { + rte_eth_dev_release_port(eth_dev); + return NULL; + } + } + } else { + eth_dev = rte_eth_dev_attach_secondary(name); + if (!eth_dev) + return NULL; + } + + eth_dev->device = &dev->device; + rte_eth_copy_pci_info(eth_dev, dev); + return eth_dev; +} + +static inline void +rte_eth_dev_pci_release(struct rte_eth_dev *eth_dev) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_free(eth_dev->data->dev_private); + + eth_dev->data->dev_private = NULL; + + /* + * Secondary process will check the name to attach. + * Clear this field to avoid attaching a released ports. + */ + eth_dev->data->name[0] = '\0'; + + eth_dev->device = NULL; + eth_dev->intr_handle = NULL; + + /* free ether device */ + rte_eth_dev_release_port(eth_dev); +} + +typedef int (*eth_dev_pci_callback_t)(struct rte_eth_dev *eth_dev); + +/** + * @internal + * Wrapper for use by pci drivers in a .probe function to attach to a ethdev + * interface. + */ +static inline int +rte_eth_dev_pci_generic_probe(struct rte_pci_device *pci_dev, + size_t private_data_size, eth_dev_pci_callback_t dev_init) +{ + struct rte_eth_dev *eth_dev; + int ret; + + eth_dev = rte_eth_dev_pci_allocate(pci_dev, private_data_size); + if (!eth_dev) + return -ENOMEM; + + RTE_FUNC_PTR_OR_ERR_RET(*dev_init, -EINVAL); + ret = dev_init(eth_dev); + if (ret) + rte_eth_dev_pci_release(eth_dev); + else + rte_eth_dev_probing_finish(eth_dev); + + return ret; +} + +/** + * @internal + * Wrapper for use by pci drivers in a .remove function to detach a ethdev + * interface. 
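+ *
+ * Together with rte_eth_dev_pci_generic_probe() above, a typical PMD glue
+ * layer wires these wrappers up as follows (the my_pmd_* names are
+ * illustrative placeholders):
+ *
+ * @code
+ * static int
+ * my_pmd_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+ *                  struct rte_pci_device *pci_dev)
+ * {
+ *     return rte_eth_dev_pci_generic_probe(pci_dev,
+ *                                          sizeof(struct my_pmd_adapter),
+ *                                          my_pmd_dev_init);
+ * }
+ *
+ * static int
+ * my_pmd_pci_remove(struct rte_pci_device *pci_dev)
+ * {
+ *     return rte_eth_dev_pci_generic_remove(pci_dev, my_pmd_dev_uninit);
+ * }
+ * @endcode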
+ */ +static inline int +rte_eth_dev_pci_generic_remove(struct rte_pci_device *pci_dev, + eth_dev_pci_callback_t dev_uninit) +{ + struct rte_eth_dev *eth_dev; + int ret; + + eth_dev = rte_eth_dev_allocated(pci_dev->device.name); + if (!eth_dev) + return -ENODEV; + + if (dev_uninit) { + ret = dev_uninit(eth_dev); + if (ret) + return ret; + } + + rte_eth_dev_pci_release(eth_dev); + return 0; +} + +#endif /* _RTE_ETHDEV_PCI_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_vdev.h b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_vdev.h new file mode 100644 index 00000000..259feda3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_vdev.h @@ -0,0 +1,84 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2017 Brocade Communications Systems, Inc. + * Author: Jan Blunck + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_ETHDEV_VDEV_H_ +#define _RTE_ETHDEV_VDEV_H_ + +#include +#include +#include +#include + +/** + * @internal + * Allocates a new ethdev slot for an ethernet device and returns the pointer + * to that slot for the driver to use. + * + * @param dev + * Pointer to virtual device + * + * @param private_data_size + * Size of private data structure + * + * @return + * A pointer to a rte_eth_dev or NULL if allocation failed. 
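+ *
+ * Illustrative use from a vdev PMD probe callback; the my_vdev_* names are
+ * placeholders, and a real driver would also install its rx/tx burst
+ * functions before finishing probing:
+ *
+ * @code
+ * static int
+ * my_vdev_probe(struct rte_vdev_device *vdev)
+ * {
+ *     struct rte_eth_dev *eth_dev;
+ *
+ *     eth_dev = rte_eth_vdev_allocate(vdev, sizeof(struct my_vdev_private));
+ *     if (eth_dev == NULL)
+ *         return -ENOMEM;
+ *
+ *     eth_dev->dev_ops = &my_vdev_ops;
+ *     rte_eth_dev_probing_finish(eth_dev);
+ *     return 0;
+ * }
+ * @endcode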
+ */ +static inline struct rte_eth_dev * +rte_eth_vdev_allocate(struct rte_vdev_device *dev, size_t private_data_size) +{ + struct rte_eth_dev *eth_dev; + const char *name = rte_vdev_device_name(dev); + + eth_dev = rte_eth_dev_allocate(name); + if (!eth_dev) + return NULL; + + if (private_data_size) { + eth_dev->data->dev_private = rte_zmalloc_socket(name, + private_data_size, RTE_CACHE_LINE_SIZE, + dev->device.numa_node); + if (!eth_dev->data->dev_private) { + rte_eth_dev_release_port(eth_dev); + return NULL; + } + } + + eth_dev->device = &dev->device; + eth_dev->intr_handle = NULL; + + eth_dev->data->kdrv = RTE_KDRV_NONE; + eth_dev->data->numa_node = dev->device.numa_node; + return eth_dev; +} + +#endif /* _RTE_ETHDEV_VDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_version.map b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_version.map new file mode 100644 index 00000000..38f117f0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_ethdev_version.map @@ -0,0 +1,255 @@ +DPDK_2.2 { + global: + + rte_eth_add_rx_callback; + rte_eth_add_tx_callback; + rte_eth_allmulticast_disable; + rte_eth_allmulticast_enable; + rte_eth_allmulticast_get; + rte_eth_dev_allocate; + rte_eth_dev_allocated; + rte_eth_dev_attach; + rte_eth_dev_callback_register; + rte_eth_dev_callback_unregister; + rte_eth_dev_close; + rte_eth_dev_configure; + rte_eth_dev_count; + rte_eth_dev_default_mac_addr_set; + rte_eth_dev_detach; + rte_eth_dev_filter_supported; + rte_eth_dev_flow_ctrl_get; + rte_eth_dev_flow_ctrl_set; + rte_eth_dev_get_dcb_info; + rte_eth_dev_get_eeprom; + rte_eth_dev_get_eeprom_length; + rte_eth_dev_get_mtu; + rte_eth_dev_get_reg_info; + rte_eth_dev_get_vlan_offload; + rte_eth_devices; + rte_eth_dev_info_get; + rte_eth_dev_is_valid_port; + rte_eth_dev_mac_addr_add; + rte_eth_dev_mac_addr_remove; + rte_eth_dev_priority_flow_ctrl_set; + rte_eth_dev_release_port; + rte_eth_dev_rss_hash_conf_get; + rte_eth_dev_rss_hash_update; + rte_eth_dev_rss_reta_query; + rte_eth_dev_rss_reta_update; + rte_eth_dev_rx_intr_ctl; + rte_eth_dev_rx_intr_ctl_q; + rte_eth_dev_rx_intr_disable; + rte_eth_dev_rx_intr_enable; + rte_eth_dev_rx_queue_start; + rte_eth_dev_rx_queue_stop; + rte_eth_dev_set_eeprom; + rte_eth_dev_set_link_down; + rte_eth_dev_set_link_up; + rte_eth_dev_set_mc_addr_list; + rte_eth_dev_set_mtu; + rte_eth_dev_set_rx_queue_stats_mapping; + rte_eth_dev_set_tx_queue_stats_mapping; + rte_eth_dev_set_vlan_offload; + rte_eth_dev_set_vlan_pvid; + rte_eth_dev_set_vlan_strip_on_queue; + rte_eth_dev_socket_id; + rte_eth_dev_start; + rte_eth_dev_stop; + rte_eth_dev_tx_queue_start; + rte_eth_dev_tx_queue_stop; + rte_eth_dev_uc_all_hash_table_set; + rte_eth_dev_uc_hash_table_set; + rte_eth_dev_vlan_filter; + rte_eth_dma_zone_reserve; + rte_eth_led_off; + rte_eth_led_on; + rte_eth_link; + rte_eth_link_get; + rte_eth_link_get_nowait; + rte_eth_macaddr_get; + rte_eth_mirror_rule_reset; + rte_eth_mirror_rule_set; + rte_eth_promiscuous_disable; + rte_eth_promiscuous_enable; + rte_eth_promiscuous_get; + rte_eth_remove_rx_callback; + rte_eth_remove_tx_callback; + rte_eth_rx_queue_info_get; + rte_eth_rx_queue_setup; + rte_eth_set_queue_rate_limit; + rte_eth_stats; + rte_eth_stats_get; + rte_eth_stats_reset; + rte_eth_timesync_adjust_time; + rte_eth_timesync_disable; + rte_eth_timesync_enable; + rte_eth_timesync_read_rx_timestamp; + rte_eth_timesync_read_time; + rte_eth_timesync_read_tx_timestamp; + rte_eth_timesync_write_time; + rte_eth_tx_queue_info_get; + rte_eth_tx_queue_setup; + rte_eth_xstats_get; + 
rte_eth_xstats_reset; + + local: *; +}; + +DPDK_16.04 { + global: + + rte_eth_dev_get_supported_ptypes; + rte_eth_dev_l2_tunnel_eth_type_conf; + rte_eth_dev_l2_tunnel_offload_set; + rte_eth_dev_set_vlan_ether_type; + rte_eth_dev_udp_tunnel_port_add; + rte_eth_dev_udp_tunnel_port_delete; + rte_eth_speed_bitflag; + rte_eth_tx_buffer_count_callback; + rte_eth_tx_buffer_drop_callback; + rte_eth_tx_buffer_init; + rte_eth_tx_buffer_set_err_callback; + +} DPDK_2.2; + +DPDK_16.07 { + global: + + rte_eth_add_first_rx_callback; + rte_eth_dev_get_name_by_port; + rte_eth_dev_get_port_by_name; + rte_eth_xstats_get_names; + +} DPDK_16.04; + +DPDK_17.02 { + global: + + _rte_eth_dev_reset; + rte_eth_dev_fw_version_get; + +} DPDK_16.07; + +DPDK_17.05 { + global: + + rte_eth_dev_attach_secondary; + rte_eth_find_next; + rte_eth_tx_done_cleanup; + rte_eth_xstats_get_by_id; + rte_eth_xstats_get_id_by_name; + rte_eth_xstats_get_names_by_id; + +} DPDK_17.02; + +DPDK_17.08 { + global: + + _rte_eth_dev_callback_process; + rte_eth_dev_adjust_nb_rx_tx_desc; + rte_tm_capabilities_get; + rte_tm_get_number_of_leaf_nodes; + rte_tm_hierarchy_commit; + rte_tm_level_capabilities_get; + rte_tm_mark_ip_dscp; + rte_tm_mark_ip_ecn; + rte_tm_mark_vlan_dei; + rte_tm_node_add; + rte_tm_node_capabilities_get; + rte_tm_node_cman_update; + rte_tm_node_delete; + rte_tm_node_parent_update; + rte_tm_node_resume; + rte_tm_node_shaper_update; + rte_tm_node_shared_shaper_update; + rte_tm_node_shared_wred_context_update; + rte_tm_node_stats_read; + rte_tm_node_stats_update; + rte_tm_node_suspend; + rte_tm_node_type_get; + rte_tm_node_wfq_weight_mode_update; + rte_tm_node_wred_context_update; + rte_tm_shaper_profile_add; + rte_tm_shaper_profile_delete; + rte_tm_shared_shaper_add_update; + rte_tm_shared_shaper_delete; + rte_tm_shared_wred_context_add_update; + rte_tm_shared_wred_context_delete; + rte_tm_wred_profile_add; + rte_tm_wred_profile_delete; + +} DPDK_17.05; + +DPDK_17.11 { + global: + + rte_eth_dev_get_sec_ctx; + rte_eth_dev_pool_ops_supported; + rte_eth_dev_reset; + +} DPDK_17.08; + +DPDK_18.02 { + global: + + rte_eth_dev_filter_ctrl; + +} DPDK_17.11; + +DPDK_18.05 { + global: + + rte_eth_dev_count_avail; + rte_eth_dev_probing_finish; + rte_eth_find_next_owned_by; + rte_flow_copy; + rte_flow_create; + rte_flow_destroy; + rte_flow_error_set; + rte_flow_flush; + rte_flow_isolate; + rte_flow_query; + rte_flow_validate; + +} DPDK_18.02; + +DPDK_18.08 { + global: + + rte_eth_dev_logtype; + +} DPDK_18.05; + +EXPERIMENTAL { + global: + + rte_eth_devargs_parse; + rte_eth_dev_count_total; + rte_eth_dev_create; + rte_eth_dev_destroy; + rte_eth_dev_get_module_eeprom; + rte_eth_dev_get_module_info; + rte_eth_dev_is_removed; + rte_eth_dev_owner_delete; + rte_eth_dev_owner_get; + rte_eth_dev_owner_new; + rte_eth_dev_owner_set; + rte_eth_dev_owner_unset; + rte_eth_dev_rx_offload_name; + rte_eth_dev_tx_offload_name; + rte_eth_switch_domain_alloc; + rte_eth_switch_domain_free; + rte_flow_expand_rss; + rte_mtr_capabilities_get; + rte_mtr_create; + rte_mtr_destroy; + rte_mtr_meter_disable; + rte_mtr_meter_dscp_table_update; + rte_mtr_meter_enable; + rte_mtr_meter_profile_add; + rte_mtr_meter_profile_delete; + rte_mtr_meter_profile_update; + rte_mtr_policer_actions_update; + rte_mtr_stats_read; + rte_mtr_stats_update; +}; diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_flow.c b/src/spdk/dpdk/lib/librte_ethdev/rte_flow.c new file mode 100644 index 00000000..cff4b520 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_flow.c @@ -0,0 +1,635 @@ +/* 
SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd + */ + +#include +#include +#include +#include + +#include +#include +#include +#include "rte_ethdev.h" +#include "rte_flow_driver.h" +#include "rte_flow.h" + +/** + * Flow elements description tables. + */ +struct rte_flow_desc_data { + const char *name; + size_t size; +}; + +/** Generate flow_item[] entry. */ +#define MK_FLOW_ITEM(t, s) \ + [RTE_FLOW_ITEM_TYPE_ ## t] = { \ + .name = # t, \ + .size = s, \ + } + +/** Information about known flow pattern items. */ +static const struct rte_flow_desc_data rte_flow_desc_item[] = { + MK_FLOW_ITEM(END, 0), + MK_FLOW_ITEM(VOID, 0), + MK_FLOW_ITEM(INVERT, 0), + MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)), + MK_FLOW_ITEM(PF, 0), + MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)), + MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)), + MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)), + MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), + MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)), + MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)), + MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)), + MK_FLOW_ITEM(IPV6, sizeof(struct rte_flow_item_ipv6)), + MK_FLOW_ITEM(ICMP, sizeof(struct rte_flow_item_icmp)), + MK_FLOW_ITEM(UDP, sizeof(struct rte_flow_item_udp)), + MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)), + MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)), + MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)), + MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)), + MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)), + MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)), + MK_FLOW_ITEM(NVGRE, sizeof(struct rte_flow_item_nvgre)), + MK_FLOW_ITEM(GENEVE, sizeof(struct rte_flow_item_geneve)), + MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)), + MK_FLOW_ITEM(ARP_ETH_IPV4, sizeof(struct rte_flow_item_arp_eth_ipv4)), + MK_FLOW_ITEM(IPV6_EXT, sizeof(struct rte_flow_item_ipv6_ext)), + MK_FLOW_ITEM(ICMP6, sizeof(struct rte_flow_item_icmp6)), + MK_FLOW_ITEM(ICMP6_ND_NS, sizeof(struct rte_flow_item_icmp6_nd_ns)), + MK_FLOW_ITEM(ICMP6_ND_NA, sizeof(struct rte_flow_item_icmp6_nd_na)), + MK_FLOW_ITEM(ICMP6_ND_OPT, sizeof(struct rte_flow_item_icmp6_nd_opt)), + MK_FLOW_ITEM(ICMP6_ND_OPT_SLA_ETH, + sizeof(struct rte_flow_item_icmp6_nd_opt_sla_eth)), + MK_FLOW_ITEM(ICMP6_ND_OPT_TLA_ETH, + sizeof(struct rte_flow_item_icmp6_nd_opt_tla_eth)), +}; + +/** Generate flow_action[] entry. */ +#define MK_FLOW_ACTION(t, s) \ + [RTE_FLOW_ACTION_TYPE_ ## t] = { \ + .name = # t, \ + .size = s, \ + } + +/** Information about known flow actions. 
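+ *
+ * Each MK_FLOW_ACTION(t, s) entry below expands to an initializer of the
+ * form (shown here for QUEUE as an example):
+ *
+ *   [RTE_FLOW_ACTION_TYPE_QUEUE] = {
+ *       .name = "QUEUE",
+ *       .size = sizeof(struct rte_flow_action_queue),
+ *   }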
*/ +static const struct rte_flow_desc_data rte_flow_desc_action[] = { + MK_FLOW_ACTION(END, 0), + MK_FLOW_ACTION(VOID, 0), + MK_FLOW_ACTION(PASSTHRU, 0), + MK_FLOW_ACTION(MARK, sizeof(struct rte_flow_action_mark)), + MK_FLOW_ACTION(FLAG, 0), + MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)), + MK_FLOW_ACTION(DROP, 0), + MK_FLOW_ACTION(COUNT, sizeof(struct rte_flow_action_count)), + MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), + MK_FLOW_ACTION(PF, 0), + MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)), + MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)), + MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)), + MK_FLOW_ACTION(OF_SET_MPLS_TTL, + sizeof(struct rte_flow_action_of_set_mpls_ttl)), + MK_FLOW_ACTION(OF_DEC_MPLS_TTL, 0), + MK_FLOW_ACTION(OF_SET_NW_TTL, + sizeof(struct rte_flow_action_of_set_nw_ttl)), + MK_FLOW_ACTION(OF_DEC_NW_TTL, 0), + MK_FLOW_ACTION(OF_COPY_TTL_OUT, 0), + MK_FLOW_ACTION(OF_COPY_TTL_IN, 0), + MK_FLOW_ACTION(OF_POP_VLAN, 0), + MK_FLOW_ACTION(OF_PUSH_VLAN, + sizeof(struct rte_flow_action_of_push_vlan)), + MK_FLOW_ACTION(OF_SET_VLAN_VID, + sizeof(struct rte_flow_action_of_set_vlan_vid)), + MK_FLOW_ACTION(OF_SET_VLAN_PCP, + sizeof(struct rte_flow_action_of_set_vlan_pcp)), + MK_FLOW_ACTION(OF_POP_MPLS, + sizeof(struct rte_flow_action_of_pop_mpls)), + MK_FLOW_ACTION(OF_PUSH_MPLS, + sizeof(struct rte_flow_action_of_push_mpls)), +}; + +static int +flow_err(uint16_t port_id, int ret, struct rte_flow_error *error) +{ + if (ret == 0) + return 0; + if (rte_eth_dev_is_removed(port_id)) + return rte_flow_error_set(error, EIO, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, rte_strerror(EIO)); + return ret; +} + +/* Get generic flow operations structure from a port. */ +const struct rte_flow_ops * +rte_flow_ops_get(uint16_t port_id, struct rte_flow_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + const struct rte_flow_ops *ops; + int code; + + if (unlikely(!rte_eth_dev_is_valid_port(port_id))) + code = ENODEV; + else if (unlikely(!dev->dev_ops->filter_ctrl || + dev->dev_ops->filter_ctrl(dev, + RTE_ETH_FILTER_GENERIC, + RTE_ETH_FILTER_GET, + &ops) || + !ops)) + code = ENOSYS; + else + return ops; + rte_flow_error_set(error, code, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, rte_strerror(code)); + return NULL; +} + +/* Check whether a flow rule can be created on a given port. */ +int +rte_flow_validate(uint16_t port_id, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error); + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + + if (unlikely(!ops)) + return -rte_errno; + if (likely(!!ops->validate)) + return flow_err(port_id, ops->validate(dev, attr, pattern, + actions, error), error); + return rte_flow_error_set(error, ENOSYS, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, rte_strerror(ENOSYS)); +} + +/* Create a flow rule on a given port. 
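+ *
+ * A minimal usage sketch, assuming port 0 is configured and that "attr",
+ * "pattern" and "actions" are application-defined; callers typically
+ * validate a rule before creating it:
+ *
+ *     struct rte_flow_error err;
+ *     struct rte_flow *flow = NULL;
+ *
+ *     if (rte_flow_validate(0, &attr, pattern, actions, &err) == 0)
+ *             flow = rte_flow_create(0, &attr, pattern, actions, &err);
+ *
+ * On failure the error structure describes the reason and rte_errno is set.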
*/ +struct rte_flow * +rte_flow_create(uint16_t port_id, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + struct rte_flow *flow; + const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error); + + if (unlikely(!ops)) + return NULL; + if (likely(!!ops->create)) { + flow = ops->create(dev, attr, pattern, actions, error); + if (flow == NULL) + flow_err(port_id, -rte_errno, error); + return flow; + } + rte_flow_error_set(error, ENOSYS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, rte_strerror(ENOSYS)); + return NULL; +} + +/* Destroy a flow rule on a given port. */ +int +rte_flow_destroy(uint16_t port_id, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error); + + if (unlikely(!ops)) + return -rte_errno; + if (likely(!!ops->destroy)) + return flow_err(port_id, ops->destroy(dev, flow, error), + error); + return rte_flow_error_set(error, ENOSYS, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, rte_strerror(ENOSYS)); +} + +/* Destroy all flow rules associated with a port. */ +int +rte_flow_flush(uint16_t port_id, + struct rte_flow_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error); + + if (unlikely(!ops)) + return -rte_errno; + if (likely(!!ops->flush)) + return flow_err(port_id, ops->flush(dev, error), error); + return rte_flow_error_set(error, ENOSYS, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, rte_strerror(ENOSYS)); +} + +/* Query an existing flow rule. */ +int +rte_flow_query(uint16_t port_id, + struct rte_flow *flow, + const struct rte_flow_action *action, + void *data, + struct rte_flow_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error); + + if (!ops) + return -rte_errno; + if (likely(!!ops->query)) + return flow_err(port_id, ops->query(dev, flow, action, data, + error), error); + return rte_flow_error_set(error, ENOSYS, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, rte_strerror(ENOSYS)); +} + +/* Restrict ingress traffic to the defined flow rules. */ +int +rte_flow_isolate(uint16_t port_id, + int set, + struct rte_flow_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error); + + if (!ops) + return -rte_errno; + if (likely(!!ops->isolate)) + return flow_err(port_id, ops->isolate(dev, set, error), error); + return rte_flow_error_set(error, ENOSYS, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, rte_strerror(ENOSYS)); +} + +/* Initialize flow error structure. */ +int +rte_flow_error_set(struct rte_flow_error *error, + int code, + enum rte_flow_error_type type, + const void *cause, + const char *message) +{ + if (error) { + *error = (struct rte_flow_error){ + .type = type, + .cause = cause, + .message = message, + }; + } + rte_errno = code; + return -code; +} + +/** Pattern item specification types. */ +enum item_spec_type { + ITEM_SPEC, + ITEM_LAST, + ITEM_MASK, +}; + +/** Compute storage space needed by item specification and copy it. */ +static size_t +flow_item_spec_copy(void *buf, const struct rte_flow_item *item, + enum item_spec_type type) +{ + size_t size = 0; + const void *data = + type == ITEM_SPEC ? item->spec : + type == ITEM_LAST ? 
item->last : + type == ITEM_MASK ? item->mask : + NULL; + + if (!item->spec || !data) + goto empty; + switch (item->type) { + union { + const struct rte_flow_item_raw *raw; + } spec; + union { + const struct rte_flow_item_raw *raw; + } last; + union { + const struct rte_flow_item_raw *raw; + } mask; + union { + const struct rte_flow_item_raw *raw; + } src; + union { + struct rte_flow_item_raw *raw; + } dst; + size_t off; + + case RTE_FLOW_ITEM_TYPE_RAW: + spec.raw = item->spec; + last.raw = item->last ? item->last : item->spec; + mask.raw = item->mask ? item->mask : &rte_flow_item_raw_mask; + src.raw = data; + dst.raw = buf; + off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw), + sizeof(*src.raw->pattern)); + if (type == ITEM_SPEC || + (type == ITEM_MASK && + ((spec.raw->length & mask.raw->length) >= + (last.raw->length & mask.raw->length)))) + size = spec.raw->length & mask.raw->length; + else + size = last.raw->length & mask.raw->length; + size = off + size * sizeof(*src.raw->pattern); + if (dst.raw) { + memcpy(dst.raw, src.raw, sizeof(*src.raw)); + dst.raw->pattern = memcpy((uint8_t *)dst.raw + off, + src.raw->pattern, + size - off); + } + break; + default: + size = rte_flow_desc_item[item->type].size; + if (buf) + memcpy(buf, data, size); + break; + } +empty: + return RTE_ALIGN_CEIL(size, sizeof(double)); +} + +/** Compute storage space needed by action configuration and copy it. */ +static size_t +flow_action_conf_copy(void *buf, const struct rte_flow_action *action) +{ + size_t size = 0; + + if (!action->conf) + goto empty; + switch (action->type) { + union { + const struct rte_flow_action_rss *rss; + } src; + union { + struct rte_flow_action_rss *rss; + } dst; + size_t off; + + case RTE_FLOW_ACTION_TYPE_RSS: + src.rss = action->conf; + dst.rss = buf; + off = 0; + if (dst.rss) + *dst.rss = (struct rte_flow_action_rss){ + .func = src.rss->func, + .level = src.rss->level, + .types = src.rss->types, + .key_len = src.rss->key_len, + .queue_num = src.rss->queue_num, + }; + off += sizeof(*src.rss); + if (src.rss->key_len) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + size = sizeof(*src.rss->key) * src.rss->key_len; + if (dst.rss) + dst.rss->key = memcpy + ((void *)((uintptr_t)dst.rss + off), + src.rss->key, size); + off += size; + } + if (src.rss->queue_num) { + off = RTE_ALIGN_CEIL(off, sizeof(double)); + size = sizeof(*src.rss->queue) * src.rss->queue_num; + if (dst.rss) + dst.rss->queue = memcpy + ((void *)((uintptr_t)dst.rss + off), + src.rss->queue, size); + off += size; + } + size = off; + break; + default: + size = rte_flow_desc_action[action->type].size; + if (buf) + memcpy(buf, action->conf, size); + break; + } +empty: + return RTE_ALIGN_CEIL(size, sizeof(double)); +} + +/** Store a full rte_flow description. 
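+ *
+ * The return value is the storage size the description requires, so a
+ * caller that does not know it in advance can query first and then copy
+ * the caller's attr, items and actions; a sketch (allocation checks and
+ * error handling abbreviated):
+ *
+ *     size_t need = rte_flow_copy(NULL, 0, attr, items, actions);
+ *     struct rte_flow_desc *desc = malloc(need);
+ *
+ *     if (need && desc)
+ *             rte_flow_copy(desc, need, attr, items, actions);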
*/ +size_t +rte_flow_copy(struct rte_flow_desc *desc, size_t len, + const struct rte_flow_attr *attr, + const struct rte_flow_item *items, + const struct rte_flow_action *actions) +{ + struct rte_flow_desc *fd = NULL; + size_t tmp; + size_t off1 = 0; + size_t off2 = 0; + size_t size = 0; + +store: + if (items) { + const struct rte_flow_item *item; + + item = items; + if (fd) + fd->items = (void *)&fd->data[off1]; + do { + struct rte_flow_item *dst = NULL; + + if ((size_t)item->type >= + RTE_DIM(rte_flow_desc_item) || + !rte_flow_desc_item[item->type].name) { + rte_errno = ENOTSUP; + return 0; + } + if (fd) + dst = memcpy(fd->data + off1, item, + sizeof(*item)); + off1 += sizeof(*item); + if (item->spec) { + if (fd) + dst->spec = fd->data + off2; + off2 += flow_item_spec_copy + (fd ? fd->data + off2 : NULL, item, + ITEM_SPEC); + } + if (item->last) { + if (fd) + dst->last = fd->data + off2; + off2 += flow_item_spec_copy + (fd ? fd->data + off2 : NULL, item, + ITEM_LAST); + } + if (item->mask) { + if (fd) + dst->mask = fd->data + off2; + off2 += flow_item_spec_copy + (fd ? fd->data + off2 : NULL, item, + ITEM_MASK); + } + off2 = RTE_ALIGN_CEIL(off2, sizeof(double)); + } while ((item++)->type != RTE_FLOW_ITEM_TYPE_END); + off1 = RTE_ALIGN_CEIL(off1, sizeof(double)); + } + if (actions) { + const struct rte_flow_action *action; + + action = actions; + if (fd) + fd->actions = (void *)&fd->data[off1]; + do { + struct rte_flow_action *dst = NULL; + + if ((size_t)action->type >= + RTE_DIM(rte_flow_desc_action) || + !rte_flow_desc_action[action->type].name) { + rte_errno = ENOTSUP; + return 0; + } + if (fd) + dst = memcpy(fd->data + off1, action, + sizeof(*action)); + off1 += sizeof(*action); + if (action->conf) { + if (fd) + dst->conf = fd->data + off2; + off2 += flow_action_conf_copy + (fd ? fd->data + off2 : NULL, action); + } + off2 = RTE_ALIGN_CEIL(off2, sizeof(double)); + } while ((action++)->type != RTE_FLOW_ACTION_TYPE_END); + } + if (fd != NULL) + return size; + off1 = RTE_ALIGN_CEIL(off1, sizeof(double)); + tmp = RTE_ALIGN_CEIL(offsetof(struct rte_flow_desc, data), + sizeof(double)); + size = tmp + off1 + off2; + if (size > len) + return size; + fd = desc; + if (fd != NULL) { + *fd = (const struct rte_flow_desc) { + .size = size, + .attr = *attr, + }; + tmp -= offsetof(struct rte_flow_desc, data); + off2 = tmp + off1; + off1 = tmp; + goto store; + } + return 0; +} + +/** + * Expand RSS flows into several possible flows according to the RSS hash + * fields requested and the driver capabilities. 
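+ *
+ * The return value is the number of bytes the expansion requires, so a
+ * caller (typically a PMD) can detect that a fixed buffer was too small
+ * and retry. A rough sketch, assuming an expansion graph "graph" rooted
+ * at index "root" and requested hash types in "types":
+ *
+ *     uint8_t mem[4096];
+ *     struct rte_flow_expand_rss *buf = (void *)mem;
+ *     int ret = rte_flow_expand_rss(buf, sizeof(mem), pattern, types,
+ *                                   graph, root);
+ *
+ *     if (ret > 0 && (size_t)ret <= sizeof(mem))
+ *             ... buf->entries expanded patterns are in buf->entry[] ...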
+ */ +int __rte_experimental +rte_flow_expand_rss(struct rte_flow_expand_rss *buf, size_t size, + const struct rte_flow_item *pattern, uint64_t types, + const struct rte_flow_expand_node graph[], + int graph_root_index) +{ + const int elt_n = 8; + const struct rte_flow_item *item; + const struct rte_flow_expand_node *node = &graph[graph_root_index]; + const int *next_node; + const int *stack[elt_n]; + int stack_pos = 0; + struct rte_flow_item flow_items[elt_n]; + unsigned int i; + size_t lsize; + size_t user_pattern_size = 0; + void *addr = NULL; + + lsize = offsetof(struct rte_flow_expand_rss, entry) + + elt_n * sizeof(buf->entry[0]); + if (lsize <= size) { + buf->entry[0].priority = 0; + buf->entry[0].pattern = (void *)&buf->entry[elt_n]; + buf->entries = 0; + addr = buf->entry[0].pattern; + } + for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { + const struct rte_flow_expand_node *next = NULL; + + for (i = 0; node->next && node->next[i]; ++i) { + next = &graph[node->next[i]]; + if (next->type == item->type) + break; + } + if (next) + node = next; + user_pattern_size += sizeof(*item); + } + user_pattern_size += sizeof(*item); /* Handle END item. */ + lsize += user_pattern_size; + /* Copy the user pattern in the first entry of the buffer. */ + if (lsize <= size) { + rte_memcpy(addr, pattern, user_pattern_size); + addr = (void *)(((uintptr_t)addr) + user_pattern_size); + buf->entries = 1; + } + /* Start expanding. */ + memset(flow_items, 0, sizeof(flow_items)); + user_pattern_size -= sizeof(*item); + next_node = node->next; + stack[stack_pos] = next_node; + node = next_node ? &graph[*next_node] : NULL; + while (node) { + flow_items[stack_pos].type = node->type; + if (node->rss_types & types) { + /* + * compute the number of items to copy from the + * expansion and copy it. + * When the stack_pos is 0, there are 1 element in it, + * plus the addition END item. + */ + int elt = stack_pos + 2; + + flow_items[stack_pos + 1].type = RTE_FLOW_ITEM_TYPE_END; + lsize += elt * sizeof(*item) + user_pattern_size; + if (lsize <= size) { + size_t n = elt * sizeof(*item); + + buf->entry[buf->entries].priority = + stack_pos + 1; + buf->entry[buf->entries].pattern = addr; + buf->entries++; + rte_memcpy(addr, buf->entry[0].pattern, + user_pattern_size); + addr = (void *)(((uintptr_t)addr) + + user_pattern_size); + rte_memcpy(addr, flow_items, n); + addr = (void *)(((uintptr_t)addr) + n); + } + } + /* Go deeper. */ + if (node->next) { + next_node = node->next; + if (stack_pos++ == elt_n) { + rte_errno = E2BIG; + return -rte_errno; + } + stack[stack_pos] = next_node; + } else if (*(next_node + 1)) { + /* Follow up with the next possibility. */ + ++next_node; + } else { + /* Move to the next path. */ + if (stack_pos) + next_node = stack[--stack_pos]; + next_node++; + stack[stack_pos] = next_node; + } + node = *next_node ? &graph[*next_node] : NULL; + }; + return lsize; +} diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_flow.h b/src/spdk/dpdk/lib/librte_ethdev/rte_flow.h new file mode 100644 index 00000000..f8ba71cd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_flow.h @@ -0,0 +1,2208 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd + */ + +#ifndef RTE_FLOW_H_ +#define RTE_FLOW_H_ + +/** + * @file + * RTE generic flow API + * + * This interface provides the ability to program packet matching and + * associated actions in hardware through flow rules. 
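+ *
+ * As an illustration, a rule steering ingress TCP traffic to queue 3 on
+ * port "port_id" combines attributes, a pattern and an action list (a
+ * sketch; error checking omitted):
+ *
+ *     struct rte_flow_attr attr = { .ingress = 1 };
+ *     struct rte_flow_item pattern[] = {
+ *             { .type = RTE_FLOW_ITEM_TYPE_ETH },
+ *             { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
+ *             { .type = RTE_FLOW_ITEM_TYPE_TCP },
+ *             { .type = RTE_FLOW_ITEM_TYPE_END },
+ *     };
+ *     struct rte_flow_action_queue queue = { .index = 3 };
+ *     struct rte_flow_action actions[] = {
+ *             { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
+ *             { .type = RTE_FLOW_ACTION_TYPE_END },
+ *     };
+ *     struct rte_flow_error error;
+ *     struct rte_flow *flow = rte_flow_create(port_id, &attr, pattern,
+ *                                             actions, &error);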
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Flow rule attributes. + * + * Priorities are set on a per rule based within groups. + * + * Lower values denote higher priority, the highest priority for a flow rule + * is 0, so that a flow that matches for than one rule, the rule with the + * lowest priority value will always be matched. + * + * Although optional, applications are encouraged to group similar rules as + * much as possible to fully take advantage of hardware capabilities + * (e.g. optimized matching) and work around limitations (e.g. a single + * pattern type possibly allowed in a given group). Applications should be + * aware that groups are not linked by default, and that they must be + * explicitly linked by the application using the JUMP action. + * + * Priority levels are arbitrary and up to the application, they + * do not need to be contiguous nor start from 0, however the maximum number + * varies between devices and may be affected by existing flow rules. + * + * If a packet is matched by several rules of a given group for a given + * priority level, the outcome is undefined. It can take any path, may be + * duplicated or even cause unrecoverable errors. + * + * Note that support for more than a single group and priority level is not + * guaranteed. + * + * Flow rules can apply to inbound and/or outbound traffic (ingress/egress). + * + * Several pattern items and actions are valid and can be used in both + * directions. Those valid for only one direction are described as such. + * + * At least one direction must be specified. + * + * Specifying both directions at once for a given rule is not recommended + * but may be valid in a few cases (e.g. shared counter). + */ +struct rte_flow_attr { + uint32_t group; /**< Priority group. */ + uint32_t priority; /**< Rule priority level within group. */ + uint32_t ingress:1; /**< Rule applies to ingress traffic. */ + uint32_t egress:1; /**< Rule applies to egress traffic. */ + /** + * Instead of simply matching the properties of traffic as it would + * appear on a given DPDK port ID, enabling this attribute transfers + * a flow rule to the lowest possible level of any device endpoints + * found in the pattern. + * + * When supported, this effectively enables an application to + * re-route traffic not necessarily intended for it (e.g. coming + * from or addressed to different physical ports, VFs or + * applications) at the device level. + * + * It complements the behavior of some pattern items such as + * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them. + * + * When transferring flow rules, ingress and egress attributes keep + * their original meaning, as if processing traffic emitted or + * received by the application. + */ + uint32_t transfer:1; + uint32_t reserved:29; /**< Reserved, must be zero. */ +}; + +/** + * Matching pattern item types. + * + * Pattern items fall in two categories: + * + * - Matching protocol headers and packet data, usually associated with a + * specification structure. These must be stacked in the same order as the + * protocol layers to match inside packets, starting from the lowest. + * + * - Matching meta-data or affecting pattern processing, often without a + * specification structure. Since they do not match packet contents, their + * position in the list is usually not relevant. + * + * See the description of individual types for more information. 
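+ * As an example of the first category, matching UDP over IPv4 uses the
+ * item stack ETH, IPV4, UDP, END, in that exact order.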
Those + * marked with [META] fall into the second category. + */ +enum rte_flow_item_type { + /** + * [META] + * + * End marker for item lists. Prevents further processing of items, + * thereby ending the pattern. + * + * No associated specification structure. + */ + RTE_FLOW_ITEM_TYPE_END, + + /** + * [META] + * + * Used as a placeholder for convenience. It is ignored and simply + * discarded by PMDs. + * + * No associated specification structure. + */ + RTE_FLOW_ITEM_TYPE_VOID, + + /** + * [META] + * + * Inverted matching, i.e. process packets that do not match the + * pattern. + * + * No associated specification structure. + */ + RTE_FLOW_ITEM_TYPE_INVERT, + + /** + * Matches any protocol in place of the current layer, a single ANY + * may also stand for several protocol layers. + * + * See struct rte_flow_item_any. + */ + RTE_FLOW_ITEM_TYPE_ANY, + + /** + * [META] + * + * Matches traffic originating from (ingress) or going to (egress) + * the physical function of the current device. + * + * No associated specification structure. + */ + RTE_FLOW_ITEM_TYPE_PF, + + /** + * [META] + * + * Matches traffic originating from (ingress) or going to (egress) a + * given virtual function of the current device. + * + * See struct rte_flow_item_vf. + */ + RTE_FLOW_ITEM_TYPE_VF, + + /** + * [META] + * + * Matches traffic originating from (ingress) or going to (egress) a + * physical port of the underlying device. + * + * See struct rte_flow_item_phy_port. + */ + RTE_FLOW_ITEM_TYPE_PHY_PORT, + + /** + * [META] + * + * Matches traffic originating from (ingress) or going to (egress) a + * given DPDK port ID. + * + * See struct rte_flow_item_port_id. + */ + RTE_FLOW_ITEM_TYPE_PORT_ID, + + /** + * Matches a byte string of a given length at a given offset. + * + * See struct rte_flow_item_raw. + */ + RTE_FLOW_ITEM_TYPE_RAW, + + /** + * Matches an Ethernet header. + * + * See struct rte_flow_item_eth. + */ + RTE_FLOW_ITEM_TYPE_ETH, + + /** + * Matches an 802.1Q/ad VLAN tag. + * + * See struct rte_flow_item_vlan. + */ + RTE_FLOW_ITEM_TYPE_VLAN, + + /** + * Matches an IPv4 header. + * + * See struct rte_flow_item_ipv4. + */ + RTE_FLOW_ITEM_TYPE_IPV4, + + /** + * Matches an IPv6 header. + * + * See struct rte_flow_item_ipv6. + */ + RTE_FLOW_ITEM_TYPE_IPV6, + + /** + * Matches an ICMP header. + * + * See struct rte_flow_item_icmp. + */ + RTE_FLOW_ITEM_TYPE_ICMP, + + /** + * Matches a UDP header. + * + * See struct rte_flow_item_udp. + */ + RTE_FLOW_ITEM_TYPE_UDP, + + /** + * Matches a TCP header. + * + * See struct rte_flow_item_tcp. + */ + RTE_FLOW_ITEM_TYPE_TCP, + + /** + * Matches a SCTP header. + * + * See struct rte_flow_item_sctp. + */ + RTE_FLOW_ITEM_TYPE_SCTP, + + /** + * Matches a VXLAN header. + * + * See struct rte_flow_item_vxlan. + */ + RTE_FLOW_ITEM_TYPE_VXLAN, + + /** + * Matches a E_TAG header. + * + * See struct rte_flow_item_e_tag. + */ + RTE_FLOW_ITEM_TYPE_E_TAG, + + /** + * Matches a NVGRE header. + * + * See struct rte_flow_item_nvgre. + */ + RTE_FLOW_ITEM_TYPE_NVGRE, + + /** + * Matches a MPLS header. + * + * See struct rte_flow_item_mpls. + */ + RTE_FLOW_ITEM_TYPE_MPLS, + + /** + * Matches a GRE header. + * + * See struct rte_flow_item_gre. + */ + RTE_FLOW_ITEM_TYPE_GRE, + + /** + * [META] + * + * Fuzzy pattern match, expect faster than default. + * + * This is for device that support fuzzy matching option. + * Usually a fuzzy matching is fast but the cost is accuracy. + * + * See struct rte_flow_item_fuzzy. + */ + RTE_FLOW_ITEM_TYPE_FUZZY, + + /** + * Matches a GTP header. 
+ * + * Configure flow for GTP packets. + * + * See struct rte_flow_item_gtp. + */ + RTE_FLOW_ITEM_TYPE_GTP, + + /** + * Matches a GTP header. + * + * Configure flow for GTP-C packets. + * + * See struct rte_flow_item_gtp. + */ + RTE_FLOW_ITEM_TYPE_GTPC, + + /** + * Matches a GTP header. + * + * Configure flow for GTP-U packets. + * + * See struct rte_flow_item_gtp. + */ + RTE_FLOW_ITEM_TYPE_GTPU, + + /** + * Matches a ESP header. + * + * See struct rte_flow_item_esp. + */ + RTE_FLOW_ITEM_TYPE_ESP, + + /** + * Matches a GENEVE header. + * + * See struct rte_flow_item_geneve. + */ + RTE_FLOW_ITEM_TYPE_GENEVE, + + /** + * Matches a VXLAN-GPE header. + * + * See struct rte_flow_item_vxlan_gpe. + */ + RTE_FLOW_ITEM_TYPE_VXLAN_GPE, + + /** + * Matches an ARP header for Ethernet/IPv4. + * + * See struct rte_flow_item_arp_eth_ipv4. + */ + RTE_FLOW_ITEM_TYPE_ARP_ETH_IPV4, + + /** + * Matches the presence of any IPv6 extension header. + * + * See struct rte_flow_item_ipv6_ext. + */ + RTE_FLOW_ITEM_TYPE_IPV6_EXT, + + /** + * Matches any ICMPv6 header. + * + * See struct rte_flow_item_icmp6. + */ + RTE_FLOW_ITEM_TYPE_ICMP6, + + /** + * Matches an ICMPv6 neighbor discovery solicitation. + * + * See struct rte_flow_item_icmp6_nd_ns. + */ + RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS, + + /** + * Matches an ICMPv6 neighbor discovery advertisement. + * + * See struct rte_flow_item_icmp6_nd_na. + */ + RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA, + + /** + * Matches the presence of any ICMPv6 neighbor discovery option. + * + * See struct rte_flow_item_icmp6_nd_opt. + */ + RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT, + + /** + * Matches an ICMPv6 neighbor discovery source Ethernet link-layer + * address option. + * + * See struct rte_flow_item_icmp6_nd_opt_sla_eth. + */ + RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_SLA_ETH, + + /** + * Matches an ICMPv6 neighbor discovery target Ethernet link-layer + * address option. + * + * See struct rte_flow_item_icmp6_nd_opt_tla_eth. + */ + RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_TLA_ETH, + + /** + * Matches specified mark field. + * + * See struct rte_flow_item_mark. + */ + RTE_FLOW_ITEM_TYPE_MARK, +}; + +/** + * RTE_FLOW_ITEM_TYPE_ANY + * + * Matches any protocol in place of the current layer, a single ANY may also + * stand for several protocol layers. + * + * This is usually specified as the first pattern item when looking for a + * protocol anywhere in a packet. + * + * A zeroed mask stands for any number of layers. + */ +struct rte_flow_item_any { + uint32_t num; /**< Number of layers covered. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ANY. */ +#ifndef __cplusplus +static const struct rte_flow_item_any rte_flow_item_any_mask = { + .num = 0x00000000, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_VF + * + * Matches traffic originating from (ingress) or going to (egress) a given + * virtual function of the current device. + * + * If supported, should work even if the virtual function is not managed by + * the application and thus not associated with a DPDK port ID. + * + * Note this pattern item does not match VF representors traffic which, as + * separate entities, should be addressed through their own DPDK port IDs. + * + * - Can be specified multiple times to match traffic addressed to several + * VF IDs. + * - Can be combined with a PF item to match both PF and VF traffic. + * + * A zeroed mask can be used to match any VF ID. + */ +struct rte_flow_item_vf { + uint32_t id; /**< VF ID. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_VF. 
*/ +#ifndef __cplusplus +static const struct rte_flow_item_vf rte_flow_item_vf_mask = { + .id = 0x00000000, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_PHY_PORT + * + * Matches traffic originating from (ingress) or going to (egress) a + * physical port of the underlying device. + * + * The first PHY_PORT item overrides the physical port normally associated + * with the specified DPDK input port (port_id). This item can be provided + * several times to match additional physical ports. + * + * Note that physical ports are not necessarily tied to DPDK input ports + * (port_id) when those are not under DPDK control. Possible values are + * specific to each device, they are not necessarily indexed from zero and + * may not be contiguous. + * + * As a device property, the list of allowed values as well as the value + * associated with a port_id should be retrieved by other means. + * + * A zeroed mask can be used to match any port index. + */ +struct rte_flow_item_phy_port { + uint32_t index; /**< Physical port index. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_PHY_PORT. */ +#ifndef __cplusplus +static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = { + .index = 0x00000000, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_PORT_ID + * + * Matches traffic originating from (ingress) or going to (egress) a given + * DPDK port ID. + * + * Normally only supported if the port ID in question is known by the + * underlying PMD and related to the device the flow rule is created + * against. + * + * This must not be confused with @p PHY_PORT which refers to the physical + * port of a device, whereas @p PORT_ID refers to a struct rte_eth_dev + * object on the application side (also known as "port representor" + * depending on the kind of underlying device). + */ +struct rte_flow_item_port_id { + uint32_t id; /**< DPDK port ID. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_PORT_ID. */ +#ifndef __cplusplus +static const struct rte_flow_item_port_id rte_flow_item_port_id_mask = { + .id = 0xffffffff, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_RAW + * + * Matches a byte string of a given length at a given offset. + * + * Offset is either absolute (using the start of the packet) or relative to + * the end of the previous matched item in the stack, in which case negative + * values are allowed. + * + * If search is enabled, offset is used as the starting point. The search + * area can be delimited by setting limit to a nonzero value, which is the + * maximum number of bytes after offset where the pattern may start. + * + * Matching a zero-length pattern is allowed, doing so resets the relative + * offset for subsequent items. + * + * This type does not support ranges (struct rte_flow_item.last). + */ +struct rte_flow_item_raw { + uint32_t relative:1; /**< Look for pattern after the previous item. */ + uint32_t search:1; /**< Search pattern from offset (see also limit). */ + uint32_t reserved:30; /**< Reserved, must be set to zero. */ + int32_t offset; /**< Absolute or relative offset for pattern. */ + uint16_t limit; /**< Search area limit for start of pattern. */ + uint16_t length; /**< Pattern length. */ + const uint8_t *pattern; /**< Byte string to look for. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_RAW. 
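+ *
+ * As an illustration, a specification matching the 3-byte string "foo"
+ * exactly 10 bytes past the end of the previous item could look like
+ * this (a sketch, with a hypothetical pattern buffer):
+ *
+ *     static const uint8_t foo[] = { 'f', 'o', 'o' };
+ *     struct rte_flow_item_raw spec = {
+ *             .relative = 1,
+ *             .offset = 10,
+ *             .length = 3,
+ *             .pattern = foo,
+ *     };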
*/ +#ifndef __cplusplus +static const struct rte_flow_item_raw rte_flow_item_raw_mask = { + .relative = 1, + .search = 1, + .reserved = 0x3fffffff, + .offset = 0xffffffff, + .limit = 0xffff, + .length = 0xffff, + .pattern = NULL, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ETH + * + * Matches an Ethernet header. + * + * The @p type field either stands for "EtherType" or "TPID" when followed + * by so-called layer 2.5 pattern items such as RTE_FLOW_ITEM_TYPE_VLAN. In + * the latter case, @p type refers to that of the outer header, with the + * inner EtherType/TPID provided by the subsequent pattern item. This is the + * same order as on the wire. + */ +struct rte_flow_item_eth { + struct ether_addr dst; /**< Destination MAC. */ + struct ether_addr src; /**< Source MAC. */ + rte_be16_t type; /**< EtherType or TPID. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ETH. */ +#ifndef __cplusplus +static const struct rte_flow_item_eth rte_flow_item_eth_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .type = RTE_BE16(0x0000), +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_VLAN + * + * Matches an 802.1Q/ad VLAN tag. + * + * The corresponding standard outer EtherType (TPID) values are + * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding + * pattern item. + */ +struct rte_flow_item_vlan { + rte_be16_t tci; /**< Tag control information. */ + rte_be16_t inner_type; /**< Inner EtherType or TPID. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */ +#ifndef __cplusplus +static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = { + .tci = RTE_BE16(0x0fff), + .inner_type = RTE_BE16(0x0000), +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_IPV4 + * + * Matches an IPv4 header. + * + * Note: IPv4 options are handled by dedicated pattern items. + */ +struct rte_flow_item_ipv4 { + struct ipv4_hdr hdr; /**< IPv4 header definition. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV4. */ +#ifndef __cplusplus +static const struct rte_flow_item_ipv4 rte_flow_item_ipv4_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + }, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_IPV6. + * + * Matches an IPv6 header. + * + * Note: IPv6 options are handled by dedicated pattern items, see + * RTE_FLOW_ITEM_TYPE_IPV6_EXT. + */ +struct rte_flow_item_ipv6 { + struct ipv6_hdr hdr; /**< IPv6 header definition. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6. */ +#ifndef __cplusplus +static const struct rte_flow_item_ipv6 rte_flow_item_ipv6_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + }, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ICMP. + * + * Matches an ICMP header. + */ +struct rte_flow_item_icmp { + struct icmp_hdr hdr; /**< ICMP header definition. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMP. */ +#ifndef __cplusplus +static const struct rte_flow_item_icmp rte_flow_item_icmp_mask = { + .hdr = { + .icmp_type = 0xff, + .icmp_code = 0xff, + }, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_UDP. + * + * Matches a UDP header. + */ +struct rte_flow_item_udp { + struct udp_hdr hdr; /**< UDP header definition. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_UDP. 
*/ +#ifndef __cplusplus +static const struct rte_flow_item_udp rte_flow_item_udp_mask = { + .hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + }, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_TCP. + * + * Matches a TCP header. + */ +struct rte_flow_item_tcp { + struct tcp_hdr hdr; /**< TCP header definition. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_TCP. */ +#ifndef __cplusplus +static const struct rte_flow_item_tcp rte_flow_item_tcp_mask = { + .hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + }, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_SCTP. + * + * Matches a SCTP header. + */ +struct rte_flow_item_sctp { + struct sctp_hdr hdr; /**< SCTP header definition. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_SCTP. */ +#ifndef __cplusplus +static const struct rte_flow_item_sctp rte_flow_item_sctp_mask = { + .hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + }, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_VXLAN. + * + * Matches a VXLAN header (RFC 7348). + */ +struct rte_flow_item_vxlan { + uint8_t flags; /**< Normally 0x08 (I flag). */ + uint8_t rsvd0[3]; /**< Reserved, normally 0x000000. */ + uint8_t vni[3]; /**< VXLAN identifier. */ + uint8_t rsvd1; /**< Reserved, normally 0x00. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_VXLAN. */ +#ifndef __cplusplus +static const struct rte_flow_item_vxlan rte_flow_item_vxlan_mask = { + .vni = "\xff\xff\xff", +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_E_TAG. + * + * Matches a E-tag header. + * + * The corresponding standard outer EtherType (TPID) value is + * ETHER_TYPE_ETAG. It can be overridden by the preceding pattern item. + */ +struct rte_flow_item_e_tag { + /** + * E-Tag control information (E-TCI). + * E-PCP (3b), E-DEI (1b), ingress E-CID base (12b). + */ + rte_be16_t epcp_edei_in_ecid_b; + /** Reserved (2b), GRP (2b), E-CID base (12b). */ + rte_be16_t rsvd_grp_ecid_b; + uint8_t in_ecid_e; /**< Ingress E-CID ext. */ + uint8_t ecid_e; /**< E-CID ext. */ + rte_be16_t inner_type; /**< Inner EtherType or TPID. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_E_TAG. */ +#ifndef __cplusplus +static const struct rte_flow_item_e_tag rte_flow_item_e_tag_mask = { + .rsvd_grp_ecid_b = RTE_BE16(0x3fff), +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_NVGRE. + * + * Matches a NVGRE header. + */ +struct rte_flow_item_nvgre { + /** + * Checksum (1b), undefined (1b), key bit (1b), sequence number (1b), + * reserved 0 (9b), version (3b). + * + * c_k_s_rsvd0_ver must have value 0x2000 according to RFC 7637. + */ + rte_be16_t c_k_s_rsvd0_ver; + rte_be16_t protocol; /**< Protocol type (0x6558). */ + uint8_t tni[3]; /**< Virtual subnet ID. */ + uint8_t flow_id; /**< Flow ID. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_NVGRE. */ +#ifndef __cplusplus +static const struct rte_flow_item_nvgre rte_flow_item_nvgre_mask = { + .tni = "\xff\xff\xff", +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_MPLS. + * + * Matches a MPLS header. + */ +struct rte_flow_item_mpls { + /** + * Label (20b), TC (3b), Bottom of Stack (1b). + */ + uint8_t label_tc_s[3]; + uint8_t ttl; /** Time-to-Live. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_MPLS. */ +#ifndef __cplusplus +static const struct rte_flow_item_mpls rte_flow_item_mpls_mask = { + .label_tc_s = "\xff\xff\xf0", +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_GRE. + * + * Matches a GRE header. + */ +struct rte_flow_item_gre { + /** + * Checksum (1b), reserved 0 (12b), version (3b). + * Refer to RFC 2784. 
+ */ + rte_be16_t c_rsvd0_ver; + rte_be16_t protocol; /**< Protocol type. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_GRE. */ +#ifndef __cplusplus +static const struct rte_flow_item_gre rte_flow_item_gre_mask = { + .protocol = RTE_BE16(0xffff), +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_FUZZY + * + * Fuzzy pattern match, expect faster than default. + * + * This is for device that support fuzzy match option. + * Usually a fuzzy match is fast but the cost is accuracy. + * i.e. Signature Match only match pattern's hash value, but it is + * possible two different patterns have the same hash value. + * + * Matching accuracy level can be configure by threshold. + * Driver can divide the range of threshold and map to different + * accuracy levels that device support. + * + * Threshold 0 means perfect match (no fuzziness), while threshold + * 0xffffffff means fuzziest match. + */ +struct rte_flow_item_fuzzy { + uint32_t thresh; /**< Accuracy threshold. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_FUZZY. */ +#ifndef __cplusplus +static const struct rte_flow_item_fuzzy rte_flow_item_fuzzy_mask = { + .thresh = 0xffffffff, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_GTP. + * + * Matches a GTPv1 header. + */ +struct rte_flow_item_gtp { + /** + * Version (3b), protocol type (1b), reserved (1b), + * Extension header flag (1b), + * Sequence number flag (1b), + * N-PDU number flag (1b). + */ + uint8_t v_pt_rsv_flags; + uint8_t msg_type; /**< Message type. */ + rte_be16_t msg_len; /**< Message length. */ + rte_be32_t teid; /**< Tunnel endpoint identifier. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_GTP. */ +#ifndef __cplusplus +static const struct rte_flow_item_gtp rte_flow_item_gtp_mask = { + .teid = RTE_BE32(0xffffffff), +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ESP + * + * Matches an ESP header. + */ +struct rte_flow_item_esp { + struct esp_hdr hdr; /**< ESP header definition. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ESP. */ +#ifndef __cplusplus +static const struct rte_flow_item_esp rte_flow_item_esp_mask = { + .hdr = { + .spi = 0xffffffff, + }, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_GENEVE. + * + * Matches a GENEVE header. + */ +struct rte_flow_item_geneve { + /** + * Version (2b), length of the options fields (6b), OAM packet (1b), + * critical options present (1b), reserved 0 (6b). + */ + rte_be16_t ver_opt_len_o_c_rsvd0; + rte_be16_t protocol; /**< Protocol type. */ + uint8_t vni[3]; /**< Virtual Network Identifier. */ + uint8_t rsvd1; /**< Reserved, normally 0x00. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_GENEVE. */ +#ifndef __cplusplus +static const struct rte_flow_item_geneve rte_flow_item_geneve_mask = { + .vni = "\xff\xff\xff", +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_VXLAN_GPE (draft-ietf-nvo3-vxlan-gpe-05). + * + * Matches a VXLAN-GPE header. + */ +struct rte_flow_item_vxlan_gpe { + uint8_t flags; /**< Normally 0x0c (I and P flags). */ + uint8_t rsvd0[2]; /**< Reserved, normally 0x0000. */ + uint8_t protocol; /**< Protocol type. */ + uint8_t vni[3]; /**< VXLAN identifier. */ + uint8_t rsvd1; /**< Reserved, normally 0x00. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_VXLAN_GPE. */ +#ifndef __cplusplus +static const struct rte_flow_item_vxlan_gpe rte_flow_item_vxlan_gpe_mask = { + .vni = "\xff\xff\xff", +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ARP_ETH_IPV4 + * + * Matches an ARP header for Ethernet/IPv4. + */ +struct rte_flow_item_arp_eth_ipv4 { + rte_be16_t hrd; /**< Hardware type, normally 1. */ + rte_be16_t pro; /**< Protocol type, normally 0x0800. 
*/ + uint8_t hln; /**< Hardware address length, normally 6. */ + uint8_t pln; /**< Protocol address length, normally 4. */ + rte_be16_t op; /**< Opcode (1 for request, 2 for reply). */ + struct ether_addr sha; /**< Sender hardware address. */ + rte_be32_t spa; /**< Sender IPv4 address. */ + struct ether_addr tha; /**< Target hardware address. */ + rte_be32_t tpa; /**< Target IPv4 address. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP_ETH_IPV4. */ +#ifndef __cplusplus +static const struct rte_flow_item_arp_eth_ipv4 +rte_flow_item_arp_eth_ipv4_mask = { + .sha.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .spa = RTE_BE32(0xffffffff), + .tha.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .tpa = RTE_BE32(0xffffffff), +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_IPV6_EXT + * + * Matches the presence of any IPv6 extension header. + * + * Normally preceded by any of: + * + * - RTE_FLOW_ITEM_TYPE_IPV6 + * - RTE_FLOW_ITEM_TYPE_IPV6_EXT + */ +struct rte_flow_item_ipv6_ext { + uint8_t next_hdr; /**< Next header. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT. */ +#ifndef __cplusplus +static const +struct rte_flow_item_ipv6_ext rte_flow_item_ipv6_ext_mask = { + .next_hdr = 0xff, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ICMP6 + * + * Matches any ICMPv6 header. + */ +struct rte_flow_item_icmp6 { + uint8_t type; /**< ICMPv6 type. */ + uint8_t code; /**< ICMPv6 code. */ + uint16_t checksum; /**< ICMPv6 checksum. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMP6. */ +#ifndef __cplusplus +static const struct rte_flow_item_icmp6 rte_flow_item_icmp6_mask = { + .type = 0xff, + .code = 0xff, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS + * + * Matches an ICMPv6 neighbor discovery solicitation. + */ +struct rte_flow_item_icmp6_nd_ns { + uint8_t type; /**< ICMPv6 type, normally 135. */ + uint8_t code; /**< ICMPv6 code, normally 0. */ + rte_be16_t checksum; /**< ICMPv6 checksum. */ + rte_be32_t reserved; /**< Reserved, normally 0. */ + uint8_t target_addr[16]; /**< Target address. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS. */ +#ifndef __cplusplus +static const +struct rte_flow_item_icmp6_nd_ns rte_flow_item_icmp6_nd_ns_mask = { + .target_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA + * + * Matches an ICMPv6 neighbor discovery advertisement. + */ +struct rte_flow_item_icmp6_nd_na { + uint8_t type; /**< ICMPv6 type, normally 136. */ + uint8_t code; /**< ICMPv6 code, normally 0. */ + rte_be16_t checksum; /**< ICMPv6 checksum. */ + /** + * Route flag (1b), solicited flag (1b), override flag (1b), + * reserved (29b). + */ + rte_be32_t rso_reserved; + uint8_t target_addr[16]; /**< Target address. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA. */ +#ifndef __cplusplus +static const +struct rte_flow_item_icmp6_nd_na rte_flow_item_icmp6_nd_na_mask = { + .target_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT + * + * Matches the presence of any ICMPv6 neighbor discovery option. + * + * Normally preceded by any of: + * + * - RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA + * - RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS + * - RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT + */ +struct rte_flow_item_icmp6_nd_opt { + uint8_t type; /**< ND option type. */ + uint8_t length; /**< ND option length. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT. 
*/ +#ifndef __cplusplus +static const struct rte_flow_item_icmp6_nd_opt +rte_flow_item_icmp6_nd_opt_mask = { + .type = 0xff, +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_SLA_ETH + * + * Matches an ICMPv6 neighbor discovery source Ethernet link-layer address + * option. + * + * Normally preceded by any of: + * + * - RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA + * - RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT + */ +struct rte_flow_item_icmp6_nd_opt_sla_eth { + uint8_t type; /**< ND option type, normally 1. */ + uint8_t length; /**< ND option length, normally 1. */ + struct ether_addr sla; /**< Source Ethernet LLA. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_SLA_ETH. */ +#ifndef __cplusplus +static const struct rte_flow_item_icmp6_nd_opt_sla_eth +rte_flow_item_icmp6_nd_opt_sla_eth_mask = { + .sla.addr_bytes = "\xff\xff\xff\xff\xff\xff", +}; +#endif + +/** + * RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_TLA_ETH + * + * Matches an ICMPv6 neighbor discovery target Ethernet link-layer address + * option. + * + * Normally preceded by any of: + * + * - RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS + * - RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT + */ +struct rte_flow_item_icmp6_nd_opt_tla_eth { + uint8_t type; /**< ND option type, normally 2. */ + uint8_t length; /**< ND option length, normally 1. */ + struct ether_addr tla; /**< Target Ethernet LLA. */ +}; + +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_TLA_ETH. */ +#ifndef __cplusplus +static const struct rte_flow_item_icmp6_nd_opt_tla_eth +rte_flow_item_icmp6_nd_opt_tla_eth_mask = { + .tla.addr_bytes = "\xff\xff\xff\xff\xff\xff", +}; +#endif + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * RTE_FLOW_ITEM_TYPE_MARK + * + * Matches an arbitrary integer value which was set using the ``MARK`` action + * in a previously matched rule. + * + * This item can only be specified once as a match criteria as the ``MARK`` + * action can only be specified once in a flow action. + * + * This value is arbitrary and application-defined. Maximum allowed value + * depends on the underlying implementation. + * + * Depending on the underlying implementation the MARK item may be supported on + * the physical device, with virtual groups in the PMD or not at all. + */ +struct rte_flow_item_mark { + uint32_t id; /**< Integer value to match against. */ +}; + +/** + * Matching pattern item definition. + * + * A pattern is formed by stacking items starting from the lowest protocol + * layer to match. This stacking restriction does not apply to meta items + * which can be placed anywhere in the stack without affecting the meaning + * of the resulting pattern. + * + * Patterns are terminated by END items. + * + * The spec field should be a valid pointer to a structure of the related + * item type. It may remain unspecified (NULL) in many cases to request + * broad (nonspecific) matching. In such cases, last and mask must also be + * set to NULL. + * + * Optionally, last can point to a structure of the same type to define an + * inclusive range. This is mostly supported by integer and address fields, + * may cause errors otherwise. Fields that do not support ranges must be set + * to 0 or to the same value as the corresponding fields in spec. + * + * Only the fields defined to nonzero values in the default masks (see + * rte_flow_item_{name}_mask constants) are considered relevant by + * default. This can be overridden by providing a mask structure of the + * same type with applicable bits set to one. 
It can also be used to + * partially filter out specific fields (e.g. as an alternate mean to match + * ranges of IP addresses). + * + * Mask is a simple bit-mask applied before interpreting the contents of + * spec and last, which may yield unexpected results if not used + * carefully. For example, if for an IPv4 address field, spec provides + * 10.1.2.3, last provides 10.3.4.5 and mask provides 255.255.0.0, the + * effective range becomes 10.1.0.0 to 10.3.255.255. + */ +struct rte_flow_item { + enum rte_flow_item_type type; /**< Item type. */ + const void *spec; /**< Pointer to item specification structure. */ + const void *last; /**< Defines an inclusive range (spec to last). */ + const void *mask; /**< Bit-mask applied to spec and last. */ +}; + +/** + * Action types. + * + * Each possible action is represented by a type. Some have associated + * configuration structures. Several actions combined in a list can be + * assigned to a flow rule and are performed in order. + * + * They fall in three categories: + * + * - Actions that modify the fate of matching traffic, for instance by + * dropping or assigning it a specific destination. + * + * - Actions that modify matching traffic contents or its properties. This + * includes adding/removing encapsulation, encryption, compression and + * marks. + * + * - Actions related to the flow rule itself, such as updating counters or + * making it non-terminating. + * + * Flow rules being terminating by default, not specifying any action of the + * fate kind results in undefined behavior. This applies to both ingress and + * egress. + * + * PASSTHRU, when supported, makes a flow rule non-terminating. + */ +enum rte_flow_action_type { + /** + * End marker for action lists. Prevents further processing of + * actions, thereby ending the list. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_END, + + /** + * Used as a placeholder for convenience. It is ignored and simply + * discarded by PMDs. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_VOID, + + /** + * Leaves traffic up for additional processing by subsequent flow + * rules; makes a flow rule non-terminating. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_PASSTHRU, + + /** + * RTE_FLOW_ACTION_TYPE_JUMP + * + * Redirects packets to a group on the current device. + * + * See struct rte_flow_action_jump. + */ + RTE_FLOW_ACTION_TYPE_JUMP, + + /** + * Attaches an integer value to packets and sets PKT_RX_FDIR and + * PKT_RX_FDIR_ID mbuf flags. + * + * See struct rte_flow_action_mark. + */ + RTE_FLOW_ACTION_TYPE_MARK, + + /** + * Flags packets. Similar to MARK without a specific value; only + * sets the PKT_RX_FDIR mbuf flag. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_FLAG, + + /** + * Assigns packets to a given queue index. + * + * See struct rte_flow_action_queue. + */ + RTE_FLOW_ACTION_TYPE_QUEUE, + + /** + * Drops packets. + * + * PASSTHRU overrides this action if both are specified. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_DROP, + + /** + * Enables counters for this flow rule. + * + * These counters can be retrieved and reset through rte_flow_query(), + * see struct rte_flow_query_count. + * + * See struct rte_flow_action_count. + */ + RTE_FLOW_ACTION_TYPE_COUNT, + + /** + * Similar to QUEUE, except RSS is additionally performed on packets + * to spread them among several queues according to the provided + * parameters. 
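+ *
+ * For instance, spreading matched IP and TCP traffic over queues 0-3
+ * with the PMD's default hash key could be configured as (a sketch):
+ *
+ *     static const uint16_t queues[] = { 0, 1, 2, 3 };
+ *     struct rte_flow_action_rss rss = {
+ *             .func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+ *             .types = ETH_RSS_IP | ETH_RSS_TCP,
+ *             .queue_num = 4,
+ *             .queue = queues,
+ *     };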
+ * + * See struct rte_flow_action_rss. + */ + RTE_FLOW_ACTION_TYPE_RSS, + + /** + * Directs matching traffic to the physical function (PF) of the + * current device. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_PF, + + /** + * Directs matching traffic to a given virtual function of the + * current device. + * + * See struct rte_flow_action_vf. + */ + RTE_FLOW_ACTION_TYPE_VF, + + /** + * Directs packets to a given physical port index of the underlying + * device. + * + * See struct rte_flow_action_phy_port. + */ + RTE_FLOW_ACTION_TYPE_PHY_PORT, + + /** + * Directs matching traffic to a given DPDK port ID. + * + * See struct rte_flow_action_port_id. + */ + RTE_FLOW_ACTION_TYPE_PORT_ID, + + /** + * Traffic metering and policing (MTR). + * + * See struct rte_flow_action_meter. + * See file rte_mtr.h for MTR object configuration. + */ + RTE_FLOW_ACTION_TYPE_METER, + + /** + * Redirects packets to security engine of current device for security + * processing as specified by security session. + * + * See struct rte_flow_action_security. + */ + RTE_FLOW_ACTION_TYPE_SECURITY, + + /** + * Implements OFPAT_SET_MPLS_TTL ("MPLS TTL") as defined by the + * OpenFlow Switch Specification. + * + * See struct rte_flow_action_of_set_mpls_ttl. + */ + RTE_FLOW_ACTION_TYPE_OF_SET_MPLS_TTL, + + /** + * Implements OFPAT_DEC_MPLS_TTL ("decrement MPLS TTL") as defined + * by the OpenFlow Switch Specification. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_OF_DEC_MPLS_TTL, + + /** + * Implements OFPAT_SET_NW_TTL ("IP TTL") as defined by the OpenFlow + * Switch Specification. + * + * See struct rte_flow_action_of_set_nw_ttl. + */ + RTE_FLOW_ACTION_TYPE_OF_SET_NW_TTL, + + /** + * Implements OFPAT_DEC_NW_TTL ("decrement IP TTL") as defined by + * the OpenFlow Switch Specification. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_OF_DEC_NW_TTL, + + /** + * Implements OFPAT_COPY_TTL_OUT ("copy TTL "outwards" -- from + * next-to-outermost to outermost") as defined by the OpenFlow + * Switch Specification. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_OF_COPY_TTL_OUT, + + /** + * Implements OFPAT_COPY_TTL_IN ("copy TTL "inwards" -- from + * outermost to next-to-outermost") as defined by the OpenFlow + * Switch Specification. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_OF_COPY_TTL_IN, + + /** + * Implements OFPAT_POP_VLAN ("pop the outer VLAN tag") as defined + * by the OpenFlow Switch Specification. + * + * No associated configuration structure. + */ + RTE_FLOW_ACTION_TYPE_OF_POP_VLAN, + + /** + * Implements OFPAT_PUSH_VLAN ("push a new VLAN tag") as defined by + * the OpenFlow Switch Specification. + * + * See struct rte_flow_action_of_push_vlan. + */ + RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN, + + /** + * Implements OFPAT_SET_VLAN_VID ("set the 802.1q VLAN id") as + * defined by the OpenFlow Switch Specification. + * + * See struct rte_flow_action_of_set_vlan_vid. + */ + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID, + + /** + * Implements OFPAT_SET_LAN_PCP ("set the 802.1q priority") as + * defined by the OpenFlow Switch Specification. + * + * See struct rte_flow_action_of_set_vlan_pcp. + */ + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP, + + /** + * Implements OFPAT_POP_MPLS ("pop the outer MPLS tag") as defined + * by the OpenFlow Switch Specification. + * + * See struct rte_flow_action_of_pop_mpls. 
+ */ + RTE_FLOW_ACTION_TYPE_OF_POP_MPLS, + + /** + * Implements OFPAT_PUSH_MPLS ("push a new MPLS tag") as defined by + * the OpenFlow Switch Specification. + * + * See struct rte_flow_action_of_push_mpls. + */ + RTE_FLOW_ACTION_TYPE_OF_PUSH_MPLS, + + /** + * Encapsulate flow in VXLAN tunnel as defined in + * rte_flow_action_vxlan_encap action structure. + * + * See struct rte_flow_action_vxlan_encap. + */ + RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP, + + /** + * Decapsulate outer most VXLAN tunnel from matched flow. + * + * If flow pattern does not define a valid VXLAN tunnel (as specified by + * RFC7348) then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION + * error. + */ + RTE_FLOW_ACTION_TYPE_VXLAN_DECAP, + + /** + * Encapsulate flow in NVGRE tunnel defined in the + * rte_flow_action_nvgre_encap action structure. + * + * See struct rte_flow_action_nvgre_encap. + */ + RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP, + + /** + * Decapsulate outer most NVGRE tunnel from matched flow. + * + * If flow pattern does not define a valid NVGRE tunnel (as specified by + * RFC7637) then the PMD should return a RTE_FLOW_ERROR_TYPE_ACTION + * error. + */ + RTE_FLOW_ACTION_TYPE_NVGRE_DECAP, +}; + +/** + * RTE_FLOW_ACTION_TYPE_MARK + * + * Attaches an integer value to packets and sets PKT_RX_FDIR and + * PKT_RX_FDIR_ID mbuf flags. + * + * This value is arbitrary and application-defined. Maximum allowed value + * depends on the underlying implementation. It is returned in the + * hash.fdir.hi mbuf field. + */ +struct rte_flow_action_mark { + uint32_t id; /**< Integer value to return with packets. */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * RTE_FLOW_ACTION_TYPE_JUMP + * + * Redirects packets to a group on the current device. + * + * In a hierarchy of groups, which can be used to represent physical or logical + * flow tables on the device, this action allows the action to be a redirect to + * a group on that device. + */ +struct rte_flow_action_jump { + uint32_t group; +}; + +/** + * RTE_FLOW_ACTION_TYPE_QUEUE + * + * Assign packets to a given queue index. + */ +struct rte_flow_action_queue { + uint16_t index; /**< Queue index to use. */ +}; + + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * RTE_FLOW_ACTION_TYPE_COUNT + * + * Adds a counter action to a matched flow. + * + * If more than one count action is specified in a single flow rule, then each + * action must specify a unique id. + * + * Counters can be retrieved and reset through ``rte_flow_query()``, see + * ``struct rte_flow_query_count``. + * + * The shared flag indicates whether the counter is unique to the flow rule the + * action is specified with, or whether it is a shared counter. + * + * For a count action with the shared flag set, then then a global device + * namespace is assumed for the counter id, so that any matched flow rules using + * a count action with the same counter id on the same port will contribute to + * that counter. + * + * For ports within the same switch domain then the counter id namespace extends + * to all ports within that switch domain. + */ +struct rte_flow_action_count { + uint32_t shared:1; /**< Share counter ID with other flow rules. */ + uint32_t reserved:31; /**< Reserved, must be zero. */ + uint32_t id; /**< Counter ID. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_COUNT (query) + * + * Query structure to retrieve and reset flow rule counters. 
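+ *
+ * A minimal query sketch, assuming "flow" was created on port "port_id"
+ * with the COUNT action "count_action" in its action list:
+ *
+ *     struct rte_flow_query_count stats = { .reset = 1 };
+ *     struct rte_flow_error error;
+ *
+ *     if (!rte_flow_query(port_id, flow, &count_action, &stats, &error) &&
+ *         stats.hits_set)
+ *             ... stats.hits packets have matched since the last reset ...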
+ */ +struct rte_flow_query_count { + uint32_t reset:1; /**< Reset counters after query [in]. */ + uint32_t hits_set:1; /**< hits field is set [out]. */ + uint32_t bytes_set:1; /**< bytes field is set [out]. */ + uint32_t reserved:29; /**< Reserved, must be zero [in, out]. */ + uint64_t hits; /**< Number of hits for this rule [out]. */ + uint64_t bytes; /**< Number of bytes through this rule [out]. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_RSS + * + * Similar to QUEUE, except RSS is additionally performed on packets to + * spread them among several queues according to the provided parameters. + * + * Unlike global RSS settings used by other DPDK APIs, unsetting the + * @p types field does not disable RSS in a flow rule. Doing so instead + * requests safe unspecified "best-effort" settings from the underlying PMD, + * which depending on the flow rule, may result in anything ranging from + * empty (single queue) to all-inclusive RSS. + * + * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps + * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only, + * both can be requested simultaneously. + */ +struct rte_flow_action_rss { + enum rte_eth_hash_function func; /**< RSS hash function to apply. */ + /** + * Packet encapsulation level RSS hash @p types apply to. + * + * - @p 0 requests the default behavior. Depending on the packet + * type, it can mean outermost, innermost, anything in between or + * even no RSS. + * + * It basically stands for the innermost encapsulation level RSS + * can be performed on according to PMD and device capabilities. + * + * - @p 1 requests RSS to be performed on the outermost packet + * encapsulation level. + * + * - @p 2 and subsequent values request RSS to be performed on the + * specified inner packet encapsulation level, from outermost to + * innermost (lower to higher values). + * + * Values other than @p 0 are not necessarily supported. + * + * Requesting a specific RSS level on unrecognized traffic results + * in undefined behavior. For predictable results, it is recommended + * to make the flow rule pattern match packet headers up to the + * requested encapsulation level so that only matching traffic goes + * through. + */ + uint32_t level; + uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */ + uint32_t key_len; /**< Hash key length in bytes. */ + uint32_t queue_num; /**< Number of entries in @p queue. */ + const uint8_t *key; /**< Hash key. */ + const uint16_t *queue; /**< Queue indices to use. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_VF + * + * Directs matching traffic to a given virtual function of the current + * device. + * + * Packets matched by a VF pattern item can be redirected to their original + * VF ID instead of the specified one. This parameter may not be available + * and is not guaranteed to work properly if the VF part is matched by a + * prior flow rule or if packets are not addressed to a VF in the first + * place. + */ +struct rte_flow_action_vf { + uint32_t original:1; /**< Use original VF ID if possible. */ + uint32_t reserved:31; /**< Reserved, must be zero. */ + uint32_t id; /**< VF ID. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_PHY_PORT + * + * Directs packets to a given physical port index of the underlying + * device. + * + * @see RTE_FLOW_ITEM_TYPE_PHY_PORT + */ +struct rte_flow_action_phy_port { + uint32_t original:1; /**< Use original port index if possible. */ + uint32_t reserved:31; /**< Reserved, must be zero. */ + uint32_t index; /**< Physical port index. 
*/ +}; + +/** + * RTE_FLOW_ACTION_TYPE_PORT_ID + * + * Directs matching traffic to a given DPDK port ID. + * + * @see RTE_FLOW_ITEM_TYPE_PORT_ID + */ +struct rte_flow_action_port_id { + uint32_t original:1; /**< Use original DPDK port ID if possible. */ + uint32_t reserved:31; /**< Reserved, must be zero. */ + uint32_t id; /**< DPDK port ID. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_METER + * + * Traffic metering and policing (MTR). + * + * Packets matched by items of this type can be either dropped or passed to the + * next item with their color set by the MTR object. + */ +struct rte_flow_action_meter { + uint32_t mtr_id; /**< MTR object ID created with rte_mtr_create(). */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_SECURITY + * + * Perform the security action on flows matched by the pattern items + * according to the configuration of the security session. + * + * This action modifies the payload of matched flows. For INLINE_CRYPTO, the + * security protocol headers and IV are fully provided by the application as + * specified in the flow pattern. The payload of matching packets is + * encrypted on egress, and decrypted and authenticated on ingress. + * For INLINE_PROTOCOL, the security protocol is fully offloaded to HW, + * providing full encapsulation and decapsulation of packets in security + * protocols. The flow pattern specifies both the outer security header fields + * and the inner packet fields. The security session specified in the action + * must match the pattern parameters. + * + * The security session specified in the action must be created on the same + * port as the flow action that is being specified. + * + * The ingress/egress flow attribute should match that specified in the + * security session if the security session supports the definition of the + * direction. + * + * Multiple flows can be configured to use the same security session. + */ +struct rte_flow_action_security { + void *security_session; /**< Pointer to security session structure. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_OF_SET_MPLS_TTL + * + * Implements OFPAT_SET_MPLS_TTL ("MPLS TTL") as defined by the OpenFlow + * Switch Specification. + */ +struct rte_flow_action_of_set_mpls_ttl { + uint8_t mpls_ttl; /**< MPLS TTL. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_OF_SET_NW_TTL + * + * Implements OFPAT_SET_NW_TTL ("IP TTL") as defined by the OpenFlow Switch + * Specification. + */ +struct rte_flow_action_of_set_nw_ttl { + uint8_t nw_ttl; /**< IP TTL. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN + * + * Implements OFPAT_PUSH_VLAN ("push a new VLAN tag") as defined by the + * OpenFlow Switch Specification. + */ +struct rte_flow_action_of_push_vlan { + rte_be16_t ethertype; /**< EtherType. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID + * + * Implements OFPAT_SET_VLAN_VID ("set the 802.1q VLAN id") as defined by + * the OpenFlow Switch Specification. + */ +struct rte_flow_action_of_set_vlan_vid { + rte_be16_t vlan_vid; /**< VLAN id. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP + * + * Implements OFPAT_SET_LAN_PCP ("set the 802.1q priority") as defined by + * the OpenFlow Switch Specification. + */ +struct rte_flow_action_of_set_vlan_pcp { + uint8_t vlan_pcp; /**< VLAN priority. */ +}; + +/** + * RTE_FLOW_ACTION_TYPE_OF_POP_MPLS + * + * Implements OFPAT_POP_MPLS ("pop the outer MPLS tag") as defined by the + * OpenFlow Switch Specification. + */ +struct rte_flow_action_of_pop_mpls { + rte_be16_t ethertype; /**< EtherType. 
 */
+};
+
+/**
+ * RTE_FLOW_ACTION_TYPE_OF_PUSH_MPLS
+ *
+ * Implements OFPAT_PUSH_MPLS ("push a new MPLS tag") as defined by the
+ * OpenFlow Switch Specification.
+ */
+struct rte_flow_action_of_push_mpls {
+ rte_be16_t ethertype; /**< EtherType. */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP
+ *
+ * VXLAN tunnel end-point encapsulation data definition
+ *
+ * The tunnel definition is provided through the flow item pattern; the
+ * provided pattern must conform to RFC7348 for the tunnel specified. The flow
+ * definition must be provided in order from the RTE_FLOW_ITEM_TYPE_ETH
+ * definition up to the end item, which is specified by RTE_FLOW_ITEM_TYPE_END.
+ *
+ * The mask field allows the user to specify which fields in the flow item
+ * definitions can be ignored and which have valid data and can be used
+ * verbatim.
+ *
+ * Note: the last field is not used in the definition of a tunnel and can be
+ * ignored.
+ *
+ * Valid flow definitions for RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP include:
+ *
+ * - ETH / IPV4 / UDP / VXLAN / END
+ * - ETH / IPV6 / UDP / VXLAN / END
+ * - ETH / VLAN / IPV4 / UDP / VXLAN / END
+ *
+ */
+struct rte_flow_action_vxlan_encap {
+ /**
+ * Encapsulating VXLAN tunnel definition
+ * (terminated by the END pattern item).
+ */
+ struct rte_flow_item *definition;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP
+ *
+ * NVGRE tunnel end-point encapsulation data definition
+ *
+ * The tunnel definition is provided through the flow item pattern; the
+ * provided pattern must conform to RFC7637. The flow definition must be
+ * provided in order from the RTE_FLOW_ITEM_TYPE_ETH definition up to the end
+ * item, which is specified by RTE_FLOW_ITEM_TYPE_END.
+ *
+ * The mask field allows the user to specify which fields in the flow item
+ * definitions can be ignored and which have valid data and can be used
+ * verbatim.
+ *
+ * Note: the last field is not used in the definition of a tunnel and can be
+ * ignored.
+ *
+ * Valid flow definitions for RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP include:
+ *
+ * - ETH / IPV4 / NVGRE / END
+ * - ETH / VLAN / IPV6 / NVGRE / END
+ *
+ */
+struct rte_flow_action_nvgre_encap {
+ /**
+ * Encapsulating NVGRE tunnel definition
+ * (terminated by the END pattern item).
+ */
+ struct rte_flow_item *definition;
+};
+
+/**
+ * Definition of a single action.
+ *
+ * A list of actions is terminated by an END action.
+ *
+ * For simple actions without a configuration structure, conf remains NULL.
+ */
+struct rte_flow_action {
+ enum rte_flow_action_type type; /**< Action type. */
+ const void *conf; /**< Pointer to action configuration structure. */
+};
+
+/**
+ * Opaque type returned after successfully creating a flow.
+ *
+ * This handle can be used to manage and query the related flow (e.g. to
+ * destroy it or retrieve counters).
+ */
+struct rte_flow;
+
+/**
+ * Verbose error types.
+ *
+ * Most of them provide the type of the object referenced by struct
+ * rte_flow_error.cause.
+ */
+enum rte_flow_error_type {
+ RTE_FLOW_ERROR_TYPE_NONE, /**< No error. */
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, /**< Cause unspecified. */
+ RTE_FLOW_ERROR_TYPE_HANDLE, /**< Flow rule (handle). */
+ RTE_FLOW_ERROR_TYPE_ATTR_GROUP, /**< Group field. */
+ RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, /**< Priority field. */
+ RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, /**< Ingress field.
*/ + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */ + RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, /**< Transfer field. */ + RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */ + RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */ + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */ + RTE_FLOW_ERROR_TYPE_ITEM_LAST, /**< Item specification range. */ + RTE_FLOW_ERROR_TYPE_ITEM_MASK, /**< Item specification mask. */ + RTE_FLOW_ERROR_TYPE_ITEM, /**< Specific pattern item. */ + RTE_FLOW_ERROR_TYPE_ACTION_NUM, /**< Number of actions. */ + RTE_FLOW_ERROR_TYPE_ACTION_CONF, /**< Action configuration. */ + RTE_FLOW_ERROR_TYPE_ACTION, /**< Specific action. */ +}; + +/** + * Verbose error structure definition. + * + * This object is normally allocated by applications and set by PMDs, the + * message points to a constant string which does not need to be freed by + * the application, however its pointer can be considered valid only as long + * as its associated DPDK port remains configured. Closing the underlying + * device or unloading the PMD invalidates it. + * + * Both cause and message may be NULL regardless of the error type. + */ +struct rte_flow_error { + enum rte_flow_error_type type; /**< Cause field and error types. */ + const void *cause; /**< Object responsible for the error. */ + const char *message; /**< Human-readable error message. */ +}; + +/** + * Check whether a flow rule can be created on a given port. + * + * The flow rule is validated for correctness and whether it could be accepted + * by the device given sufficient resources. The rule is checked against the + * current device mode and queue configuration. The flow rule may also + * optionally be validated against existing flow rules and device resources. + * This function has no effect on the target device. + * + * The returned value is guaranteed to remain valid only as long as no + * successful calls to rte_flow_create() or rte_flow_destroy() are made in + * the meantime and no device parameter affecting flow rules in any way are + * modified, due to possible collisions or resource limitations (although in + * such cases EINVAL should not be returned). + * + * @param port_id + * Port identifier of Ethernet device. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] error + * Perform verbose error reporting if not NULL. PMDs initialize this + * structure in case of error only. + * + * @return + * 0 if flow rule is valid and can be created. A negative errno value + * otherwise (rte_errno is also set), the following errors are defined: + * + * -ENOSYS: underlying device does not support this functionality. + * + * -EIO: underlying device is removed. + * + * -EINVAL: unknown or invalid rule specification. + * + * -ENOTSUP: valid but unsupported rule specification (e.g. partial + * bit-masks are unsupported). + * + * -EEXIST: collision with an existing rule. Only returned if device + * supports flow rule collision checking and there was a flow rule + * collision. Not receiving this return code is no guarantee that creating + * the rule will not fail due to a collision. + * + * -ENOMEM: not enough memory to execute the function, or if the device + * supports resource validation, resource limitation on the device. 
+ * + * -EBUSY: action cannot be performed due to busy device resources, may + * succeed if the affected queues or even the entire port are in a stopped + * state (see rte_eth_dev_rx_queue_stop() and rte_eth_dev_stop()). + */ +int +rte_flow_validate(uint16_t port_id, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); + +/** + * Create a flow rule on a given port. + * + * @param port_id + * Port identifier of Ethernet device. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] error + * Perform verbose error reporting if not NULL. PMDs initialize this + * structure in case of error only. + * + * @return + * A valid handle in case of success, NULL otherwise and rte_errno is set + * to the positive version of one of the error codes defined for + * rte_flow_validate(). + */ +struct rte_flow * +rte_flow_create(uint16_t port_id, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); + +/** + * Destroy a flow rule on a given port. + * + * Failure to destroy a flow rule handle may occur when other flow rules + * depend on it, and destroying it would result in an inconsistent state. + * + * This function is only guaranteed to succeed if handles are destroyed in + * reverse order of their creation. + * + * @param port_id + * Port identifier of Ethernet device. + * @param flow + * Flow rule handle to destroy. + * @param[out] error + * Perform verbose error reporting if not NULL. PMDs initialize this + * structure in case of error only. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +rte_flow_destroy(uint16_t port_id, + struct rte_flow *flow, + struct rte_flow_error *error); + +/** + * Destroy all flow rules associated with a port. + * + * In the unlikely event of failure, handles are still considered destroyed + * and no longer valid but the port must be assumed to be in an inconsistent + * state. + * + * @param port_id + * Port identifier of Ethernet device. + * @param[out] error + * Perform verbose error reporting if not NULL. PMDs initialize this + * structure in case of error only. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +rte_flow_flush(uint16_t port_id, + struct rte_flow_error *error); + +/** + * Query an existing flow rule. + * + * This function allows retrieving flow-specific data such as counters. + * Data is gathered by special actions which must be present in the flow + * rule definition. + * + * \see RTE_FLOW_ACTION_TYPE_COUNT + * + * @param port_id + * Port identifier of Ethernet device. + * @param flow + * Flow rule handle to query. + * @param action + * Action definition as defined in original flow rule. + * @param[in, out] data + * Pointer to storage for the associated query data type. + * @param[out] error + * Perform verbose error reporting if not NULL. PMDs initialize this + * structure in case of error only. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
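+ *
+ * To show how the validate/create/destroy calls above fit together (a
+ * minimal sketch, not a complete program: port_id is assumed to be an
+ * already configured Ethernet port, the empty ETH/IPV4 items match any
+ * IPv4 packet, and the mark value and queue index are arbitrary):
+ *
+ * \code
+ * struct rte_flow_attr attr = { .ingress = 1 };
+ * struct rte_flow_item pattern[] = {
+ *     { .type = RTE_FLOW_ITEM_TYPE_ETH },  // no spec/mask: match anything
+ *     { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
+ *     { .type = RTE_FLOW_ITEM_TYPE_END },
+ * };
+ * struct rte_flow_action_mark mark = { .id = 0x2a };
+ * struct rte_flow_action_queue queue = { .index = 1 };
+ * struct rte_flow_action actions[] = {
+ *     { .type = RTE_FLOW_ACTION_TYPE_MARK, .conf = &mark },
+ *     { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
+ *     { .type = RTE_FLOW_ACTION_TYPE_END },
+ * };
+ * struct rte_flow_error error;
+ * struct rte_flow *flow = NULL;
+ *
+ * if (rte_flow_validate(port_id, &attr, pattern, actions, &error) == 0)
+ *     flow = rte_flow_create(port_id, &attr, pattern, actions, &error);
+ * // ... traffic processing, possibly rte_flow_query() on counter actions ...
+ * if (flow != NULL)
+ *     rte_flow_destroy(port_id, flow, &error);
+ * \endcode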
+ */ +int +rte_flow_query(uint16_t port_id, + struct rte_flow *flow, + const struct rte_flow_action *action, + void *data, + struct rte_flow_error *error); + +/** + * Restrict ingress traffic to the defined flow rules. + * + * Isolated mode guarantees that all ingress traffic comes from defined flow + * rules only (current and future). + * + * Besides making ingress more deterministic, it allows PMDs to safely reuse + * resources otherwise assigned to handle the remaining traffic, such as + * global RSS configuration settings, VLAN filters, MAC address entries, + * legacy filter API rules and so on in order to expand the set of possible + * flow rule types. + * + * Calling this function as soon as possible after device initialization, + * ideally before the first call to rte_eth_dev_configure(), is recommended + * to avoid possible failures due to conflicting settings. + * + * Once effective, leaving isolated mode may not be possible depending on + * PMD implementation. + * + * Additionally, the following functionality has no effect on the underlying + * port and may return errors such as ENOTSUP ("not supported"): + * + * - Toggling promiscuous mode. + * - Toggling allmulticast mode. + * - Configuring MAC addresses. + * - Configuring multicast addresses. + * - Configuring VLAN filters. + * - Configuring Rx filters through the legacy API (e.g. FDIR). + * - Configuring global RSS settings. + * + * @param port_id + * Port identifier of Ethernet device. + * @param set + * Nonzero to enter isolated mode, attempt to leave it otherwise. + * @param[out] error + * Perform verbose error reporting if not NULL. PMDs initialize this + * structure in case of error only. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +rte_flow_isolate(uint16_t port_id, int set, struct rte_flow_error *error); + +/** + * Initialize flow error structure. + * + * @param[out] error + * Pointer to flow error structure (may be NULL). + * @param code + * Related error code (rte_errno). + * @param type + * Cause field and error types. + * @param cause + * Object responsible for the error. + * @param message + * Human-readable error message. + * + * @return + * Negative error code (errno value) and rte_errno is set. + */ +int +rte_flow_error_set(struct rte_flow_error *error, + int code, + enum rte_flow_error_type type, + const void *cause, + const char *message); + +/** + * Generic flow representation. + * + * This form is sufficient to describe an rte_flow independently from any + * PMD implementation and allows for replayability and identification. + */ +struct rte_flow_desc { + size_t size; /**< Allocated space including data[]. */ + struct rte_flow_attr attr; /**< Attributes. */ + struct rte_flow_item *items; /**< Items. */ + struct rte_flow_action *actions; /**< Actions. */ + uint8_t data[]; /**< Storage for items/actions. */ +}; + +/** + * Copy an rte_flow rule description. + * + * @param[in] fd + * Flow rule description. + * @param[in] len + * Total size of allocated data for the flow description. + * @param[in] attr + * Flow rule attributes. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * + * @return + * If len is greater or equal to the size of the flow, the total size of the + * flow description and its data. + * If len is lower than the size of the flow, the number of bytes that would + * have been written to desc had it been sufficient. 
Nothing is written. + */ +size_t +rte_flow_copy(struct rte_flow_desc *fd, size_t len, + const struct rte_flow_attr *attr, + const struct rte_flow_item *items, + const struct rte_flow_action *actions); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_FLOW_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_flow_driver.h b/src/spdk/dpdk/lib/librte_ethdev/rte_flow_driver.h new file mode 100644 index 00000000..688f7230 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_flow_driver.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd + */ + +#ifndef RTE_FLOW_DRIVER_H_ +#define RTE_FLOW_DRIVER_H_ + +/** + * @file + * RTE generic flow API (driver side) + * + * This file provides implementation helpers for internal use by PMDs, they + * are not intended to be exposed to applications and are not subject to ABI + * versioning. + */ + +#include + +#include "rte_ethdev.h" +#include "rte_flow.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Generic flow operations structure implemented and returned by PMDs. + * + * To implement this API, PMDs must handle the RTE_ETH_FILTER_GENERIC filter + * type in their .filter_ctrl callback function (struct eth_dev_ops) as well + * as the RTE_ETH_FILTER_GET filter operation. + * + * If successful, this operation must result in a pointer to a PMD-specific + * struct rte_flow_ops written to the argument address as described below: + * + * \code + * + * // PMD filter_ctrl callback + * + * static const struct rte_flow_ops pmd_flow_ops = { ... }; + * + * switch (filter_type) { + * case RTE_ETH_FILTER_GENERIC: + * if (filter_op != RTE_ETH_FILTER_GET) + * return -EINVAL; + * *(const void **)arg = &pmd_flow_ops; + * return 0; + * } + * + * \endcode + * + * See also rte_flow_ops_get(). + * + * These callback functions are not supposed to be used by applications + * directly, which must rely on the API defined in rte_flow.h. + * + * Public-facing wrapper functions perform a few consistency checks so that + * unimplemented (i.e. NULL) callbacks simply return -ENOTSUP. These + * callbacks otherwise only differ by their first argument (with port ID + * already resolved to a pointer to struct rte_eth_dev). + */ +struct rte_flow_ops { + /** See rte_flow_validate(). */ + int (*validate) + (struct rte_eth_dev *, + const struct rte_flow_attr *, + const struct rte_flow_item [], + const struct rte_flow_action [], + struct rte_flow_error *); + /** See rte_flow_create(). */ + struct rte_flow *(*create) + (struct rte_eth_dev *, + const struct rte_flow_attr *, + const struct rte_flow_item [], + const struct rte_flow_action [], + struct rte_flow_error *); + /** See rte_flow_destroy(). */ + int (*destroy) + (struct rte_eth_dev *, + struct rte_flow *, + struct rte_flow_error *); + /** See rte_flow_flush(). */ + int (*flush) + (struct rte_eth_dev *, + struct rte_flow_error *); + /** See rte_flow_query(). */ + int (*query) + (struct rte_eth_dev *, + struct rte_flow *, + const struct rte_flow_action *, + void *, + struct rte_flow_error *); + /** See rte_flow_isolate(). */ + int (*isolate) + (struct rte_eth_dev *, + int, + struct rte_flow_error *); +}; + +/** + * Get generic flow operations structure from a port. + * + * @param port_id + * Port identifier to query. + * @param[out] error + * Pointer to flow error structure. 
+ * + * @return + * The flow operations structure associated with port_id, NULL in case of + * error, in which case rte_errno is set and the error structure contains + * additional details. + */ +const struct rte_flow_ops * +rte_flow_ops_get(uint16_t port_id, struct rte_flow_error *error); + +/** Helper macro to build input graph for rte_flow_expand_rss(). */ +#define RTE_FLOW_EXPAND_RSS_NEXT(...) \ + (const int []){ \ + __VA_ARGS__, 0, \ + } + +/** Node object of input graph for rte_flow_expand_rss(). */ +struct rte_flow_expand_node { + const int *const next; + /**< + * List of next node indexes. Index 0 is interpreted as a terminator. + */ + const enum rte_flow_item_type type; + /**< Pattern item type of current node. */ + uint64_t rss_types; + /**< + * RSS types bit-field associated with this node + * (see ETH_RSS_* definitions). + */ +}; + +/** Object returned by rte_flow_expand_rss(). */ +struct rte_flow_expand_rss { + uint32_t entries; + /**< Number of entries @p patterns and @p priorities. */ + struct { + struct rte_flow_item *pattern; /**< Expanded pattern array. */ + uint32_t priority; /**< Priority offset for each expansion. */ + } entry[]; +}; + +/** + * Expand RSS flows into several possible flows according to the RSS hash + * fields requested and the driver capabilities. + * + * @b EXPERIMENTAL: this API may change without prior notice + * + * @param[out] buf + * Buffer to store the result expansion. + * @param[in] size + * Buffer size in bytes. If 0, @p buf can be NULL. + * @param[in] pattern + * User flow pattern. + * @param[in] types + * RSS types to expand (see ETH_RSS_* definitions). + * @param[in] graph + * Input graph to expand @p pattern according to @p types. + * @param[in] graph_root_index + * Index of root node in @p graph, typically 0. + * + * @return + * A positive value representing the size of @p buf in bytes regardless of + * @p size on success, a negative errno value otherwise and rte_errno is + * set, the following errors are defined: + * + * -E2BIG: graph-depth @p graph is too deep. + */ +int __rte_experimental +rte_flow_expand_rss(struct rte_flow_expand_rss *buf, size_t size, + const struct rte_flow_item *pattern, uint64_t types, + const struct rte_flow_expand_node graph[], + int graph_root_index); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_FLOW_DRIVER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_mtr.c b/src/spdk/dpdk/lib/librte_ethdev/rte_mtr.c new file mode 100644 index 00000000..1046cb5f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_mtr.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include + +#include +#include "rte_compat.h" +#include "rte_ethdev.h" +#include "rte_mtr_driver.h" +#include "rte_mtr.h" + +/* Get generic traffic metering & policing operations structure from a port. 
*/ +const struct rte_mtr_ops * +rte_mtr_ops_get(uint16_t port_id, struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + const struct rte_mtr_ops *ops; + + if (!rte_eth_dev_is_valid_port(port_id)) { + rte_mtr_error_set(error, + ENODEV, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, + NULL, + rte_strerror(ENODEV)); + return NULL; + } + + if ((dev->dev_ops->mtr_ops_get == NULL) || + (dev->dev_ops->mtr_ops_get(dev, &ops) != 0) || + (ops == NULL)) { + rte_mtr_error_set(error, + ENOSYS, + RTE_MTR_ERROR_TYPE_UNSPECIFIED, + NULL, + rte_strerror(ENOSYS)); + return NULL; + } + + return ops; +} + +#define RTE_MTR_FUNC(port_id, func) \ +({ \ + const struct rte_mtr_ops *ops = \ + rte_mtr_ops_get(port_id, error); \ + if (ops == NULL) \ + return -rte_errno; \ + \ + if (ops->func == NULL) \ + return -rte_mtr_error_set(error, \ + ENOSYS, \ + RTE_MTR_ERROR_TYPE_UNSPECIFIED, \ + NULL, \ + rte_strerror(ENOSYS)); \ + \ + ops->func; \ +}) + +/* MTR capabilities get */ +int __rte_experimental +rte_mtr_capabilities_get(uint16_t port_id, + struct rte_mtr_capabilities *cap, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, capabilities_get)(dev, + cap, error); +} + +/* MTR meter profile add */ +int __rte_experimental +rte_mtr_meter_profile_add(uint16_t port_id, + uint32_t meter_profile_id, + struct rte_mtr_meter_profile *profile, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, meter_profile_add)(dev, + meter_profile_id, profile, error); +} + +/** MTR meter profile delete */ +int __rte_experimental +rte_mtr_meter_profile_delete(uint16_t port_id, + uint32_t meter_profile_id, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, meter_profile_delete)(dev, + meter_profile_id, error); +} + +/** MTR object create */ +int __rte_experimental +rte_mtr_create(uint16_t port_id, + uint32_t mtr_id, + struct rte_mtr_params *params, + int shared, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, create)(dev, + mtr_id, params, shared, error); +} + +/** MTR object destroy */ +int __rte_experimental +rte_mtr_destroy(uint16_t port_id, + uint32_t mtr_id, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, destroy)(dev, + mtr_id, error); +} + +/** MTR object meter enable */ +int __rte_experimental +rte_mtr_meter_enable(uint16_t port_id, + uint32_t mtr_id, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, meter_enable)(dev, + mtr_id, error); +} + +/** MTR object meter disable */ +int __rte_experimental +rte_mtr_meter_disable(uint16_t port_id, + uint32_t mtr_id, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, meter_disable)(dev, + mtr_id, error); +} + +/** MTR object meter profile update */ +int __rte_experimental +rte_mtr_meter_profile_update(uint16_t port_id, + uint32_t mtr_id, + uint32_t meter_profile_id, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, meter_profile_update)(dev, + mtr_id, meter_profile_id, error); +} + +/** MTR object meter DSCP table update */ +int __rte_experimental +rte_mtr_meter_dscp_table_update(uint16_t port_id, + uint32_t mtr_id, + enum 
rte_mtr_color *dscp_table, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, meter_dscp_table_update)(dev, + mtr_id, dscp_table, error); +} + +/** MTR object policer action update */ +int __rte_experimental +rte_mtr_policer_actions_update(uint16_t port_id, + uint32_t mtr_id, + uint32_t action_mask, + enum rte_mtr_policer_action *actions, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, policer_actions_update)(dev, + mtr_id, action_mask, actions, error); +} + +/** MTR object enabled stats update */ +int __rte_experimental +rte_mtr_stats_update(uint16_t port_id, + uint32_t mtr_id, + uint64_t stats_mask, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, stats_update)(dev, + mtr_id, stats_mask, error); +} + +/** MTR object stats read */ +int __rte_experimental +rte_mtr_stats_read(uint16_t port_id, + uint32_t mtr_id, + struct rte_mtr_stats *stats, + uint64_t *stats_mask, + int clear, + struct rte_mtr_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_MTR_FUNC(port_id, stats_read)(dev, + mtr_id, stats, stats_mask, clear, error); +} diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_mtr.h b/src/spdk/dpdk/lib/librte_ethdev/rte_mtr.h new file mode 100644 index 00000000..c4819b27 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_mtr.h @@ -0,0 +1,730 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 Intel Corporation + * Copyright 2017 NXP + * Copyright 2017 Cavium + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __INCLUDE_RTE_MTR_H__ +#define __INCLUDE_RTE_MTR_H__ + +/** + * @file + * RTE Generic Traffic Metering and Policing API + * + * This interface provides the ability to configure the traffic metering and + * policing (MTR) in a generic way. 
+ * + * The processing done for each input packet hitting a MTR object is: + * A) Traffic metering: The packet is assigned a color (the meter output + * color), based on the previous history of the flow reflected in the + * current state of the MTR object, according to the specific traffic + * metering algorithm. The traffic metering algorithm can typically work + * in color aware mode, in which case the input packet already has an + * initial color (the input color), or in color blind mode, which is + * equivalent to considering all input packets initially colored as green. + * B) Policing: There is a separate policer action configured for each meter + * output color, which can: + * a) Drop the packet. + * b) Keep the same packet color: the policer output color matches the + * meter output color (essentially a no-op action). + * c) Recolor the packet: the policer output color is different than + * the meter output color. + * The policer output color is the output color of the packet, which is + * set in the packet meta-data (i.e. struct rte_mbuf::sched::color). + * C) Statistics: The set of counters maintained for each MTR object is + * configurable and subject to the implementation support. This set + * includes the number of packets and bytes dropped or passed for each + * output color. + * + * Once successfully created, an MTR object is linked to one or several flows + * through the meter action of the flow API. + * A) Whether an MTR object is private to a flow or potentially shared by + * several flows has to be specified at creation time. + * B) Several meter actions can be potentially registered for the same flow. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Color + */ +enum rte_mtr_color { + RTE_MTR_GREEN = 0, /**< Green */ + RTE_MTR_YELLOW, /**< Yellow */ + RTE_MTR_RED, /**< Red */ + RTE_MTR_COLORS /**< Number of colors. */ +}; + +/** + * Statistics counter type + */ +enum rte_mtr_stats_type { + /** Number of packets passed as green by the policer. */ + RTE_MTR_STATS_N_PKTS_GREEN = 1 << 0, + + /** Number of packets passed as yellow by the policer. */ + RTE_MTR_STATS_N_PKTS_YELLOW = 1 << 1, + + /** Number of packets passed as red by the policer. */ + RTE_MTR_STATS_N_PKTS_RED = 1 << 2, + + /** Number of packets dropped by the policer. */ + RTE_MTR_STATS_N_PKTS_DROPPED = 1 << 3, + + /** Number of bytes passed as green by the policer. */ + RTE_MTR_STATS_N_BYTES_GREEN = 1 << 4, + + /** Number of bytes passed as yellow by the policer. */ + RTE_MTR_STATS_N_BYTES_YELLOW = 1 << 5, + + /** Number of bytes passed as red by the policer. */ + RTE_MTR_STATS_N_BYTES_RED = 1 << 6, + + /** Number of bytes dropped by the policer. */ + RTE_MTR_STATS_N_BYTES_DROPPED = 1 << 7, +}; + +/** + * Statistics counters + */ +struct rte_mtr_stats { + /** Number of packets passed by the policer (per color). */ + uint64_t n_pkts[RTE_MTR_COLORS]; + + /** Number of bytes passed by the policer (per color). */ + uint64_t n_bytes[RTE_MTR_COLORS]; + + /** Number of packets dropped by the policer. */ + uint64_t n_pkts_dropped; + + /** Number of bytes passed by the policer. */ + uint64_t n_bytes_dropped; +}; + +/** + * Traffic metering algorithms + */ +enum rte_mtr_algorithm { + /** No traffic metering performed, the output color is the same as the + * input color for every input packet. The meter of the MTR object is + * working in pass-through mode, having same effect as meter disable. 
+ * @see rte_mtr_meter_disable() + */ + RTE_MTR_NONE = 0, + + /** Single Rate Three Color Marker (srTCM) - IETF RFC 2697. */ + RTE_MTR_SRTCM_RFC2697, + + /** Two Rate Three Color Marker (trTCM) - IETF RFC 2698. */ + RTE_MTR_TRTCM_RFC2698, + + /** Two Rate Three Color Marker (trTCM) - IETF RFC 4115. */ + RTE_MTR_TRTCM_RFC4115, +}; + +/** + * Meter profile + */ +struct rte_mtr_meter_profile { + /** Traffic metering algorithm. */ + enum rte_mtr_algorithm alg; + + RTE_STD_C11 + union { + /** Items only valid when *alg* is set to srTCM - RFC 2697. */ + struct { + /** Committed Information Rate (CIR) (bytes/second). */ + uint64_t cir; + + /** Committed Burst Size (CBS) (bytes). */ + uint64_t cbs; + + /** Excess Burst Size (EBS) (bytes). */ + uint64_t ebs; + } srtcm_rfc2697; + + /** Items only valid when *alg* is set to trTCM - RFC 2698. */ + struct { + /** Committed Information Rate (CIR) (bytes/second). */ + uint64_t cir; + + /** Peak Information Rate (PIR) (bytes/second). */ + uint64_t pir; + + /** Committed Burst Size (CBS) (byes). */ + uint64_t cbs; + + /** Peak Burst Size (PBS) (bytes). */ + uint64_t pbs; + } trtcm_rfc2698; + + /** Items only valid when *alg* is set to trTCM - RFC 4115. */ + struct { + /** Committed Information Rate (CIR) (bytes/second). */ + uint64_t cir; + + /** Excess Information Rate (EIR) (bytes/second). */ + uint64_t eir; + + /** Committed Burst Size (CBS) (byes). */ + uint64_t cbs; + + /** Excess Burst Size (EBS) (bytes). */ + uint64_t ebs; + } trtcm_rfc4115; + }; +}; + +/** + * Policer actions + */ +enum rte_mtr_policer_action { + /** Recolor the packet as green. */ + MTR_POLICER_ACTION_COLOR_GREEN = 0, + + /** Recolor the packet as yellow. */ + MTR_POLICER_ACTION_COLOR_YELLOW, + + /** Recolor the packet as red. */ + MTR_POLICER_ACTION_COLOR_RED, + + /** Drop the packet. */ + MTR_POLICER_ACTION_DROP, +}; + +/** + * Parameters for each traffic metering & policing object + * + * @see enum rte_mtr_stats_type + */ +struct rte_mtr_params { + /** Meter profile ID. */ + uint32_t meter_profile_id; + + /** Meter input color in case of MTR object chaining. When non-zero: if + * a previous MTR object is enabled in the same flow, then the color + * determined by the latest MTR object in the same flow is used as the + * input color by the current MTR object, otherwise the current MTR + * object uses the *dscp_table* to determine the input color. When zero: + * the color determined by any previous MTR object in same flow is + * ignored by the current MTR object, which uses the *dscp_table* to + * determine the input color. + */ + int use_prev_mtr_color; + + /** Meter input color. When non-NULL: it points to a pre-allocated and + * pre-populated table with exactly 64 elements providing the input + * color for each value of the IPv4/IPv6 Differentiated Services Code + * Point (DSCP) input packet field. When NULL: it is equivalent to + * setting this parameter to an all-green populated table (i.e. table + * with all the 64 elements set to green color). The color blind mode + * is configured by setting *use_prev_mtr_color* to 0 and *dscp_table* + * to either NULL or to an all-green populated table. When + * *use_prev_mtr_color* is non-zero value or when *dscp_table* contains + * at least one yellow or red color element, then the color aware mode + * is configured. + */ + enum rte_mtr_color *dscp_table; + + /** Non-zero to enable the meter, zero to disable the meter at the time + * of MTR object creation. 
Ignored when the meter profile indicated by + * *meter_profile_id* is set to NONE. + * @see rte_mtr_meter_disable() + */ + int meter_enable; + + /** Policer actions (per meter output color). */ + enum rte_mtr_policer_action action[RTE_MTR_COLORS]; + + /** Set of stats counters to be enabled. + * @see enum rte_mtr_stats_type + */ + uint64_t stats_mask; +}; + +/** + * MTR capabilities + */ +struct rte_mtr_capabilities { + /** Maximum number of MTR objects. */ + uint32_t n_max; + + /** Maximum number of MTR objects that can be shared by multiple flows. + * The value of zero indicates that shared MTR objects are not + * supported. The maximum value is *n_max*. + */ + uint32_t n_shared_max; + + /** When non-zero, this flag indicates that all the MTR objects that + * cannot be shared by multiple flows have identical capability set. + */ + int identical; + + /** When non-zero, this flag indicates that all the MTR objects that + * can be shared by multiple flows have identical capability set. + */ + int shared_identical; + + /** Maximum number of flows that can share the same MTR object. The + * value of zero is invalid. The value of 1 means that shared MTR + * objects not supported. + */ + uint32_t shared_n_flows_per_mtr_max; + + /** Maximum number of MTR objects that can be part of the same flow. The + * value of zero is invalid. The value of 1 indicates that MTR object + * chaining is not supported. The maximum value is *n_max*. + */ + uint32_t chaining_n_mtrs_per_flow_max; + + /** + * When non-zero, it indicates that the packet color identified by one + * MTR object can be used as the packet input color by any subsequent + * MTR object from the same flow. When zero, it indicates that the color + * determined by one MTR object is always ignored by any subsequent MTR + * object from the same flow. Only valid when MTR chaining is supported, + * i.e. *chaining_n_mtrs_per_flow_max* is greater than 1. When non-zero, + * it also means that the color aware mode is supported by at least one + * metering algorithm. + */ + int chaining_use_prev_mtr_color_supported; + + /** + * When non-zero, it indicates that the packet color identified by one + * MTR object is always used as the packet input color by any subsequent + * MTR object that is part of the same flow. When zero, it indicates + * that whether the color determined by one MTR object is either ignored + * or used as the packet input color by any subsequent MTR object from + * the same flow is individually configurable for each MTR object. Only + * valid when *chaining_use_prev_mtr_color_supported* is non-zero. + */ + int chaining_use_prev_mtr_color_enforced; + + /** Maximum number of MTR objects that can have their meter configured + * to run the srTCM RFC 2697 algorithm. The value of 0 indicates this + * metering algorithm is not supported. The maximum value is *n_max*. + */ + uint32_t meter_srtcm_rfc2697_n_max; + + /** Maximum number of MTR objects that can have their meter configured + * to run the trTCM RFC 2698 algorithm. The value of 0 indicates this + * metering algorithm is not supported. The maximum value is *n_max*. + */ + uint32_t meter_trtcm_rfc2698_n_max; + + /** Maximum number of MTR objects that can have their meter configured + * to run the trTCM RFC 4115 algorithm. The value of 0 indicates this + * metering algorithm is not supported. The maximum value is *n_max*. + */ + uint32_t meter_trtcm_rfc4115_n_max; + + /** Maximum traffic rate that can be metered by a single MTR object. For + * srTCM RFC 2697, this is the maximum CIR rate. 
For trTCM RFC 2698, + * this is the maximum PIR rate. For trTCM RFC 4115, this is the maximum + * value for the sum of PIR and EIR rates. + */ + uint64_t meter_rate_max; + + /** + * When non-zero, it indicates that color aware mode is supported for + * the srTCM RFC 2697 metering algorithm. + */ + int color_aware_srtcm_rfc2697_supported; + + /** + * When non-zero, it indicates that color aware mode is supported for + * the trTCM RFC 2698 metering algorithm. + */ + int color_aware_trtcm_rfc2698_supported; + + /** + * When non-zero, it indicates that color aware mode is supported for + * the trTCM RFC 4115 metering algorithm. + */ + int color_aware_trtcm_rfc4115_supported; + + /** When non-zero, it indicates that the policer packet recolor actions + * are supported. + * @see enum rte_mtr_policer_action + */ + int policer_action_recolor_supported; + + /** When non-zero, it indicates that the policer packet drop action is + * supported. + * @see enum rte_mtr_policer_action + */ + int policer_action_drop_supported; + + /** Set of supported statistics counter types. + * @see enum rte_mtr_stats_type + */ + uint64_t stats_mask; +}; + +/** + * Verbose error types. + * + * Most of them provide the type of the object referenced by struct + * rte_mtr_error::cause. + */ +enum rte_mtr_error_type { + RTE_MTR_ERROR_TYPE_NONE, /**< No error. */ + RTE_MTR_ERROR_TYPE_UNSPECIFIED, /**< Cause unspecified. */ + RTE_MTR_ERROR_TYPE_METER_PROFILE_ID, + RTE_MTR_ERROR_TYPE_METER_PROFILE, + RTE_MTR_ERROR_TYPE_MTR_ID, + RTE_MTR_ERROR_TYPE_MTR_PARAMS, + RTE_MTR_ERROR_TYPE_POLICER_ACTION_GREEN, + RTE_MTR_ERROR_TYPE_POLICER_ACTION_YELLOW, + RTE_MTR_ERROR_TYPE_POLICER_ACTION_RED, + RTE_MTR_ERROR_TYPE_STATS_MASK, + RTE_MTR_ERROR_TYPE_STATS, + RTE_MTR_ERROR_TYPE_SHARED, +}; + +/** + * Verbose error structure definition. + * + * This object is normally allocated by applications and set by PMDs, the + * message points to a constant string which does not need to be freed by + * the application, however its pointer can be considered valid only as long + * as its associated DPDK port remains configured. Closing the underlying + * device or unloading the PMD invalidates it. + * + * Both cause and message may be NULL regardless of the error type. + */ +struct rte_mtr_error { + enum rte_mtr_error_type type; /**< Cause field and error type. */ + const void *cause; /**< Object responsible for the error. */ + const char *message; /**< Human-readable error message. */ +}; + +/** + * MTR capabilities get + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[out] cap + * MTR capabilities. Needs to be pre-allocated and valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_mtr_capabilities_get(uint16_t port_id, + struct rte_mtr_capabilities *cap, + struct rte_mtr_error *error); + +/** + * Meter profile add + * + * Create a new meter profile with ID set to *meter_profile_id*. The new profile + * is used to create one or several MTR objects. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] meter_profile_id + * ID for the new meter profile. Needs to be unused by any of the existing + * meter profiles added for the current port. + * @param[in] profile + * Meter profile parameters. Needs to be pre-allocated and valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. 
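+ *
+ * For illustration (a sketch only: port_id refers to an already configured
+ * port, the profile ID, rates and burst sizes are arbitrary example values
+ * and error handling is omitted), an srTCM RFC 2697 profile could be
+ * registered as follows:
+ *
+ * \code
+ * struct rte_mtr_meter_profile profile = {
+ *     .alg = RTE_MTR_SRTCM_RFC2697,
+ *     .srtcm_rfc2697 = {
+ *         .cir = 1250000, // committed rate, bytes/second (10 Mbit/s)
+ *         .cbs = 2048,    // committed burst size, bytes
+ *         .ebs = 2048,    // excess burst size, bytes
+ *     },
+ * };
+ * struct rte_mtr_error error;
+ *
+ * rte_mtr_meter_profile_add(port_id, 0, &profile, &error); // profile ID 0
+ * \endcode
+ *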
+ * @return
+ * 0 on success, non-zero error code otherwise.
+ */
+int __rte_experimental
+rte_mtr_meter_profile_add(uint16_t port_id,
+ uint32_t meter_profile_id,
+ struct rte_mtr_meter_profile *profile,
+ struct rte_mtr_error *error);
+
+/**
+ * Meter profile delete
+ *
+ * Delete an existing meter profile. This operation fails when there is
+ * currently at least one user (i.e. MTR object) of this profile.
+ *
+ * @param[in] port_id
+ * The port identifier of the Ethernet device.
+ * @param[in] meter_profile_id
+ * Meter profile ID. Needs to be valid.
+ * @param[out] error
+ * Error details. Filled in only on error, when not NULL.
+ * @return
+ * 0 on success, non-zero error code otherwise.
+ */
+int __rte_experimental
+rte_mtr_meter_profile_delete(uint16_t port_id,
+ uint32_t meter_profile_id,
+ struct rte_mtr_error *error);
+
+/**
+ * MTR object create
+ *
+ * Create a new MTR object for the current port. This object is run as part of
+ * the associated flow action for traffic metering and policing.
+ *
+ * @param[in] port_id
+ * The port identifier of the Ethernet device.
+ * @param[in] mtr_id
+ * MTR object ID. Needs to be unused by any of the existing MTR objects
+ * created for the current port.
+ * @param[in] params
+ * MTR object params. Needs to be pre-allocated and valid.
+ * @param[in] shared
+ * Non-zero when this MTR object can be shared by multiple flows, zero when
+ * this MTR object can be used by a single flow.
+ * @param[out] error
+ * Error details. Filled in only on error, when not NULL.
+ * @return
+ * 0 on success, non-zero error code otherwise.
+ *
+ * @see enum rte_flow_action_type::RTE_FLOW_ACTION_TYPE_METER
+ */
+int __rte_experimental
+rte_mtr_create(uint16_t port_id,
+ uint32_t mtr_id,
+ struct rte_mtr_params *params,
+ int shared,
+ struct rte_mtr_error *error);
+
+/**
+ * MTR object destroy
+ *
+ * Delete an existing MTR object. This operation fails when there is currently
+ * at least one user (i.e. flow) of this MTR object.
+ *
+ * @param[in] port_id
+ * The port identifier of the Ethernet device.
+ * @param[in] mtr_id
+ * MTR object ID. Needs to be valid and created for the current port.
+ * @param[out] error
+ * Error details. Filled in only on error, when not NULL.
+ * @return
+ * 0 on success, non-zero error code otherwise.
+ */
+int __rte_experimental
+rte_mtr_destroy(uint16_t port_id,
+ uint32_t mtr_id,
+ struct rte_mtr_error *error);
+
+/**
+ * MTR object meter disable
+ *
+ * Disable the meter of an existing MTR object. In disabled state, the meter of
+ * the current MTR object works in pass-through mode, meaning that for each
+ * input packet the meter output color is always the same as the input color. In
+ * particular, when the meter of the current MTR object is configured in color
+ * blind mode, the input color is always green, so the meter output color is
+ * also always green. Note that the policer and the statistics of the current
+ * MTR object are working as usual while the meter is disabled. No action is
+ * taken and this function returns successfully when the meter of the current
+ * MTR object is already disabled.
+ *
+ * @param[in] port_id
+ * The port identifier of the Ethernet device.
+ * @param[in] mtr_id
+ * MTR object ID.
+ * @param[out] error
+ * Error details. Filled in only on error, when not NULL.
+ * @return
+ * 0 on success, non-zero error code otherwise.
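+ *
+ * Building on the profile sketch above (again illustrative only, with
+ * arbitrary IDs: MTR object 1 references meter profile 0, the policer keeps
+ * the meter colors except for red packets which are dropped, and the object
+ * is then referenced from a flow rule through the flow API METER action):
+ *
+ * \code
+ * struct rte_mtr_params params = {
+ *     .meter_profile_id = 0,
+ *     .meter_enable = 1,
+ *     .action = {
+ *         [RTE_MTR_GREEN] = MTR_POLICER_ACTION_COLOR_GREEN,
+ *         [RTE_MTR_YELLOW] = MTR_POLICER_ACTION_COLOR_YELLOW,
+ *         [RTE_MTR_RED] = MTR_POLICER_ACTION_DROP,
+ *     },
+ * };
+ * struct rte_mtr_error error;
+ *
+ * rte_mtr_create(port_id, 1, &params, 0, &error); // private (non-shared) object
+ *
+ * // Attach the MTR object to matching traffic through rte_flow:
+ * struct rte_flow_action_meter meter = { .mtr_id = 1 };
+ * struct rte_flow_action action = {
+ *     .type = RTE_FLOW_ACTION_TYPE_METER, .conf = &meter,
+ * };
+ * \endcode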
+ */ +int __rte_experimental +rte_mtr_meter_disable(uint16_t port_id, + uint32_t mtr_id, + struct rte_mtr_error *error); + +/** + * MTR object meter enable + * + * Enable the meter of an existing MTR object. If the MTR object has its meter + * already enabled, then no action is taken and this function returns + * successfully. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mtr_id + * MTR object ID. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_mtr_meter_enable(uint16_t port_id, + uint32_t mtr_id, + struct rte_mtr_error *error); + +/** + * MTR object meter profile update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mtr_id + * MTR object ID. Needs to be valid. + * @param[in] meter_profile_id + * Meter profile ID for the current MTR object. Needs to be valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_mtr_meter_profile_update(uint16_t port_id, + uint32_t mtr_id, + uint32_t meter_profile_id, + struct rte_mtr_error *error); + +/** + * MTR object DSCP table update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mtr_id + * MTR object ID. Needs to be valid. + * @param[in] dscp_table + * When non-NULL: it points to a pre-allocated and pre-populated table with + * exactly 64 elements providing the input color for each value of the + * IPv4/IPv6 Differentiated Services Code Point (DSCP) input packet field. + * When NULL: it is equivalent to setting this parameter to an “all-green” + * populated table (i.e. table with all the 64 elements set to green color). + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_mtr_meter_dscp_table_update(uint16_t port_id, + uint32_t mtr_id, + enum rte_mtr_color *dscp_table, + struct rte_mtr_error *error); + +/** + * MTR object policer actions update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mtr_id + * MTR object ID. Needs to be valid. + * @param[in] action_mask + * Bit mask indicating which policer actions need to be updated. One or more + * policer actions can be updated in a single function invocation. To update + * the policer action associated with color C, bit (1 << C) needs to be set in + * *action_mask* and element at position C in the *actions* array needs to be + * valid. + * @param[in] actions + * Pre-allocated and pre-populated array of policer actions. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_mtr_policer_actions_update(uint16_t port_id, + uint32_t mtr_id, + uint32_t action_mask, + enum rte_mtr_policer_action *actions, + struct rte_mtr_error *error); + +/** + * MTR object enabled statistics counters update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mtr_id + * MTR object ID. Needs to be valid. + * @param[in] stats_mask + * Mask of statistics counter types to be enabled for the current MTR object. + * Any statistics counter type not included in this set is to be disabled for + * the current MTR object. 
+ * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see enum rte_mtr_stats_type + */ +int __rte_experimental +rte_mtr_stats_update(uint16_t port_id, + uint32_t mtr_id, + uint64_t stats_mask, + struct rte_mtr_error *error); + +/** + * MTR object statistics counters read + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mtr_id + * MTR object ID. Needs to be valid. + * @param[out] stats + * When non-NULL, it contains the current value for the statistics counters + * enabled for the current MTR object. + * @param[out] stats_mask + * When non-NULL, it contains the mask of statistics counter types that are + * currently enabled for this MTR object, indicating which of the counters + * retrieved with the *stats* structure are valid. + * @param[in] clear + * When this parameter has a non-zero value, the statistics counters are + * cleared (i.e. set to zero) immediately after they have been read, + * otherwise the statistics counters are left untouched. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see enum rte_mtr_stats_type + */ +int __rte_experimental +rte_mtr_stats_read(uint16_t port_id, + uint32_t mtr_id, + struct rte_mtr_stats *stats, + uint64_t *stats_mask, + int clear, + struct rte_mtr_error *error); + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_MTR_H__ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_mtr_driver.h b/src/spdk/dpdk/lib/librte_ethdev/rte_mtr_driver.h new file mode 100644 index 00000000..c9a6d7c3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_mtr_driver.h @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_MTR_DRIVER_H__ +#define __INCLUDE_RTE_MTR_DRIVER_H__ + +/** + * @file + * RTE Generic Traffic Metering and Policing API (Driver Side) + * + * This file provides implementation helpers for internal use by PMDs, they + * are not intended to be exposed to applications and are not subject to ABI + * versioning. 
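+ *
+ * For illustration (a schematic sketch only: the pmd_mtr_* callbacks are
+ * hypothetical driver functions and the exact mtr_ops_get hook signature
+ * is defined by the ethdev driver headers, not by this file), a PMD
+ * typically exposes these operations roughly as follows:
+ *
+ * \code
+ * // Driver-private callback table filled with the PMD's implementations.
+ * static const struct rte_mtr_ops pmd_mtr_ops = {
+ *     .capabilities_get = pmd_mtr_capabilities_get,
+ *     .meter_profile_add = pmd_mtr_meter_profile_add,
+ *     .create = pmd_mtr_create,
+ *     // callbacks left NULL are reported to applications as -ENOSYS
+ * };
+ *
+ * // Hooked into the device operations so that rte_mtr_ops_get() finds it.
+ * static int
+ * pmd_mtr_ops_get(struct rte_eth_dev *dev, void *ops)
+ * {
+ *     (void)dev;
+ *     *(const struct rte_mtr_ops **)ops = &pmd_mtr_ops;
+ *     return 0;
+ * }
+ * \endcode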
+ */ + +#include + +#include +#include "rte_ethdev.h" +#include "rte_mtr.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int (*rte_mtr_capabilities_get_t)(struct rte_eth_dev *dev, + struct rte_mtr_capabilities *cap, + struct rte_mtr_error *error); +/**< @internal MTR capabilities get */ + +typedef int (*rte_mtr_meter_profile_add_t)(struct rte_eth_dev *dev, + uint32_t meter_profile_id, + struct rte_mtr_meter_profile *profile, + struct rte_mtr_error *error); +/**< @internal MTR meter profile add */ + +typedef int (*rte_mtr_meter_profile_delete_t)(struct rte_eth_dev *dev, + uint32_t meter_profile_id, + struct rte_mtr_error *error); +/**< @internal MTR meter profile delete */ + +typedef int (*rte_mtr_create_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + struct rte_mtr_params *params, + int shared, + struct rte_mtr_error *error); +/**< @internal MTR object create */ + +typedef int (*rte_mtr_destroy_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + struct rte_mtr_error *error); +/**< @internal MTR object destroy */ + +typedef int (*rte_mtr_meter_enable_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + struct rte_mtr_error *error); +/**< @internal MTR object meter enable */ + +typedef int (*rte_mtr_meter_disable_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + struct rte_mtr_error *error); +/**< @internal MTR object meter disable */ + +typedef int (*rte_mtr_meter_profile_update_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + uint32_t meter_profile_id, + struct rte_mtr_error *error); +/**< @internal MTR object meter profile update */ + +typedef int (*rte_mtr_meter_dscp_table_update_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + enum rte_mtr_color *dscp_table, + struct rte_mtr_error *error); +/**< @internal MTR object meter DSCP table update */ + +typedef int (*rte_mtr_policer_actions_update_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + uint32_t action_mask, + enum rte_mtr_policer_action *actions, + struct rte_mtr_error *error); +/**< @internal MTR object policer action update*/ + +typedef int (*rte_mtr_stats_update_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + uint64_t stats_mask, + struct rte_mtr_error *error); +/**< @internal MTR object enabled stats update */ + +typedef int (*rte_mtr_stats_read_t)(struct rte_eth_dev *dev, + uint32_t mtr_id, + struct rte_mtr_stats *stats, + uint64_t *stats_mask, + int clear, + struct rte_mtr_error *error); +/**< @internal MTR object stats read */ + +struct rte_mtr_ops { + /** MTR capabilities get */ + rte_mtr_capabilities_get_t capabilities_get; + + /** MTR meter profile add */ + rte_mtr_meter_profile_add_t meter_profile_add; + + /** MTR meter profile delete */ + rte_mtr_meter_profile_delete_t meter_profile_delete; + + /** MTR object create */ + rte_mtr_create_t create; + + /** MTR object destroy */ + rte_mtr_destroy_t destroy; + + /** MTR object meter enable */ + rte_mtr_meter_enable_t meter_enable; + + /** MTR object meter disable */ + rte_mtr_meter_disable_t meter_disable; + + /** MTR object meter profile update */ + rte_mtr_meter_profile_update_t meter_profile_update; + + /** MTR object meter DSCP table update */ + rte_mtr_meter_dscp_table_update_t meter_dscp_table_update; + + /** MTR object policer action update */ + rte_mtr_policer_actions_update_t policer_actions_update; + + /** MTR object enabled stats update */ + rte_mtr_stats_update_t stats_update; + + /** MTR object stats read */ + rte_mtr_stats_read_t stats_read; +}; + +/** + * Initialize generic error structure. + * + * This function also sets rte_errno to a given value. 
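+ *
+ * Driver-side callbacks typically use this helper both to fill *error* and
+ * to build the negative errno-style return value, mirroring how the rte_tm
+ * wrappers elsewhere in this patch use rte_tm_error_set(). A sketch follows;
+ * the callback name, the range check and the limit are invented for
+ * illustration only:
+ *
+ * @code
+ * #define PMD_MTR_MAX 64   // hypothetical per-port MTR object limit
+ *
+ * static int
+ * pmd_mtr_meter_enable(struct rte_eth_dev *dev __rte_unused,
+ *         uint32_t mtr_id, struct rte_mtr_error *error)
+ * {
+ *         if (mtr_id >= PMD_MTR_MAX)
+ *                 return -rte_mtr_error_set(error, EINVAL,
+ *                         RTE_MTR_ERROR_TYPE_MTR_ID, NULL,
+ *                         "MTR object ID out of range");
+ *         return 0;
+ * }
+ * @endcode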
+ * + * @param[out] error + * Pointer to error structure (may be NULL). + * @param[in] code + * Related error code (rte_errno). + * @param[in] type + * Cause field and error type. + * @param[in] cause + * Object responsible for the error. + * @param[in] message + * Human-readable error message. + * + * @return + * Error code. + */ +static inline int +rte_mtr_error_set(struct rte_mtr_error *error, + int code, + enum rte_mtr_error_type type, + const void *cause, + const char *message) +{ + if (error) { + *error = (struct rte_mtr_error){ + .type = type, + .cause = cause, + .message = message, + }; + } + rte_errno = code; + return code; +} + +/** + * Get generic traffic metering and policing operations structure from a port + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[out] error + * Error details + * + * @return + * The traffic metering and policing operations structure associated with + * port_id on success, NULL otherwise. + */ +const struct rte_mtr_ops * +rte_mtr_ops_get(uint16_t port_id, struct rte_mtr_error *error); + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_MTR_DRIVER_H__ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_tm.c b/src/spdk/dpdk/lib/librte_ethdev/rte_tm.c new file mode 100644 index 00000000..9709454f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_tm.c @@ -0,0 +1,409 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include + +#include +#include "rte_ethdev.h" +#include "rte_tm_driver.h" +#include "rte_tm.h" + +/* Get generic traffic manager operations structure from a port. */ +const struct rte_tm_ops * +rte_tm_ops_get(uint16_t port_id, struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + const struct rte_tm_ops *ops; + + if (!rte_eth_dev_is_valid_port(port_id)) { + rte_tm_error_set(error, + ENODEV, + RTE_TM_ERROR_TYPE_UNSPECIFIED, + NULL, + rte_strerror(ENODEV)); + return NULL; + } + + if ((dev->dev_ops->tm_ops_get == NULL) || + (dev->dev_ops->tm_ops_get(dev, &ops) != 0) || + (ops == NULL)) { + rte_tm_error_set(error, + ENOSYS, + RTE_TM_ERROR_TYPE_UNSPECIFIED, + NULL, + rte_strerror(ENOSYS)); + return NULL; + } + + return ops; +} + +#define RTE_TM_FUNC(port_id, func) \ +({ \ + const struct rte_tm_ops *ops = \ + rte_tm_ops_get(port_id, error); \ + if (ops == NULL) \ + return -rte_errno; \ + \ + if (ops->func == NULL) \ + return -rte_tm_error_set(error, \ + ENOSYS, \ + RTE_TM_ERROR_TYPE_UNSPECIFIED, \ + NULL, \ + rte_strerror(ENOSYS)); \ + \ + ops->func; \ +}) + +/* Get number of leaf nodes */ +int +rte_tm_get_number_of_leaf_nodes(uint16_t port_id, + uint32_t *n_leaf_nodes, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + const struct rte_tm_ops *ops = + rte_tm_ops_get(port_id, error); + + if (ops == NULL) + return -rte_errno; + + if (n_leaf_nodes == NULL) { + rte_tm_error_set(error, + EINVAL, + RTE_TM_ERROR_TYPE_UNSPECIFIED, + NULL, + rte_strerror(EINVAL)); + return -rte_errno; + } + + *n_leaf_nodes = dev->data->nb_tx_queues; + return 0; +} + +/* Check node type (leaf or non-leaf) */ +int +rte_tm_node_type_get(uint16_t port_id, + uint32_t node_id, + int *is_leaf, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_type_get)(dev, + node_id, is_leaf, error); +} + +/* Get capabilities */ +int rte_tm_capabilities_get(uint16_t port_id, + struct rte_tm_capabilities *cap, + struct rte_tm_error *error) +{ + struct rte_eth_dev 
*dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, capabilities_get)(dev, + cap, error); +} + +/* Get level capabilities */ +int rte_tm_level_capabilities_get(uint16_t port_id, + uint32_t level_id, + struct rte_tm_level_capabilities *cap, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, level_capabilities_get)(dev, + level_id, cap, error); +} + +/* Get node capabilities */ +int rte_tm_node_capabilities_get(uint16_t port_id, + uint32_t node_id, + struct rte_tm_node_capabilities *cap, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_capabilities_get)(dev, + node_id, cap, error); +} + +/* Add WRED profile */ +int rte_tm_wred_profile_add(uint16_t port_id, + uint32_t wred_profile_id, + struct rte_tm_wred_params *profile, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, wred_profile_add)(dev, + wred_profile_id, profile, error); +} + +/* Delete WRED profile */ +int rte_tm_wred_profile_delete(uint16_t port_id, + uint32_t wred_profile_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, wred_profile_delete)(dev, + wred_profile_id, error); +} + +/* Add/update shared WRED context */ +int rte_tm_shared_wred_context_add_update(uint16_t port_id, + uint32_t shared_wred_context_id, + uint32_t wred_profile_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, shared_wred_context_add_update)(dev, + shared_wred_context_id, wred_profile_id, error); +} + +/* Delete shared WRED context */ +int rte_tm_shared_wred_context_delete(uint16_t port_id, + uint32_t shared_wred_context_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, shared_wred_context_delete)(dev, + shared_wred_context_id, error); +} + +/* Add shaper profile */ +int rte_tm_shaper_profile_add(uint16_t port_id, + uint32_t shaper_profile_id, + struct rte_tm_shaper_params *profile, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, shaper_profile_add)(dev, + shaper_profile_id, profile, error); +} + +/* Delete WRED profile */ +int rte_tm_shaper_profile_delete(uint16_t port_id, + uint32_t shaper_profile_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, shaper_profile_delete)(dev, + shaper_profile_id, error); +} + +/* Add shared shaper */ +int rte_tm_shared_shaper_add_update(uint16_t port_id, + uint32_t shared_shaper_id, + uint32_t shaper_profile_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, shared_shaper_add_update)(dev, + shared_shaper_id, shaper_profile_id, error); +} + +/* Delete shared shaper */ +int rte_tm_shared_shaper_delete(uint16_t port_id, + uint32_t shared_shaper_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, shared_shaper_delete)(dev, + shared_shaper_id, error); +} + +/* Add node to port traffic manager hierarchy */ +int rte_tm_node_add(uint16_t port_id, + uint32_t node_id, + uint32_t parent_node_id, + uint32_t priority, + uint32_t weight, + uint32_t level_id, + struct rte_tm_node_params *params, + struct rte_tm_error *error) +{ + 
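+	/*
+	 * As with every wrapper in this file, RTE_TM_FUNC() first resolves
+	 * the port's rte_tm_ops table; when the port has no TM ops or no
+	 * node_add callback, the macro fills *error (when non-NULL) and
+	 * returns a negative errno value (e.g. -ENOSYS) before the driver
+	 * callback is ever reached.
+	 */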
struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_add)(dev, + node_id, parent_node_id, priority, weight, level_id, + params, error); +} + +/* Delete node from traffic manager hierarchy */ +int rte_tm_node_delete(uint16_t port_id, + uint32_t node_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_delete)(dev, + node_id, error); +} + +/* Suspend node */ +int rte_tm_node_suspend(uint16_t port_id, + uint32_t node_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_suspend)(dev, + node_id, error); +} + +/* Resume node */ +int rte_tm_node_resume(uint16_t port_id, + uint32_t node_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_resume)(dev, + node_id, error); +} + +/* Commit the initial port traffic manager hierarchy */ +int rte_tm_hierarchy_commit(uint16_t port_id, + int clear_on_fail, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, hierarchy_commit)(dev, + clear_on_fail, error); +} + +/* Update node parent */ +int rte_tm_node_parent_update(uint16_t port_id, + uint32_t node_id, + uint32_t parent_node_id, + uint32_t priority, + uint32_t weight, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_parent_update)(dev, + node_id, parent_node_id, priority, weight, error); +} + +/* Update node private shaper */ +int rte_tm_node_shaper_update(uint16_t port_id, + uint32_t node_id, + uint32_t shaper_profile_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_shaper_update)(dev, + node_id, shaper_profile_id, error); +} + +/* Update node shared shapers */ +int rte_tm_node_shared_shaper_update(uint16_t port_id, + uint32_t node_id, + uint32_t shared_shaper_id, + int add, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_shared_shaper_update)(dev, + node_id, shared_shaper_id, add, error); +} + +/* Update node stats */ +int rte_tm_node_stats_update(uint16_t port_id, + uint32_t node_id, + uint64_t stats_mask, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_stats_update)(dev, + node_id, stats_mask, error); +} + +/* Update WFQ weight mode */ +int rte_tm_node_wfq_weight_mode_update(uint16_t port_id, + uint32_t node_id, + int *wfq_weight_mode, + uint32_t n_sp_priorities, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_wfq_weight_mode_update)(dev, + node_id, wfq_weight_mode, n_sp_priorities, error); +} + +/* Update node congestion management mode */ +int rte_tm_node_cman_update(uint16_t port_id, + uint32_t node_id, + enum rte_tm_cman_mode cman, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_cman_update)(dev, + node_id, cman, error); +} + +/* Update node private WRED context */ +int rte_tm_node_wred_context_update(uint16_t port_id, + uint32_t node_id, + uint32_t wred_profile_id, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_wred_context_update)(dev, + node_id, 
wred_profile_id, error); +} + +/* Update node shared WRED context */ +int rte_tm_node_shared_wred_context_update(uint16_t port_id, + uint32_t node_id, + uint32_t shared_wred_context_id, + int add, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_shared_wred_context_update)(dev, + node_id, shared_wred_context_id, add, error); +} + +/* Read and/or clear stats counters for specific node */ +int rte_tm_node_stats_read(uint16_t port_id, + uint32_t node_id, + struct rte_tm_node_stats *stats, + uint64_t *stats_mask, + int clear, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, node_stats_read)(dev, + node_id, stats, stats_mask, clear, error); +} + +/* Packet marking - VLAN DEI */ +int rte_tm_mark_vlan_dei(uint16_t port_id, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, mark_vlan_dei)(dev, + mark_green, mark_yellow, mark_red, error); +} + +/* Packet marking - IPv4/IPv6 ECN */ +int rte_tm_mark_ip_ecn(uint16_t port_id, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, mark_ip_ecn)(dev, + mark_green, mark_yellow, mark_red, error); +} + +/* Packet marking - IPv4/IPv6 DSCP */ +int rte_tm_mark_ip_dscp(uint16_t port_id, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error) +{ + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + return RTE_TM_FUNC(port_id, mark_ip_dscp)(dev, + mark_green, mark_yellow, mark_red, error); +} diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_tm.h b/src/spdk/dpdk/lib/librte_ethdev/rte_tm.h new file mode 100644 index 00000000..955f02ff --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_tm.h @@ -0,0 +1,1965 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2017 Intel Corporation. + * Copyright(c) 2017 Cavium. + * Copyright(c) 2017 NXP. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __INCLUDE_RTE_TM_H__ +#define __INCLUDE_RTE_TM_H__ + +/** + * @file + * RTE Generic Traffic Manager API + * + * This interface provides the ability to configure the traffic manager in a + * generic way. It includes features such as: hierarchical scheduling, + * traffic shaping, congestion management, packet marking, etc. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Ethernet framing overhead. + * + * Overhead fields per Ethernet frame: + * 1. Preamble: 7 bytes; + * 2. Start of Frame Delimiter (SFD): 1 byte; + * 3. Inter-Frame Gap (IFG): 12 bytes. + * + * One of the typical values for the *pkt_length_adjust* field of the shaper + * profile. + * + * @see struct rte_tm_shaper_params + */ +#define RTE_TM_ETH_FRAMING_OVERHEAD 20 + +/** + * Ethernet framing overhead including the Frame Check Sequence (FCS) field. + * Useful when FCS is generated and added at the end of the Ethernet frame on + * TX side without any SW intervention. + * + * One of the typical values for the pkt_length_adjust field of the shaper + * profile. + * + * @see struct rte_tm_shaper_params + */ +#define RTE_TM_ETH_FRAMING_OVERHEAD_FCS 24 + +/** + * Invalid WRED profile ID. + * + * @see struct rte_tm_node_params + * @see rte_tm_node_add() + * @see rte_tm_node_wred_context_update() + */ +#define RTE_TM_WRED_PROFILE_ID_NONE UINT32_MAX + +/** + *Invalid shaper profile ID. + * + * @see struct rte_tm_node_params + * @see rte_tm_node_add() + * @see rte_tm_node_shaper_update() + */ +#define RTE_TM_SHAPER_PROFILE_ID_NONE UINT32_MAX + +/** + * Node ID for the parent of the root node. + * + * @see rte_tm_node_add() + */ +#define RTE_TM_NODE_ID_NULL UINT32_MAX + +/** + * Node level ID used to disable level ID checking. + * + * @see rte_tm_node_add() + */ +#define RTE_TM_NODE_LEVEL_ID_ANY UINT32_MAX + +/** + * Color + */ +enum rte_tm_color { + RTE_TM_GREEN = 0, /**< Green */ + RTE_TM_YELLOW, /**< Yellow */ + RTE_TM_RED, /**< Red */ + RTE_TM_COLORS /**< Number of colors */ +}; + +/** + * Node statistics counter type + */ +enum rte_tm_stats_type { + /** Number of packets scheduled from current node. */ + RTE_TM_STATS_N_PKTS = 1 << 0, + + /** Number of bytes scheduled from current node. */ + RTE_TM_STATS_N_BYTES = 1 << 1, + + /** Number of green packets dropped by current leaf node. */ + RTE_TM_STATS_N_PKTS_GREEN_DROPPED = 1 << 2, + + /** Number of yellow packets dropped by current leaf node. */ + RTE_TM_STATS_N_PKTS_YELLOW_DROPPED = 1 << 3, + + /** Number of red packets dropped by current leaf node. */ + RTE_TM_STATS_N_PKTS_RED_DROPPED = 1 << 4, + + /** Number of green bytes dropped by current leaf node. */ + RTE_TM_STATS_N_BYTES_GREEN_DROPPED = 1 << 5, + + /** Number of yellow bytes dropped by current leaf node. */ + RTE_TM_STATS_N_BYTES_YELLOW_DROPPED = 1 << 6, + + /** Number of red bytes dropped by current leaf node. 
*/ + RTE_TM_STATS_N_BYTES_RED_DROPPED = 1 << 7, + + /** Number of packets currently waiting in the packet queue of current + * leaf node. + */ + RTE_TM_STATS_N_PKTS_QUEUED = 1 << 8, + + /** Number of bytes currently waiting in the packet queue of current + * leaf node. + */ + RTE_TM_STATS_N_BYTES_QUEUED = 1 << 9, +}; + +/** + * Node statistics counters + */ +struct rte_tm_node_stats { + /** Number of packets scheduled from current node. */ + uint64_t n_pkts; + + /** Number of bytes scheduled from current node. */ + uint64_t n_bytes; + + /** Statistics counters for leaf nodes only. */ + struct { + /** Number of packets dropped by current leaf node per each + * color. + */ + uint64_t n_pkts_dropped[RTE_TM_COLORS]; + + /** Number of bytes dropped by current leaf node per each + * color. + */ + uint64_t n_bytes_dropped[RTE_TM_COLORS]; + + /** Number of packets currently waiting in the packet queue of + * current leaf node. + */ + uint64_t n_pkts_queued; + + /** Number of bytes currently waiting in the packet queue of + * current leaf node. + */ + uint64_t n_bytes_queued; + } leaf; +}; + +/** + * Traffic manager dynamic updates + */ +enum rte_tm_dynamic_update_type { + /** Dynamic parent node update. The new parent node is located on same + * hierarchy level as the former parent node. Consequently, the node + * whose parent is changed preserves its hierarchy level. + */ + RTE_TM_UPDATE_NODE_PARENT_KEEP_LEVEL = 1 << 0, + + /** Dynamic parent node update. The new parent node is located on + * different hierarchy level than the former parent node. Consequently, + * the node whose parent is changed also changes its hierarchy level. + */ + RTE_TM_UPDATE_NODE_PARENT_CHANGE_LEVEL = 1 << 1, + + /** Dynamic node add/delete. */ + RTE_TM_UPDATE_NODE_ADD_DELETE = 1 << 2, + + /** Suspend/resume nodes. */ + RTE_TM_UPDATE_NODE_SUSPEND_RESUME = 1 << 3, + + /** Dynamic switch between byte-based and packet-based WFQ weights. */ + RTE_TM_UPDATE_NODE_WFQ_WEIGHT_MODE = 1 << 4, + + /** Dynamic update on number of SP priorities. */ + RTE_TM_UPDATE_NODE_N_SP_PRIORITIES = 1 << 5, + + /** Dynamic update of congestion management mode for leaf nodes. */ + RTE_TM_UPDATE_NODE_CMAN = 1 << 6, + + /** Dynamic update of the set of enabled stats counter types. */ + RTE_TM_UPDATE_NODE_STATS = 1 << 7, +}; + +/** + * Traffic manager capabilities + */ +struct rte_tm_capabilities { + /** Maximum number of nodes. */ + uint32_t n_nodes_max; + + /** Maximum number of levels (i.e. number of nodes connecting the root + * node with any leaf node, including the root and the leaf). + */ + uint32_t n_levels_max; + + /** When non-zero, this flag indicates that all the non-leaf nodes + * (with the exception of the root node) have identical capability set. + */ + int non_leaf_nodes_identical; + + /** When non-zero, this flag indicates that all the leaf nodes have + * identical capability set. + */ + int leaf_nodes_identical; + + /** Maximum number of shapers, either private or shared. In case the + * implementation does not share any resources between private and + * shared shapers, it is typically equal to the sum of + * *shaper_private_n_max* and *shaper_shared_n_max*. The + * value of zero indicates that traffic shaping is not supported. + */ + uint32_t shaper_n_max; + + /** Maximum number of private shapers. Indicates the maximum number of + * nodes that can concurrently have their private shaper enabled. The + * value of zero indicates that private shapers are not supported. 
+ */ + uint32_t shaper_private_n_max; + + /** Maximum number of private shapers that support dual rate shaping. + * Indicates the maximum number of nodes that can concurrently have + * their private shaper enabled with dual rate support. Only valid when + * private shapers are supported. The value of zero indicates that dual + * rate shaping is not available for private shapers. The maximum value + * is *shaper_private_n_max*. + */ + int shaper_private_dual_rate_n_max; + + /** Minimum committed/peak rate (bytes per second) for any private + * shaper. Valid only when private shapers are supported. + */ + uint64_t shaper_private_rate_min; + + /** Maximum committed/peak rate (bytes per second) for any private + * shaper. Valid only when private shapers are supported. + */ + uint64_t shaper_private_rate_max; + + /** Maximum number of shared shapers. The value of zero indicates that + * shared shapers are not supported. + */ + uint32_t shaper_shared_n_max; + + /** Maximum number of nodes that can share the same shared shaper. + * Only valid when shared shapers are supported. + */ + uint32_t shaper_shared_n_nodes_per_shaper_max; + + /** Maximum number of shared shapers a node can be part of. This + * parameter indicates that there is at least one node that can be + * configured with this many shared shapers, which might not be true for + * all the nodes. Only valid when shared shapers are supported, in which + * case it ranges from 1 to *shaper_shared_n_max*. + */ + uint32_t shaper_shared_n_shapers_per_node_max; + + /** Maximum number of shared shapers that can be configured with dual + * rate shaping. The value of zero indicates that dual rate shaping + * support is not available for shared shapers. + */ + uint32_t shaper_shared_dual_rate_n_max; + + /** Minimum committed/peak rate (bytes per second) for any shared + * shaper. Only valid when shared shapers are supported. + */ + uint64_t shaper_shared_rate_min; + + /** Maximum committed/peak rate (bytes per second) for any shared + * shaper. Only valid when shared shapers are supported. + */ + uint64_t shaper_shared_rate_max; + + /** Minimum value allowed for packet length adjustment for any private + * or shared shaper. + */ + int shaper_pkt_length_adjust_min; + + /** Maximum value allowed for packet length adjustment for any private + * or shared shaper. + */ + int shaper_pkt_length_adjust_max; + + /** Maximum number of children nodes. This parameter indicates that + * there is at least one non-leaf node that can be configured with this + * many children nodes, which might not be true for all the non-leaf + * nodes. + */ + uint32_t sched_n_children_max; + + /** Maximum number of supported priority levels. This parameter + * indicates that there is at least one non-leaf node that can be + * configured with this many priority levels for managing its children + * nodes, which might not be true for all the non-leaf nodes. The value + * of zero is invalid. The value of 1 indicates that only priority 0 is + * supported, which essentially means that Strict Priority (SP) + * algorithm is not supported. + */ + uint32_t sched_sp_n_priorities_max; + + /** Maximum number of sibling nodes that can have the same priority at + * any given time, i.e. maximum size of the WFQ sibling node group. This + * parameter indicates there is at least one non-leaf node that meets + * this condition, which might not be true for all the non-leaf nodes. + * The value of zero is invalid. The value of 1 indicates that WFQ + * algorithm is not supported. 
The maximum value is + * *sched_n_children_max*. + */ + uint32_t sched_wfq_n_children_per_group_max; + + /** Maximum number of priority levels that can have more than one child + * node at any given time, i.e. maximum number of WFQ sibling node + * groups that have two or more members. This parameter indicates there + * is at least one non-leaf node that meets this condition, which might + * not be true for all the non-leaf nodes. The value of zero states that + * WFQ algorithm is not supported. The value of 1 indicates that + * (*sched_sp_n_priorities_max* - 1) priority levels have at most one + * child node, so there can be only one priority level with two or + * more sibling nodes making up a WFQ group. The maximum value is: + * min(floor(*sched_n_children_max* / 2), *sched_sp_n_priorities_max*). + */ + uint32_t sched_wfq_n_groups_max; + + /** Maximum WFQ weight. The value of 1 indicates that all sibling nodes + * with same priority have the same WFQ weight, so WFQ is reduced to FQ. + */ + uint32_t sched_wfq_weight_max; + + /** WRED packet mode support. When non-zero, this parameter indicates + * that there is atleast one leaf node that supports the WRED packet + * mode, which might not be true for all the leaf nodes. In packet + * mode, the WRED thresholds specify the queue length in packets, as + * opposed to bytes. + */ + int cman_wred_packet_mode_supported; + + /** WRED byte mode support. When non-zero, this parameter indicates that + * there is atleast one leaf node that supports the WRED byte mode, + * which might not be true for all the leaf nodes. In byte mode, the + * WRED thresholds specify the queue length in bytes, as opposed to + * packets. + */ + int cman_wred_byte_mode_supported; + + /** Head drop algorithm support. When non-zero, this parameter + * indicates that there is at least one leaf node that supports the head + * drop algorithm, which might not be true for all the leaf nodes. + */ + int cman_head_drop_supported; + + /** Maximum number of WRED contexts, either private or shared. In case + * the implementation does not share any resources between private and + * shared WRED contexts, it is typically equal to the sum of + * *cman_wred_context_private_n_max* and + * *cman_wred_context_shared_n_max*. The value of zero indicates that + * WRED is not supported. + */ + uint32_t cman_wred_context_n_max; + + /** Maximum number of private WRED contexts. Indicates the maximum + * number of leaf nodes that can concurrently have their private WRED + * context enabled. The value of zero indicates that private WRED + * contexts are not supported. + */ + uint32_t cman_wred_context_private_n_max; + + /** Maximum number of shared WRED contexts. The value of zero + * indicates that shared WRED contexts are not supported. + */ + uint32_t cman_wred_context_shared_n_max; + + /** Maximum number of leaf nodes that can share the same WRED context. + * Only valid when shared WRED contexts are supported. + */ + uint32_t cman_wred_context_shared_n_nodes_per_context_max; + + /** Maximum number of shared WRED contexts a leaf node can be part of. + * This parameter indicates that there is at least one leaf node that + * can be configured with this many shared WRED contexts, which might + * not be true for all the leaf nodes. Only valid when shared WRED + * contexts are supported, in which case it ranges from 1 to + * *cman_wred_context_shared_n_max*. + */ + uint32_t cman_wred_context_shared_n_contexts_per_node_max; + + /** Support for VLAN DEI packet marking (per color). 
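+ * For example (an application-side sketch with a hypothetical port ID and
+ * minimal error handling):
+ * @code
+ * uint16_t port_id = 0;            // hypothetical Ethernet port
+ * struct rte_tm_capabilities cap;
+ * struct rte_tm_error err;
+ *
+ * if (rte_tm_capabilities_get(port_id, &cap, &err) == 0 &&
+ *         cap.mark_vlan_dei_supported[RTE_TM_YELLOW])
+ *         // Mark yellow packets only; green and red are left untouched.
+ *         rte_tm_mark_vlan_dei(port_id, 0, 1, 0, &err);
+ * @endcode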
*/ + int mark_vlan_dei_supported[RTE_TM_COLORS]; + + /** Support for IPv4/IPv6 ECN marking of TCP packets (per color). */ + int mark_ip_ecn_tcp_supported[RTE_TM_COLORS]; + + /** Support for IPv4/IPv6 ECN marking of SCTP packets (per color). */ + int mark_ip_ecn_sctp_supported[RTE_TM_COLORS]; + + /** Support for IPv4/IPv6 DSCP packet marking (per color). */ + int mark_ip_dscp_supported[RTE_TM_COLORS]; + + /** Set of supported dynamic update operations. + * @see enum rte_tm_dynamic_update_type + */ + uint64_t dynamic_update_mask; + + /** Set of supported statistics counter types. + * @see enum rte_tm_stats_type + */ + uint64_t stats_mask; +}; + +/** + * Traffic manager level capabilities + */ +struct rte_tm_level_capabilities { + /** Maximum number of nodes for the current hierarchy level. */ + uint32_t n_nodes_max; + + /** Maximum number of non-leaf nodes for the current hierarchy level. + * The value of 0 indicates that current level only supports leaf + * nodes. The maximum value is *n_nodes_max*. + */ + uint32_t n_nodes_nonleaf_max; + + /** Maximum number of leaf nodes for the current hierarchy level. The + * value of 0 indicates that current level only supports non-leaf + * nodes. The maximum value is *n_nodes_max*. + */ + uint32_t n_nodes_leaf_max; + + /** When non-zero, this flag indicates that all the non-leaf nodes on + * this level have identical capability set. Valid only when + * *n_nodes_nonleaf_max* is non-zero. + */ + int non_leaf_nodes_identical; + + /** When non-zero, this flag indicates that all the leaf nodes on this + * level have identical capability set. Valid only when + * *n_nodes_leaf_max* is non-zero. + */ + int leaf_nodes_identical; + + RTE_STD_C11 + union { + /** Items valid only for the non-leaf nodes on this level. */ + struct { + /** Private shaper support. When non-zero, it indicates + * there is at least one non-leaf node on this level + * with private shaper support, which may not be the + * case for all the non-leaf nodes on this level. + */ + int shaper_private_supported; + + /** Dual rate support for private shaper. Valid only + * when private shaper is supported for the non-leaf + * nodes on the current level. When non-zero, it + * indicates there is at least one non-leaf node on this + * level with dual rate private shaper support, which + * may not be the case for all the non-leaf nodes on + * this level. + */ + int shaper_private_dual_rate_supported; + + /** Minimum committed/peak rate (bytes per second) for + * private shapers of the non-leaf nodes of this level. + * Valid only when private shaper is supported on this + * level. + */ + uint64_t shaper_private_rate_min; + + /** Maximum committed/peak rate (bytes per second) for + * private shapers of the non-leaf nodes on this level. + * Valid only when private shaper is supported on this + * level. + */ + uint64_t shaper_private_rate_max; + + /** Maximum number of shared shapers that any non-leaf + * node on this level can be part of. The value of zero + * indicates that shared shapers are not supported by + * the non-leaf nodes on this level. When non-zero, it + * indicates there is at least one non-leaf node on this + * level that meets this condition, which may not be the + * case for all the non-leaf nodes on this level. + */ + uint32_t shaper_shared_n_max; + + /** Maximum number of children nodes. 
This parameter + * indicates that there is at least one non-leaf node on + * this level that can be configured with this many + * children nodes, which might not be true for all the + * non-leaf nodes on this level. + */ + uint32_t sched_n_children_max; + + /** Maximum number of supported priority levels. This + * parameter indicates that there is at least one + * non-leaf node on this level that can be configured + * with this many priority levels for managing its + * children nodes, which might not be true for all the + * non-leaf nodes on this level. The value of zero is + * invalid. The value of 1 indicates that only priority + * 0 is supported, which essentially means that Strict + * Priority (SP) algorithm is not supported on this + * level. + */ + uint32_t sched_sp_n_priorities_max; + + /** Maximum number of sibling nodes that can have the + * same priority at any given time, i.e. maximum size of + * the WFQ sibling node group. This parameter indicates + * there is at least one non-leaf node on this level + * that meets this condition, which may not be true for + * all the non-leaf nodes on this level. The value of + * zero is invalid. The value of 1 indicates that WFQ + * algorithm is not supported on this level. The maximum + * value is *sched_n_children_max*. + */ + uint32_t sched_wfq_n_children_per_group_max; + + /** Maximum number of priority levels that can have + * more than one child node at any given time, i.e. + * maximum number of WFQ sibling node groups that + * have two or more members. This parameter indicates + * there is at least one non-leaf node on this level + * that meets this condition, which might not be true + * for all the non-leaf nodes. The value of zero states + * that WFQ algorithm is not supported on this level. + * The value of 1 indicates that + * (*sched_sp_n_priorities_max* - 1) priority levels on + * this level have at most one child node, so there can + * be only one priority level with two or more sibling + * nodes making up a WFQ group on this level. The + * maximum value is: + * min(floor(*sched_n_children_max* / 2), + * *sched_sp_n_priorities_max*). + */ + uint32_t sched_wfq_n_groups_max; + + /** Maximum WFQ weight. The value of 1 indicates that + * all sibling nodes on this level with same priority + * have the same WFQ weight, so on this level WFQ is + * reduced to FQ. + */ + uint32_t sched_wfq_weight_max; + + /** Mask of statistics counter types supported by the + * non-leaf nodes on this level. Every supported + * statistics counter type is supported by at least one + * non-leaf node on this level, which may not be true + * for all the non-leaf nodes on this level. + * @see enum rte_tm_stats_type + */ + uint64_t stats_mask; + } nonleaf; + + /** Items valid only for the leaf nodes on this level. */ + struct { + /** Private shaper support. When non-zero, it indicates + * there is at least one leaf node on this level with + * private shaper support, which may not be the case for + * all the leaf nodes on this level. + */ + int shaper_private_supported; + + /** Dual rate support for private shaper. Valid only + * when private shaper is supported for the leaf nodes + * on this level. When non-zero, it indicates there is + * at least one leaf node on this level with dual rate + * private shaper support, which may not be the case for + * all the leaf nodes on this level. + */ + int shaper_private_dual_rate_supported; + + /** Minimum committed/peak rate (bytes per second) for + * private shapers of the leaf nodes of this level. 
+ * Valid only when private shaper is supported for the + * leaf nodes on this level. + */ + uint64_t shaper_private_rate_min; + + /** Maximum committed/peak rate (bytes per second) for + * private shapers of the leaf nodes on this level. + * Valid only when private shaper is supported for the + * leaf nodes on this level. + */ + uint64_t shaper_private_rate_max; + + /** Maximum number of shared shapers that any leaf node + * on this level can be part of. The value of zero + * indicates that shared shapers are not supported by + * the leaf nodes on this level. When non-zero, it + * indicates there is at least one leaf node on this + * level that meets this condition, which may not be the + * case for all the leaf nodes on this level. + */ + uint32_t shaper_shared_n_max; + + /** WRED packet mode support. When non-zero, this + * parameter indicates that there is atleast one leaf + * node on this level that supports the WRED packet + * mode, which might not be true for all the leaf + * nodes. In packet mode, the WRED thresholds specify + * the queue length in packets, as opposed to bytes. + */ + int cman_wred_packet_mode_supported; + + /** WRED byte mode support. When non-zero, this + * parameter indicates that there is atleast one leaf + * node on this level that supports the WRED byte mode, + * which might not be true for all the leaf nodes. In + * byte mode, the WRED thresholds specify the queue + * length in bytes, as opposed to packets. + */ + int cman_wred_byte_mode_supported; + + /** Head drop algorithm support. When non-zero, this + * parameter indicates that there is at least one leaf + * node on this level that supports the head drop + * algorithm, which might not be true for all the leaf + * nodes on this level. + */ + int cman_head_drop_supported; + + /** Private WRED context support. When non-zero, it + * indicates there is at least one node on this level + * with private WRED context support, which may not be + * true for all the leaf nodes on this level. + */ + int cman_wred_context_private_supported; + + /** Maximum number of shared WRED contexts that any + * leaf node on this level can be part of. The value of + * zero indicates that shared WRED contexts are not + * supported by the leaf nodes on this level. When + * non-zero, it indicates there is at least one leaf + * node on this level that meets this condition, which + * may not be the case for all the leaf nodes on this + * level. + */ + uint32_t cman_wred_context_shared_n_max; + + /** Mask of statistics counter types supported by the + * leaf nodes on this level. Every supported statistics + * counter type is supported by at least one leaf node + * on this level, which may not be true for all the leaf + * nodes on this level. + * @see enum rte_tm_stats_type + */ + uint64_t stats_mask; + } leaf; + }; +}; + +/** + * Traffic manager node capabilities + */ +struct rte_tm_node_capabilities { + /** Private shaper support for the current node. */ + int shaper_private_supported; + + /** Dual rate shaping support for private shaper of current node. + * Valid only when private shaper is supported by the current node. + */ + int shaper_private_dual_rate_supported; + + /** Minimum committed/peak rate (bytes per second) for private + * shaper of current node. Valid only when private shaper is supported + * by the current node. + */ + uint64_t shaper_private_rate_min; + + /** Maximum committed/peak rate (bytes per second) for private + * shaper of current node. Valid only when private shaper is supported + * by the current node. 
+ */ + uint64_t shaper_private_rate_max; + + /** Maximum number of shared shapers the current node can be part of. + * The value of zero indicates that shared shapers are not supported by + * the current node. + */ + uint32_t shaper_shared_n_max; + + RTE_STD_C11 + union { + /** Items valid only for non-leaf nodes. */ + struct { + /** Maximum number of children nodes. */ + uint32_t sched_n_children_max; + + /** Maximum number of supported priority levels. The + * value of zero is invalid. The value of 1 indicates + * that only priority 0 is supported, which essentially + * means that Strict Priority (SP) algorithm is not + * supported. + */ + uint32_t sched_sp_n_priorities_max; + + /** Maximum number of sibling nodes that can have the + * same priority at any given time, i.e. maximum size + * of the WFQ sibling node group. The value of zero + * is invalid. The value of 1 indicates that WFQ + * algorithm is not supported. The maximum value is + * *sched_n_children_max*. + */ + uint32_t sched_wfq_n_children_per_group_max; + + /** Maximum number of priority levels that can have + * more than one child node at any given time, i.e. + * maximum number of WFQ sibling node groups that have + * two or more members. The value of zero states that + * WFQ algorithm is not supported. The value of 1 + * indicates that (*sched_sp_n_priorities_max* - 1) + * priority levels have at most one child node, so there + * can be only one priority level with two or more + * sibling nodes making up a WFQ group. The maximum + * value is: min(floor(*sched_n_children_max* / 2), + * *sched_sp_n_priorities_max*). + */ + uint32_t sched_wfq_n_groups_max; + + /** Maximum WFQ weight. The value of 1 indicates that + * all sibling nodes with same priority have the same + * WFQ weight, so WFQ is reduced to FQ. + */ + uint32_t sched_wfq_weight_max; + } nonleaf; + + /** Items valid only for leaf nodes. */ + struct { + /** WRED packet mode support for current node. */ + int cman_wred_packet_mode_supported; + + /** WRED byte mode support for current node. */ + int cman_wred_byte_mode_supported; + + /** Head drop algorithm support for current node. */ + int cman_head_drop_supported; + + /** Private WRED context support for current node. */ + int cman_wred_context_private_supported; + + /** Maximum number of shared WRED contexts the current + * node can be part of. The value of zero indicates that + * shared WRED contexts are not supported by the current + * node. + */ + uint32_t cman_wred_context_shared_n_max; + } leaf; + }; + + /** Mask of statistics counter types supported by the current node. + * @see enum rte_tm_stats_type + */ + uint64_t stats_mask; +}; + +/** + * Congestion management (CMAN) mode + * + * This is used for controlling the admission of packets into a packet queue or + * group of packet queues on congestion. On request of writing a new packet + * into the current queue while the queue is full, the *tail drop* algorithm + * drops the new packet while leaving the queue unmodified, as opposed to *head + * drop* algorithm, which drops the packet at the head of the queue (the oldest + * packet waiting in the queue) and admits the new packet at the tail of the + * queue. + * + * The *Random Early Detection (RED)* algorithm works by proactively dropping + * more and more input packets as the queue occupancy builds up. When the queue + * is full or almost full, RED effectively works as *tail drop*. The *Weighted + * RED* algorithm uses a separate set of RED thresholds for each packet color. 
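+ *
+ * As an illustration of the WRED knobs declared below (struct
+ * rte_tm_red_params and struct rte_tm_wred_params), the threshold and weight
+ * values in this sketch are arbitrary and not a recommendation:
+ *
+ * @code
+ * struct rte_tm_wred_params wred = {
+ *         .packet_mode = 1,          // thresholds counted in packets
+ *         .red_params = {
+ *                 [RTE_TM_GREEN]  = { .min_th = 48, .max_th = 64,
+ *                         .maxp_inv = 10, .wq_log2 = 9 },
+ *                 [RTE_TM_YELLOW] = { .min_th = 32, .max_th = 64,
+ *                         .maxp_inv = 10, .wq_log2 = 9 },
+ *                 [RTE_TM_RED]    = { .min_th = 16, .max_th = 64,
+ *                         .maxp_inv = 10, .wq_log2 = 9 },
+ *         },
+ * };
+ * @endcode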
+ */ +enum rte_tm_cman_mode { + RTE_TM_CMAN_TAIL_DROP = 0, /**< Tail drop */ + RTE_TM_CMAN_HEAD_DROP, /**< Head drop */ + RTE_TM_CMAN_WRED, /**< Weighted Random Early Detection (WRED) */ +}; + +/** + * Random Early Detection (RED) profile + */ +struct rte_tm_red_params { + /** Minimum queue threshold */ + uint32_t min_th; + + /** Maximum queue threshold */ + uint32_t max_th; + + /** Inverse of packet marking probability maximum value (maxp), i.e. + * maxp_inv = 1 / maxp + */ + uint16_t maxp_inv; + + /** Negated log2 of queue weight (wq), i.e. wq = 1 / (2 ^ wq_log2) */ + uint16_t wq_log2; +}; + +/** + * Weighted RED (WRED) profile + * + * Multiple WRED contexts can share the same WRED profile. Each leaf node with + * WRED enabled as its congestion management mode has zero or one private WRED + * context (only one leaf node using it) and/or zero, one or several shared + * WRED contexts (multiple leaf nodes use the same WRED context). A private + * WRED context is used to perform congestion management for a single leaf + * node, while a shared WRED context is used to perform congestion management + * for a group of leaf nodes. + * + * @see struct rte_tm_capabilities::cman_wred_packet_mode_supported + * @see struct rte_tm_capabilities::cman_wred_byte_mode_supported + */ +struct rte_tm_wred_params { + /** One set of RED parameters per packet color */ + struct rte_tm_red_params red_params[RTE_TM_COLORS]; + + /** When non-zero, the *min_th* and *max_th* thresholds are specified + * in packets (WRED packet mode). When zero, the *min_th* and *max_th* + * thresholds are specified in bytes (WRED byte mode) + */ + int packet_mode; +}; + +/** + * Token bucket + */ +struct rte_tm_token_bucket { + /** Token bucket rate (bytes per second) */ + uint64_t rate; + + /** Token bucket size (bytes), a.k.a. max burst size */ + uint64_t size; +}; + +/** + * Shaper (rate limiter) profile + * + * Multiple shaper instances can share the same shaper profile. Each node has + * zero or one private shaper (only one node using it) and/or zero, one or + * several shared shapers (multiple nodes use the same shaper instance). + * A private shaper is used to perform traffic shaping for a single node, while + * a shared shaper is used to perform traffic shaping for a group of nodes. + * + * Single rate shapers use a single token bucket. A single rate shaper can be + * configured by setting the rate of the committed bucket to zero, which + * effectively disables this bucket. The peak bucket is used to limit the rate + * and the burst size for the current shaper. + * + * Dual rate shapers use both the committed and the peak token buckets. The + * rate of the peak bucket has to be bigger than zero, as well as greater than + * or equal to the rate of the committed bucket. + */ +struct rte_tm_shaper_params { + /** Committed token bucket */ + struct rte_tm_token_bucket committed; + + /** Peak token bucket */ + struct rte_tm_token_bucket peak; + + /** Signed value to be added to the length of each packet for the + * purpose of shaping. Can be used to correct the packet length with + * the framing overhead bytes that are also consumed on the wire (e.g. + * RTE_TM_ETH_FRAMING_OVERHEAD_FCS). + */ + int32_t pkt_length_adjust; +}; + +/** + * Node parameters + * + * Each non-leaf node has multiple inputs (its children nodes) and single output + * (which is input to its parent node). 
It arbitrates its inputs using Strict + * Priority (SP) and Weighted Fair Queuing (WFQ) algorithms to schedule input + * packets to its output while observing its shaping (rate limiting) + * constraints. + * + * Algorithms such as Weighted Round Robin (WRR), Byte-level WRR, Deficit WRR + * (DWRR), etc. are considered approximations of the WFQ ideal and are + * assimilated to WFQ, although an associated implementation-dependent trade-off + * on accuracy, performance and resource usage might exist. + * + * Children nodes with different priorities are scheduled using the SP algorithm + * based on their priority, with zero (0) as the highest priority. Children with + * the same priority are scheduled using the WFQ algorithm according to their + * weights. The WFQ weight of a given child node is relative to the sum of the + * weights of all its sibling nodes that have the same priority, with one (1) as + * the lowest weight. For each SP priority, the WFQ weight mode can be set as + * either byte-based or packet-based. + * + * Each leaf node sits on top of a TX queue of the current Ethernet port. Hence, + * the leaf nodes are predefined, with their node IDs set to 0 .. (N-1), where N + * is the number of TX queues configured for the current Ethernet port. The + * non-leaf nodes have their IDs generated by the application. + */ +struct rte_tm_node_params { + /** Shaper profile for the private shaper. The absence of the private + * shaper for the current node is indicated by setting this parameter + * to RTE_TM_SHAPER_PROFILE_ID_NONE. + */ + uint32_t shaper_profile_id; + + /** User allocated array of valid shared shaper IDs. */ + uint32_t *shared_shaper_id; + + /** Number of shared shaper IDs in the *shared_shaper_id* array. */ + uint32_t n_shared_shapers; + + RTE_STD_C11 + union { + /** Parameters only valid for non-leaf nodes. */ + struct { + /** WFQ weight mode for each SP priority. When NULL, it + * indicates that WFQ is to be used for all priorities. + * When non-NULL, it points to a pre-allocated array of + * *n_sp_priorities* values, with non-zero value for + * byte-mode and zero for packet-mode. + */ + int *wfq_weight_mode; + + /** Number of SP priorities. */ + uint32_t n_sp_priorities; + } nonleaf; + + /** Parameters only valid for leaf nodes. */ + struct { + /** Congestion management mode */ + enum rte_tm_cman_mode cman; + + /** WRED parameters (only valid when *cman* is set to + * WRED). + */ + struct { + /** WRED profile for private WRED context. The + * absence of a private WRED context for the + * current leaf node is indicated by value + * RTE_TM_WRED_PROFILE_ID_NONE. + */ + uint32_t wred_profile_id; + + /** User allocated array of shared WRED context + * IDs. When set to NULL, it indicates that the + * current leaf node should not currently be + * part of any shared WRED contexts. + */ + uint32_t *shared_wred_context_id; + + /** Number of elements in the + * *shared_wred_context_id* array. Only valid + * when *shared_wred_context_id* is non-NULL, + * in which case it should be non-zero. + */ + uint32_t n_shared_wred_contexts; + } wred; + } leaf; + }; + + /** Mask of statistics counter types to be enabled for this node. This + * needs to be a subset of the statistics counter types available for + * the current node. Any statistics counter type not included in this + * set is to be disabled for the current node. + * @see enum rte_tm_stats_type + */ + uint64_t stats_mask; +}; + +/** + * Verbose error types. 
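+ *
+ * A sketch of how an application may consume these details after a failed
+ * call (the port ID is hypothetical and printf is used only for
+ * illustration):
+ *
+ * @code
+ * uint16_t port_id = 0;             // hypothetical Ethernet port
+ * struct rte_tm_error error;
+ *
+ * if (rte_tm_hierarchy_commit(port_id, 1, &error) != 0)
+ *         printf("TM commit failed: type=%d message=%s\n",
+ *                 (int)error.type,
+ *                 error.message != NULL ? error.message : "(none)");
+ * @endcode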
+ * + * Most of them provide the type of the object referenced by struct + * rte_tm_error::cause. + */ +enum rte_tm_error_type { + RTE_TM_ERROR_TYPE_NONE, /**< No error. */ + RTE_TM_ERROR_TYPE_UNSPECIFIED, /**< Cause unspecified. */ + RTE_TM_ERROR_TYPE_CAPABILITIES, + RTE_TM_ERROR_TYPE_LEVEL_ID, + RTE_TM_ERROR_TYPE_WRED_PROFILE, + RTE_TM_ERROR_TYPE_WRED_PROFILE_GREEN, + RTE_TM_ERROR_TYPE_WRED_PROFILE_YELLOW, + RTE_TM_ERROR_TYPE_WRED_PROFILE_RED, + RTE_TM_ERROR_TYPE_WRED_PROFILE_ID, + RTE_TM_ERROR_TYPE_SHARED_WRED_CONTEXT_ID, + RTE_TM_ERROR_TYPE_SHAPER_PROFILE, + RTE_TM_ERROR_TYPE_SHAPER_PROFILE_COMMITTED_RATE, + RTE_TM_ERROR_TYPE_SHAPER_PROFILE_COMMITTED_SIZE, + RTE_TM_ERROR_TYPE_SHAPER_PROFILE_PEAK_RATE, + RTE_TM_ERROR_TYPE_SHAPER_PROFILE_PEAK_SIZE, + RTE_TM_ERROR_TYPE_SHAPER_PROFILE_PKT_ADJUST_LEN, + RTE_TM_ERROR_TYPE_SHAPER_PROFILE_ID, + RTE_TM_ERROR_TYPE_SHARED_SHAPER_ID, + RTE_TM_ERROR_TYPE_NODE_PARENT_NODE_ID, + RTE_TM_ERROR_TYPE_NODE_PRIORITY, + RTE_TM_ERROR_TYPE_NODE_WEIGHT, + RTE_TM_ERROR_TYPE_NODE_PARAMS, + RTE_TM_ERROR_TYPE_NODE_PARAMS_SHAPER_PROFILE_ID, + RTE_TM_ERROR_TYPE_NODE_PARAMS_SHARED_SHAPER_ID, + RTE_TM_ERROR_TYPE_NODE_PARAMS_N_SHARED_SHAPERS, + RTE_TM_ERROR_TYPE_NODE_PARAMS_WFQ_WEIGHT_MODE, + RTE_TM_ERROR_TYPE_NODE_PARAMS_N_SP_PRIORITIES, + RTE_TM_ERROR_TYPE_NODE_PARAMS_CMAN, + RTE_TM_ERROR_TYPE_NODE_PARAMS_WRED_PROFILE_ID, + RTE_TM_ERROR_TYPE_NODE_PARAMS_SHARED_WRED_CONTEXT_ID, + RTE_TM_ERROR_TYPE_NODE_PARAMS_N_SHARED_WRED_CONTEXTS, + RTE_TM_ERROR_TYPE_NODE_PARAMS_STATS, + RTE_TM_ERROR_TYPE_NODE_ID, +}; + +/** + * Verbose error structure definition. + * + * This object is normally allocated by applications and set by PMDs, the + * message points to a constant string which does not need to be freed by + * the application, however its pointer can be considered valid only as long + * as its associated DPDK port remains configured. Closing the underlying + * device or unloading the PMD invalidates it. + * + * Both cause and message may be NULL regardless of the error type. + */ +struct rte_tm_error { + enum rte_tm_error_type type; /**< Cause field and error type. */ + const void *cause; /**< Object responsible for the error. */ + const char *message; /**< Human-readable error message. */ +}; + +/** + * Traffic manager get number of leaf nodes + * + * Each leaf node sits on on top of a TX queue of the current Ethernet port. + * Therefore, the set of leaf nodes is predefined, their number is always equal + * to N (where N is the number of TX queues configured for the current port) + * and their IDs are 0 .. (N-1). + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[out] n_leaf_nodes + * Number of leaf nodes for the current port. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int +rte_tm_get_number_of_leaf_nodes(uint16_t port_id, + uint32_t *n_leaf_nodes, + struct rte_tm_error *error); + +/** + * Traffic manager node ID validate and type (i.e. leaf or non-leaf) get + * + * The leaf nodes have predefined IDs in the range of 0 .. (N-1), where N is + * the number of TX queues of the current Ethernet port. The non-leaf nodes + * have their IDs generated by the application outside of the above range, + * which is reserved for leaf nodes. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID value. Needs to be valid. 
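+ *   For instance (a sketch with a hypothetical port ID and minimal error
+ *   handling):
+ *   @code
+ *   uint16_t port_id = 0;           // hypothetical Ethernet port
+ *   uint32_t n_leaf_nodes = 0;
+ *   int is_leaf = 0;
+ *   struct rte_tm_error err;
+ *
+ *   // Leaf node IDs are 0 .. N-1, so node ID 0 is expected to report
+ *   // is_leaf != 0 whenever at least one TX queue is configured.
+ *   if (rte_tm_get_number_of_leaf_nodes(port_id, &n_leaf_nodes, &err) == 0 &&
+ *           n_leaf_nodes > 0)
+ *           rte_tm_node_type_get(port_id, 0, &is_leaf, &err);
+ *   @endcode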
+ * @param[out] is_leaf + * Set to non-zero value when node is leaf and to zero otherwise (non-leaf). + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int +rte_tm_node_type_get(uint16_t port_id, + uint32_t node_id, + int *is_leaf, + struct rte_tm_error *error); + +/** + * Traffic manager capabilities get + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[out] cap + * Traffic manager capabilities. Needs to be pre-allocated and valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int +rte_tm_capabilities_get(uint16_t port_id, + struct rte_tm_capabilities *cap, + struct rte_tm_error *error); + +/** + * Traffic manager level capabilities get + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] level_id + * The hierarchy level identifier. The value of 0 identifies the level of the + * root node. + * @param[out] cap + * Traffic manager level capabilities. Needs to be pre-allocated and valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int +rte_tm_level_capabilities_get(uint16_t port_id, + uint32_t level_id, + struct rte_tm_level_capabilities *cap, + struct rte_tm_error *error); + +/** + * Traffic manager node capabilities get + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[out] cap + * Traffic manager node capabilities. Needs to be pre-allocated and valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + */ +int +rte_tm_node_capabilities_get(uint16_t port_id, + uint32_t node_id, + struct rte_tm_node_capabilities *cap, + struct rte_tm_error *error); + +/** + * Traffic manager WRED profile add + * + * Create a new WRED profile with ID set to *wred_profile_id*. The new profile + * is used to create one or several WRED contexts. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] wred_profile_id + * WRED profile ID for the new profile. Needs to be unused. + * @param[in] profile + * WRED profile parameters. Needs to be pre-allocated and valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::cman_wred_context_n_max + */ +int +rte_tm_wred_profile_add(uint16_t port_id, + uint32_t wred_profile_id, + struct rte_tm_wred_params *profile, + struct rte_tm_error *error); + +/** + * Traffic manager WRED profile delete + * + * Delete an existing WRED profile. This operation fails when there is + * currently at least one user (i.e. WRED context) of this WRED profile. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] wred_profile_id + * WRED profile ID. Needs to be the valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. 
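+ *
+ * Ordering sketch (IDs hypothetical): any WRED context still built on this
+ * profile has to be removed, or re-pointed to another profile with
+ * rte_tm_shared_wred_context_add_update(), before the delete can succeed.
+ *
+ * @code
+ * uint16_t port_id = 0;             // hypothetical Ethernet port
+ * uint32_t ctx_id = 0;              // hypothetical shared WRED context ID
+ * uint32_t wred_profile_id = 1;     // hypothetical WRED profile ID
+ * struct rte_tm_error err;
+ *
+ * rte_tm_shared_wred_context_delete(port_id, ctx_id, &err);
+ * rte_tm_wred_profile_delete(port_id, wred_profile_id, &err);
+ * @endcode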
+ * + * @see struct rte_tm_capabilities::cman_wred_context_n_max + */ +int +rte_tm_wred_profile_delete(uint16_t port_id, + uint32_t wred_profile_id, + struct rte_tm_error *error); + +/** + * Traffic manager shared WRED context add or update + * + * When *shared_wred_context_id* is invalid, a new WRED context with this ID is + * created by using the WRED profile identified by *wred_profile_id*. + * + * When *shared_wred_context_id* is valid, this WRED context is no longer using + * the profile previously assigned to it and is updated to use the profile + * identified by *wred_profile_id*. + * + * A valid shared WRED context can be assigned to several hierarchy leaf nodes + * configured to use WRED as the congestion management mode. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] shared_wred_context_id + * Shared WRED context ID + * @param[in] wred_profile_id + * WRED profile ID. Needs to be the valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::cman_wred_context_shared_n_max + */ +int +rte_tm_shared_wred_context_add_update(uint16_t port_id, + uint32_t shared_wred_context_id, + uint32_t wred_profile_id, + struct rte_tm_error *error); + +/** + * Traffic manager shared WRED context delete + * + * Delete an existing shared WRED context. This operation fails when there is + * currently at least one user (i.e. hierarchy leaf node) of this shared WRED + * context. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] shared_wred_context_id + * Shared WRED context ID. Needs to be the valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::cman_wred_context_shared_n_max + */ +int +rte_tm_shared_wred_context_delete(uint16_t port_id, + uint32_t shared_wred_context_id, + struct rte_tm_error *error); + +/** + * Traffic manager shaper profile add + * + * Create a new shaper profile with ID set to *shaper_profile_id*. The new + * shaper profile is used to create one or several shapers. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] shaper_profile_id + * Shaper profile ID for the new profile. Needs to be unused. + * @param[in] profile + * Shaper profile parameters. Needs to be pre-allocated and valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::shaper_n_max + */ +int +rte_tm_shaper_profile_add(uint16_t port_id, + uint32_t shaper_profile_id, + struct rte_tm_shaper_params *profile, + struct rte_tm_error *error); + +/** + * Traffic manager shaper profile delete + * + * Delete an existing shaper profile. This operation fails when there is + * currently at least one user (i.e. shaper) of this shaper profile. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] shaper_profile_id + * Shaper profile ID. Needs to be the valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. 
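+ *
+ * A brief illustrative sequence, assuming a committed hierarchy on port
+ * *port_id* with an existing node *node_id*, an already created shaper
+ * profile 2, a declared struct rte_tm_error named err, and a port that
+ * supports these runtime updates (error handling omitted):
+ *
+ * @code
+ * rte_tm_node_shaper_update(port_id, node_id, 2, &err);  // node uses profile 2
+ * rte_tm_shaper_profile_delete(port_id, 2, &err);        // fails: in use
+ * rte_tm_node_shaper_update(port_id, node_id,
+ *         RTE_TM_SHAPER_PROFILE_ID_NONE, &err);          // detach private shaper
+ * rte_tm_shaper_profile_delete(port_id, 2, &err);        // now succeeds
+ * @endcode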
+ * + * @see struct rte_tm_capabilities::shaper_n_max + */ +int +rte_tm_shaper_profile_delete(uint16_t port_id, + uint32_t shaper_profile_id, + struct rte_tm_error *error); + +/** + * Traffic manager shared shaper add or update + * + * When *shared_shaper_id* is not a valid shared shaper ID, a new shared shaper + * with this ID is created using the shaper profile identified by + * *shaper_profile_id*. + * + * When *shared_shaper_id* is a valid shared shaper ID, this shared shaper is + * no longer using the shaper profile previously assigned to it and is updated + * to use the shaper profile identified by *shaper_profile_id*. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] shared_shaper_id + * Shared shaper ID + * @param[in] shaper_profile_id + * Shaper profile ID. Needs to be the valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::shaper_shared_n_max + */ +int +rte_tm_shared_shaper_add_update(uint16_t port_id, + uint32_t shared_shaper_id, + uint32_t shaper_profile_id, + struct rte_tm_error *error); + +/** + * Traffic manager shared shaper delete + * + * Delete an existing shared shaper. This operation fails when there is + * currently at least one user (i.e. hierarchy node) of this shared shaper. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] shared_shaper_id + * Shared shaper ID. Needs to be the valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::shaper_shared_n_max + */ +int +rte_tm_shared_shaper_delete(uint16_t port_id, + uint32_t shared_shaper_id, + struct rte_tm_error *error); + +/** + * Traffic manager node add + * + * Create new node and connect it as child of an existing node. The new node is + * further identified by *node_id*, which needs to be unused by any of the + * existing nodes. The parent node is identified by *parent_node_id*, which + * needs to be the valid ID of an existing non-leaf node. The parent node is + * going to use the provided SP *priority* and WFQ *weight* to schedule its new + * child node. + * + * This function has to be called for both leaf and non-leaf nodes. In the case + * of leaf nodes (i.e. *node_id* is within the range of 0 .. (N-1), with N as + * the number of configured TX queues of the current port), the leaf node is + * configured rather than created (as the set of leaf nodes is predefined) and + * it is also connected as child of an existing node. + * + * The first node that is added becomes the root node and all the nodes that + * are subsequently added have to be added as descendants of the root node. The + * parent of the root node has to be specified as RTE_TM_NODE_ID_NULL and there + * can only be one node with this parent ID (i.e. the root node). Further + * restrictions for root node: needs to be non-leaf, its private shaper profile + * needs to be valid and single rate, cannot use any shared shapers. + * + * When called before rte_tm_hierarchy_commit() invocation, this function is + * typically used to define the initial start-up hierarchy for the port. + * Provided that dynamic hierarchy updates are supported by the current port (as + * advertised in the port capability set), this function can be also called + * after the rte_tm_hierarchy_commit() invocation. 
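+ *
+ * A minimal sketch of a start-up hierarchy on an already configured port
+ * *port_id*: one root node feeding all predefined leaf nodes directly. The
+ * node IDs, shaper rate and flat shape are arbitrary, error handling is
+ * omitted and most parameter fields are left at their zero defaults:
+ *
+ * @code
+ * struct rte_tm_shaper_params sp;
+ * struct rte_tm_node_params np;
+ * struct rte_tm_error err;
+ * uint32_t n_leaf, q, root_id = 1000;
+ *
+ * memset(&sp, 0, sizeof(sp));       // single rate: committed bucket only
+ * sp.committed.rate = 1250000000;   // 10 Gbps, in bytes per second
+ * sp.committed.size = 4096;
+ * rte_tm_shaper_profile_add(port_id, 0, &sp, &err);
+ *
+ * memset(&np, 0, sizeof(np));
+ * np.shaper_profile_id = 0;         // root private shaper, single rate
+ * rte_tm_node_add(port_id, root_id, RTE_TM_NODE_ID_NULL, 0, 1,
+ *         RTE_TM_NODE_LEVEL_ID_ANY, &np, &err);
+ *
+ * memset(&np, 0, sizeof(np));
+ * np.shaper_profile_id = RTE_TM_SHAPER_PROFILE_ID_NONE;
+ * rte_tm_get_number_of_leaf_nodes(port_id, &n_leaf, &err);
+ * for (q = 0; q < n_leaf; q++)      // leaf IDs 0 .. N-1 are predefined
+ *         rte_tm_node_add(port_id, q, root_id, 0, 1,
+ *                 RTE_TM_NODE_LEVEL_ID_ANY, &np, &err);
+ *
+ * rte_tm_hierarchy_commit(port_id, 1, &err);
+ * @endcode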
+ * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be unused by any of the existing nodes. + * @param[in] parent_node_id + * Parent node ID. Needs to be the valid. + * @param[in] priority + * Node priority. The highest node priority is zero. Used by the SP algorithm + * running on the parent of the current node for scheduling this child node. + * @param[in] weight + * Node weight. The node weight is relative to the weight sum of all siblings + * that have the same priority. The lowest weight is one. Used by the WFQ + * algorithm running on the parent of the current node for scheduling this + * child node. + * @param[in] level_id + * Level ID that should be met by this node. The hierarchy level of the + * current node is already fully specified through its parent node (i.e. the + * level of this node is equal to the level of its parent node plus one), + * therefore the reason for providing this parameter is to enable the + * application to perform step-by-step checking of the node level during + * successive invocations of this function. When not desired, this check can + * be disabled by assigning value RTE_TM_NODE_LEVEL_ID_ANY to this parameter. + * @param[in] params + * Node parameters. Needs to be pre-allocated and valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see rte_tm_hierarchy_commit() + * @see RTE_TM_UPDATE_NODE_ADD_DELETE + * @see RTE_TM_NODE_LEVEL_ID_ANY + * @see struct rte_tm_capabilities + */ +int +rte_tm_node_add(uint16_t port_id, + uint32_t node_id, + uint32_t parent_node_id, + uint32_t priority, + uint32_t weight, + uint32_t level_id, + struct rte_tm_node_params *params, + struct rte_tm_error *error); + +/** + * Traffic manager node delete + * + * Delete an existing node. This operation fails when this node currently has + * at least one user (i.e. child node). + * + * When called before rte_tm_hierarchy_commit() invocation, this function is + * typically used to define the initial start-up hierarchy for the port. + * Provided that dynamic hierarchy updates are supported by the current port (as + * advertised in the port capability set), this function can be also called + * after the rte_tm_hierarchy_commit() invocation. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see RTE_TM_UPDATE_NODE_ADD_DELETE + */ +int +rte_tm_node_delete(uint16_t port_id, + uint32_t node_id, + struct rte_tm_error *error); + +/** + * Traffic manager node suspend + * + * Suspend an existing node. While the node is in suspended state, no packet is + * scheduled from this node and its descendants. The node exits the suspended + * state through the node resume operation. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. 
+ * + * @see rte_tm_node_resume() + * @see RTE_TM_UPDATE_NODE_SUSPEND_RESUME + */ +int +rte_tm_node_suspend(uint16_t port_id, + uint32_t node_id, + struct rte_tm_error *error); + +/** + * Traffic manager node resume + * + * Resume an existing node that is currently in suspended state. The node + * entered the suspended state as result of a previous node suspend operation. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see rte_tm_node_suspend() + * @see RTE_TM_UPDATE_NODE_SUSPEND_RESUME + */ +int +rte_tm_node_resume(uint16_t port_id, + uint32_t node_id, + struct rte_tm_error *error); + +/** + * Traffic manager hierarchy commit + * + * This function is called during the port initialization phase (before the + * Ethernet port is started) to freeze the start-up hierarchy. + * + * This function typically performs the following steps: + * a) It validates the start-up hierarchy that was previously defined for the + * current port through successive rte_tm_node_add() invocations; + * b) Assuming successful validation, it performs all the necessary port + * specific configuration operations to install the specified hierarchy on + * the current port, with immediate effect once the port is started. + * + * This function fails when the currently configured hierarchy is not supported + * by the Ethernet port, in which case the user can abort or try out another + * hierarchy configuration (e.g. a hierarchy with less leaf nodes), which can be + * build from scratch (when *clear_on_fail* is enabled) or by modifying the + * existing hierarchy configuration (when *clear_on_fail* is disabled). + * + * Note that this function can still fail due to other causes (e.g. not enough + * memory available in the system, etc), even though the specified hierarchy is + * supported in principle by the current port. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] clear_on_fail + * On function call failure, hierarchy is cleared when this parameter is + * non-zero and preserved when this parameter is equal to zero. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see rte_tm_node_add() + * @see rte_tm_node_delete() + */ +int +rte_tm_hierarchy_commit(uint16_t port_id, + int clear_on_fail, + struct rte_tm_error *error); + +/** + * Traffic manager node parent update + * + * This function may be used to move a node and its children to a different + * parent. Additionally, if the new parent is the same as the current parent, + * this function will update the priority/weight of an existing node. + * + * Restriction for root node: its parent cannot be changed. + * + * This function can only be called after the rte_tm_hierarchy_commit() + * invocation. Its success depends on the port support for this operation, as + * advertised through the port capability set. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[in] parent_node_id + * Node ID for the new parent. Needs to be valid. + * @param[in] priority + * Node priority. The highest node priority is zero. Used by the SP algorithm + * running on the parent of the current node for scheduling this child node. 
+ * @param[in] weight + * Node weight. The node weight is relative to the weight sum of all siblings + * that have the same priority. The lowest weight is zero. Used by the WFQ + * algorithm running on the parent of the current node for scheduling this + * child node. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see RTE_TM_UPDATE_NODE_PARENT_KEEP_LEVEL + * @see RTE_TM_UPDATE_NODE_PARENT_CHANGE_LEVEL + */ +int +rte_tm_node_parent_update(uint16_t port_id, + uint32_t node_id, + uint32_t parent_node_id, + uint32_t priority, + uint32_t weight, + struct rte_tm_error *error); + +/** + * Traffic manager node private shaper update + * + * Restriction for the root node: its private shaper profile needs to be valid + * and single rate. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[in] shaper_profile_id + * Shaper profile ID for the private shaper of the current node. Needs to be + * either valid shaper profile ID or RTE_TM_SHAPER_PROFILE_ID_NONE, with + * the latter disabling the private shaper of the current node. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::shaper_private_n_max + */ +int +rte_tm_node_shaper_update(uint16_t port_id, + uint32_t node_id, + uint32_t shaper_profile_id, + struct rte_tm_error *error); + +/** + * Traffic manager node shared shapers update + * + * Restriction for root node: cannot use any shared rate shapers. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[in] shared_shaper_id + * Shared shaper ID. Needs to be valid. + * @param[in] add + * Set to non-zero value to add this shared shaper to current node or to zero + * to delete this shared shaper from current node. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::shaper_shared_n_max + */ +int +rte_tm_node_shared_shaper_update(uint16_t port_id, + uint32_t node_id, + uint32_t shared_shaper_id, + int add, + struct rte_tm_error *error); + +/** + * Traffic manager node enabled statistics counters update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[in] stats_mask + * Mask of statistics counter types to be enabled for the current node. This + * needs to be a subset of the statistics counter types available for the + * current node. Any statistics counter type not included in this set is to + * be disabled for the current node. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see enum rte_tm_stats_type + * @see RTE_TM_UPDATE_NODE_STATS + */ +int +rte_tm_node_stats_update(uint16_t port_id, + uint32_t node_id, + uint64_t stats_mask, + struct rte_tm_error *error); + +/** + * Traffic manager node WFQ weight mode update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid non-leaf node ID. + * @param[in] wfq_weight_mode + * WFQ weight mode for each SP priority. 
When NULL, it indicates that WFQ is + * to be used for all priorities. When non-NULL, it points to a pre-allocated + * array of *n_sp_priorities* values, with non-zero value for byte-mode and + * zero for packet-mode. + * @param[in] n_sp_priorities + * Number of SP priorities. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see RTE_TM_UPDATE_NODE_WFQ_WEIGHT_MODE + * @see RTE_TM_UPDATE_NODE_N_SP_PRIORITIES + */ +int +rte_tm_node_wfq_weight_mode_update(uint16_t port_id, + uint32_t node_id, + int *wfq_weight_mode, + uint32_t n_sp_priorities, + struct rte_tm_error *error); + +/** + * Traffic manager node congestion management mode update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid leaf node ID. + * @param[in] cman + * Congestion management mode. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see RTE_TM_UPDATE_NODE_CMAN + */ +int +rte_tm_node_cman_update(uint16_t port_id, + uint32_t node_id, + enum rte_tm_cman_mode cman, + struct rte_tm_error *error); + +/** + * Traffic manager node private WRED context update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid leaf node ID. + * @param[in] wred_profile_id + * WRED profile ID for the private WRED context of the current node. Needs to + * be either valid WRED profile ID or RTE_TM_WRED_PROFILE_ID_NONE, with the + * latter disabling the private WRED context of the current node. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::cman_wred_context_private_n_max +*/ +int +rte_tm_node_wred_context_update(uint16_t port_id, + uint32_t node_id, + uint32_t wred_profile_id, + struct rte_tm_error *error); + +/** + * Traffic manager node shared WRED context update + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid leaf node ID. + * @param[in] shared_wred_context_id + * Shared WRED context ID. Needs to be valid. + * @param[in] add + * Set to non-zero value to add this shared WRED context to current node or + * to zero to delete this shared WRED context from current node. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::cman_wred_context_shared_n_max + */ +int +rte_tm_node_shared_wred_context_update(uint16_t port_id, + uint32_t node_id, + uint32_t shared_wred_context_id, + int add, + struct rte_tm_error *error); + +/** + * Traffic manager node statistics counters read + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] node_id + * Node ID. Needs to be valid. + * @param[out] stats + * When non-NULL, it contains the current value for the statistics counters + * enabled for the current node. + * @param[out] stats_mask + * When non-NULL, it contains the mask of statistics counter types that are + * currently enabled for this node, indicating which of the counters + * retrieved with the *stats* structure are valid. + * @param[in] clear + * When this parameter has a non-zero value, the statistics counters are + * cleared (i.e. 
set to zero) immediately after they have been read, + * otherwise the statistics counters are left untouched. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see enum rte_tm_stats_type + */ +int +rte_tm_node_stats_read(uint16_t port_id, + uint32_t node_id, + struct rte_tm_node_stats *stats, + uint64_t *stats_mask, + int clear, + struct rte_tm_error *error); + +/** + * Traffic manager packet marking - VLAN DEI (IEEE 802.1Q) + * + * IEEE 802.1p maps the traffic class to the VLAN Priority Code Point (PCP) + * field (3 bits), while IEEE 802.1q maps the drop priority to the VLAN Drop + * Eligible Indicator (DEI) field (1 bit), which was previously named Canonical + * Format Indicator (CFI). + * + * All VLAN frames of a given color get their DEI bit set if marking is enabled + * for this color; otherwise, their DEI bit is left as is (either set or not). + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mark_green + * Set to non-zero value to enable marking of green packets and to zero to + * disable it. + * @param[in] mark_yellow + * Set to non-zero value to enable marking of yellow packets and to zero to + * disable it. + * @param[in] mark_red + * Set to non-zero value to enable marking of red packets and to zero to + * disable it. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::mark_vlan_dei_supported + */ +int +rte_tm_mark_vlan_dei(uint16_t port_id, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error); + +/** + * Traffic manager packet marking - IPv4 / IPv6 ECN (IETF RFC 3168) + * + * IETF RFCs 2474 and 3168 reorganize the IPv4 Type of Service (TOS) field + * (8 bits) and the IPv6 Traffic Class (TC) field (8 bits) into Differentiated + * Services Codepoint (DSCP) field (6 bits) and Explicit Congestion + * Notification (ECN) field (2 bits). The DSCP field is typically used to + * encode the traffic class and/or drop priority (RFC 2597), while the ECN + * field is used by RFC 3168 to implement a congestion notification mechanism + * to be leveraged by transport layer protocols such as TCP and SCTP that have + * congestion control mechanisms. + * + * When congestion is experienced, as alternative to dropping the packet, + * routers can change the ECN field of input packets from 2'b01 or 2'b10 + * (values indicating that source endpoint is ECN-capable) to 2'b11 (meaning + * that congestion is experienced). The destination endpoint can use the + * ECN-Echo (ECE) TCP flag to relay the congestion indication back to the + * source endpoint, which acknowledges it back to the destination endpoint with + * the Congestion Window Reduced (CWR) TCP flag. + * + * All IPv4/IPv6 packets of a given color with ECN set to 2’b01 or 2’b10 + * carrying TCP or SCTP have their ECN set to 2’b11 if the marking feature is + * enabled for the current color, otherwise the ECN field is left as is. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mark_green + * Set to non-zero value to enable marking of green packets and to zero to + * disable it. + * @param[in] mark_yellow + * Set to non-zero value to enable marking of yellow packets and to zero to + * disable it. + * @param[in] mark_red + * Set to non-zero value to enable marking of red packets and to zero to + * disable it. 
+ * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::mark_ip_ecn_tcp_supported + * @see struct rte_tm_capabilities::mark_ip_ecn_sctp_supported + */ +int +rte_tm_mark_ip_ecn(uint16_t port_id, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error); + +/** + * Traffic manager packet marking - IPv4 / IPv6 DSCP (IETF RFC 2597) + * + * IETF RFC 2597 maps the traffic class and the drop priority to the IPv4/IPv6 + * Differentiated Services Codepoint (DSCP) field (6 bits). Here are the DSCP + * values proposed by this RFC: + * + *
+ *                      Class 1    Class 2    Class 3    Class 4
+ *                   +----------+----------+----------+----------+
+ * Low Drop Prec     |  001010  |  010010  |  011010  |  100010  |
+ * Medium Drop Prec  |  001100  |  010100  |  011100  |  100100  |
+ * High Drop Prec    |  001110  |  010110  |  011110  |  100110  |
+ *                   +----------+----------+----------+----------+
+ * + * There are 4 traffic classes (classes 1 .. 4) encoded by DSCP bits 1 and 2, + * as well as 3 drop priorities (low/medium/high) encoded by DSCP bits 3 and 4. + * + * All IPv4/IPv6 packets have their color marked into DSCP bits 3 and 4 as + * follows: green mapped to Low Drop Precedence (2’b01), yellow to Medium + * (2’b10) and red to High (2’b11). Marking needs to be explicitly enabled + * for each color; when not enabled for a given color, the DSCP field of all + * packets with that color is left as is. + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[in] mark_green + * Set to non-zero value to enable marking of green packets and to zero to + * disable it. + * @param[in] mark_yellow + * Set to non-zero value to enable marking of yellow packets and to zero to + * disable it. + * @param[in] mark_red + * Set to non-zero value to enable marking of red packets and to zero to + * disable it. + * @param[out] error + * Error details. Filled in only on error, when not NULL. + * @return + * 0 on success, non-zero error code otherwise. + * + * @see struct rte_tm_capabilities::mark_ip_dscp_supported + */ +int +rte_tm_mark_ip_dscp(uint16_t port_id, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error); + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_TM_H__ */ diff --git a/src/spdk/dpdk/lib/librte_ethdev/rte_tm_driver.h b/src/spdk/dpdk/lib/librte_ethdev/rte_tm_driver.h new file mode 100644 index 00000000..90114ff5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ethdev/rte_tm_driver.h @@ -0,0 +1,337 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TM_DRIVER_H__ +#define __INCLUDE_RTE_TM_DRIVER_H__ + +/** + * @file + * RTE Generic Traffic Manager API (Driver Side) + * + * This file provides implementation helpers for internal use by PMDs, they + * are not intended to be exposed to applications and are not subject to ABI + * versioning. 
+ */ + +#include + +#include +#include "rte_ethdev.h" +#include "rte_tm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @internal Traffic manager node ID validate and type get */ +typedef int (*rte_tm_node_type_get_t)(struct rte_eth_dev *dev, + uint32_t node_id, + int *is_leaf, + struct rte_tm_error *error); + +/** @internal Traffic manager capabilities get */ +typedef int (*rte_tm_capabilities_get_t)(struct rte_eth_dev *dev, + struct rte_tm_capabilities *cap, + struct rte_tm_error *error); + +/** @internal Traffic manager level capabilities get */ +typedef int (*rte_tm_level_capabilities_get_t)(struct rte_eth_dev *dev, + uint32_t level_id, + struct rte_tm_level_capabilities *cap, + struct rte_tm_error *error); + +/** @internal Traffic manager node capabilities get */ +typedef int (*rte_tm_node_capabilities_get_t)(struct rte_eth_dev *dev, + uint32_t node_id, + struct rte_tm_node_capabilities *cap, + struct rte_tm_error *error); + +/** @internal Traffic manager WRED profile add */ +typedef int (*rte_tm_wred_profile_add_t)(struct rte_eth_dev *dev, + uint32_t wred_profile_id, + struct rte_tm_wred_params *profile, + struct rte_tm_error *error); + +/** @internal Traffic manager WRED profile delete */ +typedef int (*rte_tm_wred_profile_delete_t)(struct rte_eth_dev *dev, + uint32_t wred_profile_id, + struct rte_tm_error *error); + +/** @internal Traffic manager shared WRED context add */ +typedef int (*rte_tm_shared_wred_context_add_update_t)( + struct rte_eth_dev *dev, + uint32_t shared_wred_context_id, + uint32_t wred_profile_id, + struct rte_tm_error *error); + +/** @internal Traffic manager shared WRED context delete */ +typedef int (*rte_tm_shared_wred_context_delete_t)( + struct rte_eth_dev *dev, + uint32_t shared_wred_context_id, + struct rte_tm_error *error); + +/** @internal Traffic manager shaper profile add */ +typedef int (*rte_tm_shaper_profile_add_t)(struct rte_eth_dev *dev, + uint32_t shaper_profile_id, + struct rte_tm_shaper_params *profile, + struct rte_tm_error *error); + +/** @internal Traffic manager shaper profile delete */ +typedef int (*rte_tm_shaper_profile_delete_t)(struct rte_eth_dev *dev, + uint32_t shaper_profile_id, + struct rte_tm_error *error); + +/** @internal Traffic manager shared shaper add/update */ +typedef int (*rte_tm_shared_shaper_add_update_t)(struct rte_eth_dev *dev, + uint32_t shared_shaper_id, + uint32_t shaper_profile_id, + struct rte_tm_error *error); + +/** @internal Traffic manager shared shaper delete */ +typedef int (*rte_tm_shared_shaper_delete_t)(struct rte_eth_dev *dev, + uint32_t shared_shaper_id, + struct rte_tm_error *error); + +/** @internal Traffic manager node add */ +typedef int (*rte_tm_node_add_t)(struct rte_eth_dev *dev, + uint32_t node_id, + uint32_t parent_node_id, + uint32_t priority, + uint32_t weight, + uint32_t level_id, + struct rte_tm_node_params *params, + struct rte_tm_error *error); + +/** @internal Traffic manager node delete */ +typedef int (*rte_tm_node_delete_t)(struct rte_eth_dev *dev, + uint32_t node_id, + struct rte_tm_error *error); + +/** @internal Traffic manager node suspend */ +typedef int (*rte_tm_node_suspend_t)(struct rte_eth_dev *dev, + uint32_t node_id, + struct rte_tm_error *error); + +/** @internal Traffic manager node resume */ +typedef int (*rte_tm_node_resume_t)(struct rte_eth_dev *dev, + uint32_t node_id, + struct rte_tm_error *error); + +/** @internal Traffic manager hierarchy commit */ +typedef int (*rte_tm_hierarchy_commit_t)(struct rte_eth_dev *dev, + int clear_on_fail, + struct rte_tm_error 
*error); + +/** @internal Traffic manager node parent update */ +typedef int (*rte_tm_node_parent_update_t)(struct rte_eth_dev *dev, + uint32_t node_id, + uint32_t parent_node_id, + uint32_t priority, + uint32_t weight, + struct rte_tm_error *error); + +/** @internal Traffic manager node shaper update */ +typedef int (*rte_tm_node_shaper_update_t)(struct rte_eth_dev *dev, + uint32_t node_id, + uint32_t shaper_profile_id, + struct rte_tm_error *error); + +/** @internal Traffic manager node shaper update */ +typedef int (*rte_tm_node_shared_shaper_update_t)(struct rte_eth_dev *dev, + uint32_t node_id, + uint32_t shared_shaper_id, + int32_t add, + struct rte_tm_error *error); + +/** @internal Traffic manager node stats update */ +typedef int (*rte_tm_node_stats_update_t)(struct rte_eth_dev *dev, + uint32_t node_id, + uint64_t stats_mask, + struct rte_tm_error *error); + +/** @internal Traffic manager node WFQ weight mode update */ +typedef int (*rte_tm_node_wfq_weight_mode_update_t)( + struct rte_eth_dev *dev, + uint32_t node_id, + int *wfq_weight_mode, + uint32_t n_sp_priorities, + struct rte_tm_error *error); + +/** @internal Traffic manager node congestion management mode update */ +typedef int (*rte_tm_node_cman_update_t)(struct rte_eth_dev *dev, + uint32_t node_id, + enum rte_tm_cman_mode cman, + struct rte_tm_error *error); + +/** @internal Traffic manager node WRED context update */ +typedef int (*rte_tm_node_wred_context_update_t)( + struct rte_eth_dev *dev, + uint32_t node_id, + uint32_t wred_profile_id, + struct rte_tm_error *error); + +/** @internal Traffic manager node WRED context update */ +typedef int (*rte_tm_node_shared_wred_context_update_t)( + struct rte_eth_dev *dev, + uint32_t node_id, + uint32_t shared_wred_context_id, + int add, + struct rte_tm_error *error); + +/** @internal Traffic manager read stats counters for specific node */ +typedef int (*rte_tm_node_stats_read_t)(struct rte_eth_dev *dev, + uint32_t node_id, + struct rte_tm_node_stats *stats, + uint64_t *stats_mask, + int clear, + struct rte_tm_error *error); + +/** @internal Traffic manager packet marking - VLAN DEI */ +typedef int (*rte_tm_mark_vlan_dei_t)(struct rte_eth_dev *dev, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error); + +/** @internal Traffic manager packet marking - IPv4/IPv6 ECN */ +typedef int (*rte_tm_mark_ip_ecn_t)(struct rte_eth_dev *dev, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error); + +/** @internal Traffic manager packet marking - IPv4/IPv6 DSCP */ +typedef int (*rte_tm_mark_ip_dscp_t)(struct rte_eth_dev *dev, + int mark_green, + int mark_yellow, + int mark_red, + struct rte_tm_error *error); + +struct rte_tm_ops { + /** Traffic manager node type get */ + rte_tm_node_type_get_t node_type_get; + + /** Traffic manager capabilities_get */ + rte_tm_capabilities_get_t capabilities_get; + /** Traffic manager level capabilities_get */ + rte_tm_level_capabilities_get_t level_capabilities_get; + /** Traffic manager node capabilities get */ + rte_tm_node_capabilities_get_t node_capabilities_get; + + /** Traffic manager WRED profile add */ + rte_tm_wred_profile_add_t wred_profile_add; + /** Traffic manager WRED profile delete */ + rte_tm_wred_profile_delete_t wred_profile_delete; + /** Traffic manager shared WRED context add/update */ + rte_tm_shared_wred_context_add_update_t + shared_wred_context_add_update; + /** Traffic manager shared WRED context delete */ + rte_tm_shared_wred_context_delete_t + shared_wred_context_delete; + + 
/** Traffic manager shaper profile add */ + rte_tm_shaper_profile_add_t shaper_profile_add; + /** Traffic manager shaper profile delete */ + rte_tm_shaper_profile_delete_t shaper_profile_delete; + /** Traffic manager shared shaper add/update */ + rte_tm_shared_shaper_add_update_t shared_shaper_add_update; + /** Traffic manager shared shaper delete */ + rte_tm_shared_shaper_delete_t shared_shaper_delete; + + /** Traffic manager node add */ + rte_tm_node_add_t node_add; + /** Traffic manager node delete */ + rte_tm_node_delete_t node_delete; + /** Traffic manager node suspend */ + rte_tm_node_suspend_t node_suspend; + /** Traffic manager node resume */ + rte_tm_node_resume_t node_resume; + /** Traffic manager hierarchy commit */ + rte_tm_hierarchy_commit_t hierarchy_commit; + + /** Traffic manager node parent update */ + rte_tm_node_parent_update_t node_parent_update; + /** Traffic manager node shaper update */ + rte_tm_node_shaper_update_t node_shaper_update; + /** Traffic manager node shared shaper update */ + rte_tm_node_shared_shaper_update_t node_shared_shaper_update; + /** Traffic manager node stats update */ + rte_tm_node_stats_update_t node_stats_update; + /** Traffic manager node WFQ weight mode update */ + rte_tm_node_wfq_weight_mode_update_t node_wfq_weight_mode_update; + /** Traffic manager node congestion management mode update */ + rte_tm_node_cman_update_t node_cman_update; + /** Traffic manager node WRED context update */ + rte_tm_node_wred_context_update_t node_wred_context_update; + /** Traffic manager node shared WRED context update */ + rte_tm_node_shared_wred_context_update_t + node_shared_wred_context_update; + /** Traffic manager read statistics counters for current node */ + rte_tm_node_stats_read_t node_stats_read; + + /** Traffic manager packet marking - VLAN DEI */ + rte_tm_mark_vlan_dei_t mark_vlan_dei; + /** Traffic manager packet marking - IPv4/IPv6 ECN */ + rte_tm_mark_ip_ecn_t mark_ip_ecn; + /** Traffic manager packet marking - IPv4/IPv6 DSCP */ + rte_tm_mark_ip_dscp_t mark_ip_dscp; +}; + +/** + * Initialize generic error structure. + * + * This function also sets rte_errno to a given value. + * + * @param[out] error + * Pointer to error structure (may be NULL). + * @param[in] code + * Related error code (rte_errno). + * @param[in] type + * Cause field and error type. + * @param[in] cause + * Object responsible for the error. + * @param[in] message + * Human-readable error message. + * + * @return + * Error code. + */ +static inline int +rte_tm_error_set(struct rte_tm_error *error, + int code, + enum rte_tm_error_type type, + const void *cause, + const char *message) +{ + if (error) { + *error = (struct rte_tm_error){ + .type = type, + .cause = cause, + .message = message, + }; + } + rte_errno = code; + return code; +} + +/** + * Get generic traffic manager operations structure from a port + * + * @param[in] port_id + * The port identifier of the Ethernet device. + * @param[out] error + * Error details + * + * @return + * The traffic manager operations structure associated with port_id on + * success, NULL otherwise. 
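+ *
+ * As an illustration only (the my_pmd_* names and MY_PMD_MAX_NODES are
+ * hypothetical), a PMD typically implements the callbacks, reports failures
+ * through rte_tm_error_set() and groups them in a struct rte_tm_ops, which
+ * this function then retrieves for the generic rte_tm layer:
+ *
+ * @code
+ * static int
+ * my_pmd_node_type_get(struct rte_eth_dev *dev, uint32_t node_id,
+ *         int *is_leaf, struct rte_tm_error *error)
+ * {
+ *         if (is_leaf == NULL || node_id >= MY_PMD_MAX_NODES)
+ *                 return -rte_tm_error_set(error, EINVAL,
+ *                         RTE_TM_ERROR_TYPE_NODE_ID, NULL, "invalid node id");
+ *         *is_leaf = node_id < dev->data->nb_tx_queues;
+ *         return 0;
+ * }
+ *
+ * static const struct rte_tm_ops my_pmd_tm_ops = {
+ *         .node_type_get = my_pmd_node_type_get,
+ *         // remaining callbacks omitted
+ * };
+ * @endcode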
+ */ +const struct rte_tm_ops * +rte_tm_ops_get(uint16_t port_id, struct rte_tm_error *error); + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_TM_DRIVER_H__ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/Makefile b/src/spdk/dpdk/lib/librte_eventdev/Makefile new file mode 100644 index 00000000..47f599a6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/Makefile @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2016 Cavium, Inc +# + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_eventdev.a + +# library version +LIBABIVER := 5 + +# build flags +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) +CFLAGS += -DLINUX +else +CFLAGS += -DBSD +endif +LDLIBS += -lrte_eal -lrte_ring -lrte_ethdev -lrte_hash -lrte_mempool -lrte_timer +LDLIBS += -lrte_mbuf -lrte_cryptodev -lpthread + +# library source files +SRCS-y += rte_eventdev.c +SRCS-y += rte_event_ring.c +SRCS-y += rte_event_eth_rx_adapter.c +SRCS-y += rte_event_timer_adapter.c +SRCS-y += rte_event_crypto_adapter.c + +# export include files +SYMLINK-y-include += rte_eventdev.h +SYMLINK-y-include += rte_eventdev_pmd.h +SYMLINK-y-include += rte_eventdev_pmd_pci.h +SYMLINK-y-include += rte_eventdev_pmd_vdev.h +SYMLINK-y-include += rte_event_ring.h +SYMLINK-y-include += rte_event_eth_rx_adapter.h +SYMLINK-y-include += rte_event_timer_adapter.h +SYMLINK-y-include += rte_event_timer_adapter_pmd.h +SYMLINK-y-include += rte_event_crypto_adapter.h + +# versioning export map +EXPORT_MAP := rte_eventdev_version.map + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_eventdev/meson.build b/src/spdk/dpdk/lib/librte_eventdev/meson.build new file mode 100644 index 00000000..3cbaf298 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/meson.build @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 5 +allow_experimental_apis = true + +if host_machine.system() == 'linux' + cflags += '-DLINUX' +else + cflags += '-DBSD' +endif + +sources = files('rte_eventdev.c', + 'rte_event_ring.c', + 'rte_event_eth_rx_adapter.c', + 'rte_event_timer_adapter.c', + 'rte_event_crypto_adapter.c') +headers = files('rte_eventdev.h', + 'rte_eventdev_pmd.h', + 'rte_eventdev_pmd_pci.h', + 'rte_eventdev_pmd_vdev.h', + 'rte_event_ring.h', + 'rte_event_eth_rx_adapter.h', + 'rte_event_timer_adapter.h', + 'rte_event_timer_adapter_pmd.h', + 'rte_event_crypto_adapter.h') +deps += ['ring', 'ethdev', 'hash', 'mempool', 'mbuf', 'timer', 'cryptodev'] diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_crypto_adapter.c b/src/spdk/dpdk/lib/librte_eventdev/rte_event_crypto_adapter.c new file mode 100644 index 00000000..11b28ca9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_crypto_adapter.c @@ -0,0 +1,1128 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + * All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_eventdev.h" +#include "rte_eventdev_pmd.h" +#include "rte_event_crypto_adapter.h" + +#define BATCH_SIZE 32 +#define DEFAULT_MAX_NB 128 +#define CRYPTO_ADAPTER_NAME_LEN 32 +#define CRYPTO_ADAPTER_MEM_NAME_LEN 32 +#define CRYPTO_ADAPTER_MAX_EV_ENQ_RETRIES 100 + +/* Flush an instance's enqueue buffers every CRYPTO_ENQ_FLUSH_THRESHOLD + * iterations of eca_crypto_adapter_enq_run() + */ +#define CRYPTO_ENQ_FLUSH_THRESHOLD 1024 + +struct rte_event_crypto_adapter { + /* Event device identifier */ + uint8_t eventdev_id; + /* Event port identifier */ + uint8_t event_port_id; + /* Store event device's implicit release capability */ + uint8_t implicit_release_disabled; + /* Max crypto ops processed in any service function invocation */ + uint32_t max_nb; + /* Lock to serialize config updates with service function */ + rte_spinlock_t lock; + /* Next crypto device to be processed */ + uint16_t next_cdev_id; + /* Per crypto device structure */ + struct crypto_device_info *cdevs; + /* Loop counter to flush crypto ops */ + uint16_t transmit_loop_count; + /* Per instance stats structure */ + struct rte_event_crypto_adapter_stats crypto_stats; + /* Configuration callback for rte_service configuration */ + rte_event_crypto_adapter_conf_cb conf_cb; + /* Configuration callback argument */ + void *conf_arg; + /* Set if default_cb is being used */ + int default_cb_arg; + /* Service initialization state */ + uint8_t service_inited; + /* Memory allocation name */ + char mem_name[CRYPTO_ADAPTER_MEM_NAME_LEN]; + /* Socket identifier cached from eventdev */ + int socket_id; + /* Per adapter EAL service */ + uint32_t service_id; + /* No. of queue pairs configured */ + uint16_t nb_qps; + /* Adapter mode */ + enum rte_event_crypto_adapter_mode mode; +} __rte_cache_aligned; + +/* Per crypto device information */ +struct crypto_device_info { + /* Pointer to cryptodev */ + struct rte_cryptodev *dev; + /* Pointer to queue pair info */ + struct crypto_queue_pair_info *qpairs; + /* Next queue pair to be processed */ + uint16_t next_queue_pair_id; + /* Set to indicate cryptodev->eventdev packet + * transfer uses a hardware mechanism + */ + uint8_t internal_event_port; + /* Set to indicate processing has been started */ + uint8_t dev_started; + /* If num_qpairs > 0, the start callback will + * be invoked if not already invoked + */ + uint16_t num_qpairs; +} __rte_cache_aligned; + +/* Per queue pair information */ +struct crypto_queue_pair_info { + /* Set to indicate queue pair is enabled */ + bool qp_enabled; + /* Pointer to hold rte_crypto_ops for batching */ + struct rte_crypto_op **op_buffer; + /* No of crypto ops accumulated */ + uint8_t len; +} __rte_cache_aligned; + +static struct rte_event_crypto_adapter **event_crypto_adapter; + +/* Macros to check for valid adapter */ +#define EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, retval) do { \ + if (!eca_valid_id(id)) { \ + RTE_EDEV_LOG_ERR("Invalid crypto adapter id = %d\n", id); \ + return retval; \ + } \ +} while (0) + +static inline int +eca_valid_id(uint8_t id) +{ + return id < RTE_EVENT_CRYPTO_ADAPTER_MAX_INSTANCE; +} + +static int +eca_init(void) +{ + const char *name = "crypto_adapter_array"; + const struct rte_memzone *mz; + unsigned int sz; + + sz = sizeof(*event_crypto_adapter) * + RTE_EVENT_CRYPTO_ADAPTER_MAX_INSTANCE; + sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE); + + mz = rte_memzone_lookup(name); + if (mz == NULL) { + mz = 
rte_memzone_reserve_aligned(name, sz, rte_socket_id(), 0, + RTE_CACHE_LINE_SIZE); + if (mz == NULL) { + RTE_EDEV_LOG_ERR("failed to reserve memzone err = %" + PRId32, rte_errno); + return -rte_errno; + } + } + + event_crypto_adapter = mz->addr; + return 0; +} + +static inline struct rte_event_crypto_adapter * +eca_id_to_adapter(uint8_t id) +{ + return event_crypto_adapter ? + event_crypto_adapter[id] : NULL; +} + +static int +eca_default_config_cb(uint8_t id, uint8_t dev_id, + struct rte_event_crypto_adapter_conf *conf, void *arg) +{ + struct rte_event_dev_config dev_conf; + struct rte_eventdev *dev; + uint8_t port_id; + int started; + int ret; + struct rte_event_port_conf *port_conf = arg; + struct rte_event_crypto_adapter *adapter = eca_id_to_adapter(id); + + dev = &rte_eventdevs[adapter->eventdev_id]; + dev_conf = dev->data->dev_conf; + + started = dev->data->dev_started; + if (started) + rte_event_dev_stop(dev_id); + port_id = dev_conf.nb_event_ports; + dev_conf.nb_event_ports += 1; + ret = rte_event_dev_configure(dev_id, &dev_conf); + if (ret) { + RTE_EDEV_LOG_ERR("failed to configure event dev %u\n", dev_id); + if (started) { + if (rte_event_dev_start(dev_id)) + return -EIO; + } + return ret; + } + + ret = rte_event_port_setup(dev_id, port_id, port_conf); + if (ret) { + RTE_EDEV_LOG_ERR("failed to setup event port %u\n", port_id); + return ret; + } + + conf->event_port_id = port_id; + conf->max_nb = DEFAULT_MAX_NB; + if (started) + ret = rte_event_dev_start(dev_id); + + adapter->default_cb_arg = 1; + return ret; +} + +int __rte_experimental +rte_event_crypto_adapter_create_ext(uint8_t id, uint8_t dev_id, + rte_event_crypto_adapter_conf_cb conf_cb, + enum rte_event_crypto_adapter_mode mode, + void *conf_arg) +{ + struct rte_event_crypto_adapter *adapter; + char mem_name[CRYPTO_ADAPTER_NAME_LEN]; + struct rte_event_dev_info dev_info; + int socket_id; + uint8_t i; + int ret; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + if (conf_cb == NULL) + return -EINVAL; + + if (event_crypto_adapter == NULL) { + ret = eca_init(); + if (ret) + return ret; + } + + adapter = eca_id_to_adapter(id); + if (adapter != NULL) { + RTE_EDEV_LOG_ERR("Crypto adapter id %u already exists!", id); + return -EEXIST; + } + + socket_id = rte_event_dev_socket_id(dev_id); + snprintf(mem_name, CRYPTO_ADAPTER_MEM_NAME_LEN, + "rte_event_crypto_adapter_%d", id); + + adapter = rte_zmalloc_socket(mem_name, sizeof(*adapter), + RTE_CACHE_LINE_SIZE, socket_id); + if (adapter == NULL) { + RTE_EDEV_LOG_ERR("Failed to get mem for event crypto adapter!"); + return -ENOMEM; + } + + ret = rte_event_dev_info_get(dev_id, &dev_info); + if (ret < 0) { + RTE_EDEV_LOG_ERR("Failed to get info for eventdev %d: %s!", + dev_id, dev_info.driver_name); + return ret; + } + + adapter->implicit_release_disabled = (dev_info.event_dev_cap & + RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE); + adapter->eventdev_id = dev_id; + adapter->socket_id = socket_id; + adapter->conf_cb = conf_cb; + adapter->conf_arg = conf_arg; + adapter->mode = mode; + strcpy(adapter->mem_name, mem_name); + adapter->cdevs = rte_zmalloc_socket(adapter->mem_name, + rte_cryptodev_count() * + sizeof(struct crypto_device_info), 0, + socket_id); + if (adapter->cdevs == NULL) { + RTE_EDEV_LOG_ERR("Failed to get mem for crypto devices\n"); + rte_free(adapter); + return -ENOMEM; + } + + rte_spinlock_init(&adapter->lock); + for (i = 0; i < rte_cryptodev_count(); i++) + adapter->cdevs[i].dev = rte_cryptodev_pmd_get_dev(i); 
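+	/*
+	 * Note: only the cryptodev handles are cached here; queue pairs are
+	 * bound to the adapter later via
+	 * rte_event_crypto_adapter_queue_pair_add().
+	 */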
+ + event_crypto_adapter[id] = adapter; + + return 0; +} + + +int __rte_experimental +rte_event_crypto_adapter_create(uint8_t id, uint8_t dev_id, + struct rte_event_port_conf *port_config, + enum rte_event_crypto_adapter_mode mode) +{ + struct rte_event_port_conf *pc; + int ret; + + if (port_config == NULL) + return -EINVAL; + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + pc = rte_malloc(NULL, sizeof(*pc), 0); + if (pc == NULL) + return -ENOMEM; + *pc = *port_config; + ret = rte_event_crypto_adapter_create_ext(id, dev_id, + eca_default_config_cb, + mode, + pc); + if (ret) + rte_free(pc); + + return ret; +} + +int __rte_experimental +rte_event_crypto_adapter_free(uint8_t id) +{ + struct rte_event_crypto_adapter *adapter; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + adapter = eca_id_to_adapter(id); + if (adapter == NULL) + return -EINVAL; + + if (adapter->nb_qps) { + RTE_EDEV_LOG_ERR("%" PRIu16 "Queue pairs not deleted", + adapter->nb_qps); + return -EBUSY; + } + + if (adapter->default_cb_arg) + rte_free(adapter->conf_arg); + rte_free(adapter->cdevs); + rte_free(adapter); + event_crypto_adapter[id] = NULL; + + return 0; +} + +static inline unsigned int +eca_enq_to_cryptodev(struct rte_event_crypto_adapter *adapter, + struct rte_event *ev, unsigned int cnt) +{ + struct rte_event_crypto_adapter_stats *stats = &adapter->crypto_stats; + union rte_event_crypto_metadata *m_data = NULL; + struct crypto_queue_pair_info *qp_info = NULL; + struct rte_crypto_op *crypto_op; + unsigned int i, n; + uint16_t qp_id, len, ret; + uint8_t cdev_id; + + len = 0; + ret = 0; + n = 0; + stats->event_deq_count += cnt; + + for (i = 0; i < cnt; i++) { + crypto_op = ev[i].event_ptr; + if (crypto_op == NULL) + continue; + if (crypto_op->sess_type == RTE_CRYPTO_OP_WITH_SESSION) { + m_data = rte_cryptodev_sym_session_get_user_data( + crypto_op->sym->session); + if (m_data == NULL) { + rte_pktmbuf_free(crypto_op->sym->m_src); + rte_crypto_op_free(crypto_op); + continue; + } + + cdev_id = m_data->request_info.cdev_id; + qp_id = m_data->request_info.queue_pair_id; + qp_info = &adapter->cdevs[cdev_id].qpairs[qp_id]; + if (qp_info == NULL) { + rte_pktmbuf_free(crypto_op->sym->m_src); + rte_crypto_op_free(crypto_op); + continue; + } + len = qp_info->len; + qp_info->op_buffer[len] = crypto_op; + len++; + } else if (crypto_op->sess_type == RTE_CRYPTO_OP_SESSIONLESS && + crypto_op->private_data_offset) { + m_data = (union rte_event_crypto_metadata *) + ((uint8_t *)crypto_op + + crypto_op->private_data_offset); + cdev_id = m_data->request_info.cdev_id; + qp_id = m_data->request_info.queue_pair_id; + qp_info = &adapter->cdevs[cdev_id].qpairs[qp_id]; + if (qp_info == NULL) { + rte_pktmbuf_free(crypto_op->sym->m_src); + rte_crypto_op_free(crypto_op); + continue; + } + len = qp_info->len; + qp_info->op_buffer[len] = crypto_op; + len++; + } else { + rte_pktmbuf_free(crypto_op->sym->m_src); + rte_crypto_op_free(crypto_op); + continue; + } + + if (len == BATCH_SIZE) { + struct rte_crypto_op **op_buffer = qp_info->op_buffer; + ret = rte_cryptodev_enqueue_burst(cdev_id, + qp_id, + op_buffer, + BATCH_SIZE); + + stats->crypto_enq_count += ret; + + while (ret < len) { + struct rte_crypto_op *op; + op = op_buffer[ret++]; + stats->crypto_enq_fail++; + rte_pktmbuf_free(op->sym->m_src); + rte_crypto_op_free(op); + } + + len = 0; + } + + if (qp_info) + qp_info->len = len; + n += ret; + } + + return n; +} + +static unsigned int +eca_crypto_enq_flush(struct rte_event_crypto_adapter *adapter) +{ + struct 
rte_event_crypto_adapter_stats *stats = &adapter->crypto_stats; + struct crypto_device_info *curr_dev; + struct crypto_queue_pair_info *curr_queue; + struct rte_crypto_op **op_buffer; + struct rte_cryptodev *dev; + uint8_t cdev_id; + uint16_t qp; + uint16_t ret; + uint16_t num_cdev = rte_cryptodev_count(); + + ret = 0; + for (cdev_id = 0; cdev_id < num_cdev; cdev_id++) { + curr_dev = &adapter->cdevs[cdev_id]; + if (curr_dev == NULL) + continue; + dev = curr_dev->dev; + + for (qp = 0; qp < dev->data->nb_queue_pairs; qp++) { + + curr_queue = &curr_dev->qpairs[qp]; + if (!curr_queue->qp_enabled) + continue; + + op_buffer = curr_queue->op_buffer; + ret = rte_cryptodev_enqueue_burst(cdev_id, + qp, + op_buffer, + curr_queue->len); + stats->crypto_enq_count += ret; + + while (ret < curr_queue->len) { + struct rte_crypto_op *op; + op = op_buffer[ret++]; + stats->crypto_enq_fail++; + rte_pktmbuf_free(op->sym->m_src); + rte_crypto_op_free(op); + } + curr_queue->len = 0; + } + } + + return ret; +} + +static int +eca_crypto_adapter_enq_run(struct rte_event_crypto_adapter *adapter, + unsigned int max_enq) +{ + struct rte_event_crypto_adapter_stats *stats = &adapter->crypto_stats; + struct rte_event ev[BATCH_SIZE]; + unsigned int nb_enq, nb_enqueued; + uint16_t n; + uint8_t event_dev_id = adapter->eventdev_id; + uint8_t event_port_id = adapter->event_port_id; + + nb_enqueued = 0; + if (adapter->mode == RTE_EVENT_CRYPTO_ADAPTER_OP_NEW) + return 0; + + for (nb_enq = 0; nb_enq < max_enq; nb_enq += n) { + stats->event_poll_count++; + n = rte_event_dequeue_burst(event_dev_id, + event_port_id, ev, BATCH_SIZE, 0); + + if (!n) + break; + + nb_enqueued += eca_enq_to_cryptodev(adapter, ev, n); + } + + if ((++adapter->transmit_loop_count & + (CRYPTO_ENQ_FLUSH_THRESHOLD - 1)) == 0) { + nb_enqueued += eca_crypto_enq_flush(adapter); + } + + return nb_enqueued; +} + +static inline void +eca_ops_enqueue_burst(struct rte_event_crypto_adapter *adapter, + struct rte_crypto_op **ops, uint16_t num) +{ + struct rte_event_crypto_adapter_stats *stats = &adapter->crypto_stats; + union rte_event_crypto_metadata *m_data = NULL; + uint8_t event_dev_id = adapter->eventdev_id; + uint8_t event_port_id = adapter->event_port_id; + struct rte_event events[BATCH_SIZE]; + uint16_t nb_enqueued, nb_ev; + uint8_t retry; + uint8_t i; + + nb_ev = 0; + retry = 0; + nb_enqueued = 0; + num = RTE_MIN(num, BATCH_SIZE); + for (i = 0; i < num; i++) { + struct rte_event *ev = &events[nb_ev++]; + if (ops[i]->sess_type == RTE_CRYPTO_OP_WITH_SESSION) { + m_data = rte_cryptodev_sym_session_get_user_data( + ops[i]->sym->session); + } else if (ops[i]->sess_type == RTE_CRYPTO_OP_SESSIONLESS && + ops[i]->private_data_offset) { + m_data = (union rte_event_crypto_metadata *) + ((uint8_t *)ops[i] + + ops[i]->private_data_offset); + } + + if (unlikely(m_data == NULL)) { + rte_pktmbuf_free(ops[i]->sym->m_src); + rte_crypto_op_free(ops[i]); + continue; + } + + rte_memcpy(ev, &m_data->response_info, sizeof(*ev)); + ev->event_ptr = ops[i]; + ev->event_type = RTE_EVENT_TYPE_CRYPTODEV; + if (adapter->implicit_release_disabled) + ev->op = RTE_EVENT_OP_FORWARD; + else + ev->op = RTE_EVENT_OP_NEW; + } + + do { + nb_enqueued += rte_event_enqueue_burst(event_dev_id, + event_port_id, + &events[nb_enqueued], + nb_ev - nb_enqueued); + } while (retry++ < CRYPTO_ADAPTER_MAX_EV_ENQ_RETRIES && + nb_enqueued < nb_ev); + + /* Free mbufs and rte_crypto_ops for failed events */ + for (i = nb_enqueued; i < nb_ev; i++) { + struct rte_crypto_op *op = events[i].event_ptr; + 
rte_pktmbuf_free(op->sym->m_src); + rte_crypto_op_free(op); + } + + stats->event_enq_fail_count += nb_ev - nb_enqueued; + stats->event_enq_count += nb_enqueued; + stats->event_enq_retry_count += retry - 1; +} + +static inline unsigned int +eca_crypto_adapter_deq_run(struct rte_event_crypto_adapter *adapter, + unsigned int max_deq) +{ + struct rte_event_crypto_adapter_stats *stats = &adapter->crypto_stats; + struct crypto_device_info *curr_dev; + struct crypto_queue_pair_info *curr_queue; + struct rte_crypto_op *ops[BATCH_SIZE]; + uint16_t n, nb_deq; + struct rte_cryptodev *dev; + uint8_t cdev_id; + uint16_t qp, dev_qps; + bool done; + uint16_t num_cdev = rte_cryptodev_count(); + + nb_deq = 0; + do { + uint16_t queues = 0; + done = true; + + for (cdev_id = adapter->next_cdev_id; + cdev_id < num_cdev; cdev_id++) { + curr_dev = &adapter->cdevs[cdev_id]; + if (curr_dev == NULL) + continue; + dev = curr_dev->dev; + dev_qps = dev->data->nb_queue_pairs; + + for (qp = curr_dev->next_queue_pair_id; + queues < dev_qps; qp = (qp + 1) % dev_qps, + queues++) { + + curr_queue = &curr_dev->qpairs[qp]; + if (!curr_queue->qp_enabled) + continue; + + n = rte_cryptodev_dequeue_burst(cdev_id, qp, + ops, BATCH_SIZE); + if (!n) + continue; + + done = false; + stats->crypto_deq_count += n; + eca_ops_enqueue_burst(adapter, ops, n); + nb_deq += n; + + if (nb_deq > max_deq) { + if ((qp + 1) == dev_qps) { + adapter->next_cdev_id = + (cdev_id + 1) + % num_cdev; + } + curr_dev->next_queue_pair_id = (qp + 1) + % dev->data->nb_queue_pairs; + + return nb_deq; + } + } + } + } while (done == false); + return nb_deq; +} + +static void +eca_crypto_adapter_run(struct rte_event_crypto_adapter *adapter, + unsigned int max_ops) +{ + while (max_ops) { + unsigned int e_cnt, d_cnt; + + e_cnt = eca_crypto_adapter_deq_run(adapter, max_ops); + max_ops -= RTE_MIN(max_ops, e_cnt); + + d_cnt = eca_crypto_adapter_enq_run(adapter, max_ops); + max_ops -= RTE_MIN(max_ops, d_cnt); + + if (e_cnt == 0 && d_cnt == 0) + break; + + } +} + +static int +eca_service_func(void *args) +{ + struct rte_event_crypto_adapter *adapter = args; + + if (rte_spinlock_trylock(&adapter->lock) == 0) + return 0; + eca_crypto_adapter_run(adapter, adapter->max_nb); + rte_spinlock_unlock(&adapter->lock); + + return 0; +} + +static int +eca_init_service(struct rte_event_crypto_adapter *adapter, uint8_t id) +{ + struct rte_event_crypto_adapter_conf adapter_conf; + struct rte_service_spec service; + int ret; + + if (adapter->service_inited) + return 0; + + memset(&service, 0, sizeof(service)); + snprintf(service.name, CRYPTO_ADAPTER_NAME_LEN, + "rte_event_crypto_adapter_%d", id); + service.socket_id = adapter->socket_id; + service.callback = eca_service_func; + service.callback_userdata = adapter; + /* Service function handles locking for queue add/del updates */ + service.capabilities = RTE_SERVICE_CAP_MT_SAFE; + ret = rte_service_component_register(&service, &adapter->service_id); + if (ret) { + RTE_EDEV_LOG_ERR("failed to register service %s err = %" PRId32, + service.name, ret); + return ret; + } + + ret = adapter->conf_cb(id, adapter->eventdev_id, + &adapter_conf, adapter->conf_arg); + if (ret) { + RTE_EDEV_LOG_ERR("configuration callback failed err = %" PRId32, + ret); + return ret; + } + + adapter->max_nb = adapter_conf.max_nb; + adapter->event_port_id = adapter_conf.event_port_id; + adapter->service_inited = 1; + + return ret; +} + +static void +eca_update_qp_info(struct rte_event_crypto_adapter *adapter, + struct crypto_device_info *dev_info, + int32_t 
queue_pair_id, + uint8_t add) +{ + struct crypto_queue_pair_info *qp_info; + int enabled; + uint16_t i; + + if (dev_info->qpairs == NULL) + return; + + if (queue_pair_id == -1) { + for (i = 0; i < dev_info->dev->data->nb_queue_pairs; i++) + eca_update_qp_info(adapter, dev_info, i, add); + } else { + qp_info = &dev_info->qpairs[queue_pair_id]; + enabled = qp_info->qp_enabled; + if (add) { + adapter->nb_qps += !enabled; + dev_info->num_qpairs += !enabled; + } else { + adapter->nb_qps -= enabled; + dev_info->num_qpairs -= enabled; + } + qp_info->qp_enabled = !!add; + } +} + +static int +eca_add_queue_pair(struct rte_event_crypto_adapter *adapter, + uint8_t cdev_id, + int queue_pair_id) +{ + struct crypto_device_info *dev_info = &adapter->cdevs[cdev_id]; + struct crypto_queue_pair_info *qpairs; + uint32_t i; + + if (dev_info->qpairs == NULL) { + dev_info->qpairs = + rte_zmalloc_socket(adapter->mem_name, + dev_info->dev->data->nb_queue_pairs * + sizeof(struct crypto_queue_pair_info), + 0, adapter->socket_id); + if (dev_info->qpairs == NULL) + return -ENOMEM; + + qpairs = dev_info->qpairs; + qpairs->op_buffer = rte_zmalloc_socket(adapter->mem_name, + BATCH_SIZE * + sizeof(struct rte_crypto_op *), + 0, adapter->socket_id); + if (!qpairs->op_buffer) { + rte_free(qpairs); + return -ENOMEM; + } + } + + if (queue_pair_id == -1) { + for (i = 0; i < dev_info->dev->data->nb_queue_pairs; i++) + eca_update_qp_info(adapter, dev_info, i, 1); + } else + eca_update_qp_info(adapter, dev_info, + (uint16_t)queue_pair_id, 1); + + return 0; +} + +int __rte_experimental +rte_event_crypto_adapter_queue_pair_add(uint8_t id, + uint8_t cdev_id, + int32_t queue_pair_id, + const struct rte_event *event) +{ + struct rte_event_crypto_adapter *adapter; + struct rte_eventdev *dev; + struct crypto_device_info *dev_info; + uint32_t cap; + int ret; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + if (!rte_cryptodev_pmd_is_valid_dev(cdev_id)) { + RTE_EDEV_LOG_ERR("Invalid dev_id=%" PRIu8, cdev_id); + return -EINVAL; + } + + adapter = eca_id_to_adapter(id); + if (adapter == NULL) + return -EINVAL; + + dev = &rte_eventdevs[adapter->eventdev_id]; + ret = rte_event_crypto_adapter_caps_get(adapter->eventdev_id, + cdev_id, + &cap); + if (ret) { + RTE_EDEV_LOG_ERR("Failed to get adapter caps dev %" PRIu8 + " cdev %" PRIu8, id, cdev_id); + return ret; + } + + if ((cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND) && + (event == NULL)) { + RTE_EDEV_LOG_ERR("Conf value can not be NULL for dev_id=%u", + cdev_id); + return -EINVAL; + } + + dev_info = &adapter->cdevs[cdev_id]; + + if (queue_pair_id != -1 && + (uint16_t)queue_pair_id >= dev_info->dev->data->nb_queue_pairs) { + RTE_EDEV_LOG_ERR("Invalid queue_pair_id %" PRIu16, + (uint16_t)queue_pair_id); + return -EINVAL; + } + + /* In case HW cap is RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD, + * no need of service core as HW supports event forward capability. 
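+	 * The same is true in OP_NEW mode when the PMD reports the
+	 * INTERNAL_PORT_QP_EV_BIND or INTERNAL_PORT_OP_NEW capability:
+	 * in those cases the queue pair is registered with the eventdev
+	 * PMD below and the SW service path is not used for it.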
+ */ + if ((cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD) || + (cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND && + adapter->mode == RTE_EVENT_CRYPTO_ADAPTER_OP_NEW) || + (cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_NEW && + adapter->mode == RTE_EVENT_CRYPTO_ADAPTER_OP_NEW)) { + RTE_FUNC_PTR_OR_ERR_RET( + *dev->dev_ops->crypto_adapter_queue_pair_add, + -ENOTSUP); + if (dev_info->qpairs == NULL) { + dev_info->qpairs = + rte_zmalloc_socket(adapter->mem_name, + dev_info->dev->data->nb_queue_pairs * + sizeof(struct crypto_queue_pair_info), + 0, adapter->socket_id); + if (dev_info->qpairs == NULL) + return -ENOMEM; + } + + ret = (*dev->dev_ops->crypto_adapter_queue_pair_add)(dev, + dev_info->dev, + queue_pair_id, + event); + if (ret) + return ret; + + else + eca_update_qp_info(adapter, &adapter->cdevs[cdev_id], + queue_pair_id, 1); + } + + /* In case HW cap is RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_NEW, + * or SW adapter, initiate services so the application can choose + * which ever way it wants to use the adapter. + * Case 1: RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_NEW + * Application may wants to use one of below two mode + * a. OP_FORWARD mode -> HW Dequeue + SW enqueue + * b. OP_NEW mode -> HW Dequeue + * Case 2: No HW caps, use SW adapter + * a. OP_FORWARD mode -> SW enqueue & dequeue + * b. OP_NEW mode -> SW Dequeue + */ + if ((cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_NEW && + adapter->mode == RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD) || + (!(cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_NEW) && + !(cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD) && + !(cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND) && + (cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_SESSION_PRIVATE_DATA))) { + rte_spinlock_lock(&adapter->lock); + ret = eca_init_service(adapter, id); + if (ret == 0) + ret = eca_add_queue_pair(adapter, cdev_id, + queue_pair_id); + rte_spinlock_unlock(&adapter->lock); + + if (ret) + return ret; + + rte_service_component_runstate_set(adapter->service_id, 1); + } + + return 0; +} + +int __rte_experimental +rte_event_crypto_adapter_queue_pair_del(uint8_t id, uint8_t cdev_id, + int32_t queue_pair_id) +{ + struct rte_event_crypto_adapter *adapter; + struct crypto_device_info *dev_info; + struct rte_eventdev *dev; + int ret; + uint32_t cap; + uint16_t i; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + if (!rte_cryptodev_pmd_is_valid_dev(cdev_id)) { + RTE_EDEV_LOG_ERR("Invalid dev_id=%" PRIu8, cdev_id); + return -EINVAL; + } + + adapter = eca_id_to_adapter(id); + if (adapter == NULL) + return -EINVAL; + + dev = &rte_eventdevs[adapter->eventdev_id]; + ret = rte_event_crypto_adapter_caps_get(adapter->eventdev_id, + cdev_id, + &cap); + if (ret) + return ret; + + dev_info = &adapter->cdevs[cdev_id]; + + if (queue_pair_id != -1 && + (uint16_t)queue_pair_id >= dev_info->dev->data->nb_queue_pairs) { + RTE_EDEV_LOG_ERR("Invalid queue_pair_id %" PRIu16, + (uint16_t)queue_pair_id); + return -EINVAL; + } + + if ((cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD) || + (cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_NEW && + adapter->mode == RTE_EVENT_CRYPTO_ADAPTER_OP_NEW)) { + RTE_FUNC_PTR_OR_ERR_RET( + *dev->dev_ops->crypto_adapter_queue_pair_del, + -ENOTSUP); + ret = (*dev->dev_ops->crypto_adapter_queue_pair_del)(dev, + dev_info->dev, + queue_pair_id); + if (ret == 0) { + eca_update_qp_info(adapter, + &adapter->cdevs[cdev_id], + queue_pair_id, + 0); + if (dev_info->num_qpairs == 0) { + 
rte_free(dev_info->qpairs); + dev_info->qpairs = NULL; + } + } + } else { + if (adapter->nb_qps == 0) + return 0; + + rte_spinlock_lock(&adapter->lock); + if (queue_pair_id == -1) { + for (i = 0; i < dev_info->dev->data->nb_queue_pairs; + i++) + eca_update_qp_info(adapter, dev_info, + queue_pair_id, 0); + } else { + eca_update_qp_info(adapter, dev_info, + (uint16_t)queue_pair_id, 0); + } + + if (dev_info->num_qpairs == 0) { + rte_free(dev_info->qpairs); + dev_info->qpairs = NULL; + } + + rte_spinlock_unlock(&adapter->lock); + rte_service_component_runstate_set(adapter->service_id, + adapter->nb_qps); + } + + return ret; +} + +static int +eca_adapter_ctrl(uint8_t id, int start) +{ + struct rte_event_crypto_adapter *adapter; + struct crypto_device_info *dev_info; + struct rte_eventdev *dev; + uint32_t i; + int use_service; + int stop = !start; + + use_service = 0; + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + adapter = eca_id_to_adapter(id); + if (adapter == NULL) + return -EINVAL; + + dev = &rte_eventdevs[adapter->eventdev_id]; + + for (i = 0; i < rte_cryptodev_count(); i++) { + dev_info = &adapter->cdevs[i]; + /* if start check for num queue pairs */ + if (start && !dev_info->num_qpairs) + continue; + /* if stop check if dev has been started */ + if (stop && !dev_info->dev_started) + continue; + use_service |= !dev_info->internal_event_port; + dev_info->dev_started = start; + if (dev_info->internal_event_port == 0) + continue; + start ? (*dev->dev_ops->crypto_adapter_start)(dev, + &dev_info->dev[i]) : + (*dev->dev_ops->crypto_adapter_stop)(dev, + &dev_info->dev[i]); + } + + if (use_service) + rte_service_runstate_set(adapter->service_id, start); + + return 0; +} + +int __rte_experimental +rte_event_crypto_adapter_start(uint8_t id) +{ + struct rte_event_crypto_adapter *adapter; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + adapter = eca_id_to_adapter(id); + if (adapter == NULL) + return -EINVAL; + + return eca_adapter_ctrl(id, 1); +} + +int __rte_experimental +rte_event_crypto_adapter_stop(uint8_t id) +{ + return eca_adapter_ctrl(id, 0); +} + +int __rte_experimental +rte_event_crypto_adapter_stats_get(uint8_t id, + struct rte_event_crypto_adapter_stats *stats) +{ + struct rte_event_crypto_adapter *adapter; + struct rte_event_crypto_adapter_stats dev_stats_sum = { 0 }; + struct rte_event_crypto_adapter_stats dev_stats; + struct rte_eventdev *dev; + struct crypto_device_info *dev_info; + uint32_t i; + int ret; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + adapter = eca_id_to_adapter(id); + if (adapter == NULL || stats == NULL) + return -EINVAL; + + dev = &rte_eventdevs[adapter->eventdev_id]; + memset(stats, 0, sizeof(*stats)); + for (i = 0; i < rte_cryptodev_count(); i++) { + dev_info = &adapter->cdevs[i]; + if (dev_info->internal_event_port == 0 || + dev->dev_ops->crypto_adapter_stats_get == NULL) + continue; + ret = (*dev->dev_ops->crypto_adapter_stats_get)(dev, + dev_info->dev, + &dev_stats); + if (ret) + continue; + + dev_stats_sum.crypto_deq_count += dev_stats.crypto_deq_count; + dev_stats_sum.event_enq_count += + dev_stats.event_enq_count; + } + + if (adapter->service_inited) + *stats = adapter->crypto_stats; + + stats->crypto_deq_count += dev_stats_sum.crypto_deq_count; + stats->event_enq_count += dev_stats_sum.event_enq_count; + + return 0; +} + +int __rte_experimental +rte_event_crypto_adapter_stats_reset(uint8_t id) +{ + struct rte_event_crypto_adapter *adapter; + struct crypto_device_info *dev_info; + struct rte_eventdev *dev; + uint32_t 
i; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + adapter = eca_id_to_adapter(id); + if (adapter == NULL) + return -EINVAL; + + dev = &rte_eventdevs[adapter->eventdev_id]; + for (i = 0; i < rte_cryptodev_count(); i++) { + dev_info = &adapter->cdevs[i]; + if (dev_info->internal_event_port == 0 || + dev->dev_ops->crypto_adapter_stats_reset == NULL) + continue; + (*dev->dev_ops->crypto_adapter_stats_reset)(dev, + dev_info->dev); + } + + memset(&adapter->crypto_stats, 0, sizeof(adapter->crypto_stats)); + return 0; +} + +int __rte_experimental +rte_event_crypto_adapter_service_id_get(uint8_t id, uint32_t *service_id) +{ + struct rte_event_crypto_adapter *adapter; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + adapter = eca_id_to_adapter(id); + if (adapter == NULL || service_id == NULL) + return -EINVAL; + + if (adapter->service_inited) + *service_id = adapter->service_id; + + return adapter->service_inited ? 0 : -ESRCH; +} + +int __rte_experimental +rte_event_crypto_adapter_event_port_get(uint8_t id, uint8_t *event_port_id) +{ + struct rte_event_crypto_adapter *adapter; + + EVENT_CRYPTO_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + adapter = eca_id_to_adapter(id); + if (adapter == NULL || event_port_id == NULL) + return -EINVAL; + + *event_port_id = adapter->event_port_id; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_crypto_adapter.h b/src/spdk/dpdk/lib/librte_eventdev/rte_event_crypto_adapter.h new file mode 100644 index 00000000..d367309c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_crypto_adapter.h @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + * All rights reserved. + */ + +#ifndef _RTE_EVENT_CRYPTO_ADAPTER_ +#define _RTE_EVENT_CRYPTO_ADAPTER_ + +/** + * @file + * + * RTE Event crypto adapter + * + * Eventdev library provides couple of adapters to bridge between various + * components for providing new event source. The event crypto adapter is + * one of those adapters which is intended to bridge between event devices + * and crypto devices. + * + * The crypto adapter adds support to enqueue/dequeue crypto operations to/ + * from event device. The packet flow between crypto device and the event + * device can be accomplished using both SW and HW based transfer mechanisms. + * The adapter uses an EAL service core function for SW based packet transfer + * and uses the eventdev PMD functions to configure HW based packet transfer + * between the crypto device and the event device. + * + * The application can choose to submit a crypto operation directly to + * crypto device or send it to the crypto adapter via eventdev based on + * RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD capability. + * The first mode is known as the event new(RTE_EVENT_CRYPTO_ADAPTER_OP_NEW) + * mode and the second as the event forward(RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD) + * mode. The choice of mode can be specified while creating the adapter. + * In the former mode, it is an application responsibility to enable ingress + * packet ordering. In the latter mode, it is the adapter responsibility to + * enable the ingress packet ordering. 
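+ *
+ * An illustrative creation call (not part of the upstream documentation);
+ * "id", "evdev_id" and "port_conf" are placeholders supplied by the
+ * application:
+ *
+ *	rte_event_crypto_adapter_create(id, evdev_id, &port_conf,
+ *					RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD);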
+ * + * + * Working model of RTE_EVENT_CRYPTO_ADAPTER_OP_NEW mode: + * + * +--------------+ +--------------+ + * | | | Crypto stage | + * | Application |---[2]-->| + enqueue to | + * | | | cryptodev | + * +--------------+ +--------------+ + * ^ ^ | + * | | [3] + * [6] [1] | + * | | | + * +--------------+ | + * | | | + * | Event device | | + * | | | + * +--------------+ | + * ^ | + * | | + * [5] | + * | v + * +--------------+ +--------------+ + * | | | | + * |Crypto adapter|<--[4]---| Cryptodev | + * | | | | + * +--------------+ +--------------+ + * + * + * [1] Application dequeues events from the previous stage. + * [2] Application prepares the crypto operations. + * [3] Crypto operations are submitted to cryptodev by application. + * [4] Crypto adapter dequeues crypto completions from cryptodev. + * [5] Crypto adapter enqueues events to the eventdev. + * [6] Application dequeues from eventdev and prepare for further + * processing. + * + * In the RTE_EVENT_CRYPTO_ADAPTER_OP_NEW mode, application submits crypto + * operations directly to crypto device. The adapter then dequeues crypto + * completions from crypto device and enqueue events to the event device. + * This mode does not ensure ingress ordering, if the application directly + * enqueues to cryptodev without going through crypto/atomic stage i.e. + * removing item [1] and [2]. + * Events dequeued from the adapter will be treated as new events. + * In this mode, application needs to specify event information (response + * information) which is needed to enqueue an event after the crypto operation + * is completed. + * + * + * Working model of RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode: + * + * +--------------+ +--------------+ + * --[1]-->| |---[2]-->| Application | + * | Event device | | in | + * <--[8]--| |<--[3]---| Ordered stage| + * +--------------+ +--------------+ + * ^ | + * | [4] + * [7] | + * | v + * +----------------+ +--------------+ + * | |--[5]->| | + * | Crypto adapter | | Cryptodev | + * | |<-[6]--| | + * +----------------+ +--------------+ + * + * + * [1] Events from the previous stage. + * [2] Application in ordered stage dequeues events from eventdev. + * [3] Application enqueues crypto operations as events to eventdev. + * [4] Crypto adapter dequeues event from eventdev. + * [5] Crypto adapter submits crypto operations to cryptodev + * (Atomic stage). + * [6] Crypto adapter dequeues crypto completions from cryptodev + * [7] Crypto adapter enqueues events to the eventdev + * [8] Events to the next stage + * + * In the RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode, if HW supports + * RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD capability the application + * can directly submit the crypto operations to the cryptodev. + * If not, application retrieves crypto adapter's event port using + * rte_event_crypto_adapter_event_port_get() API. Then, links its event + * queue to this port and starts enqueuing crypto operations as events + * to the eventdev. The adapter then dequeues the events and submits the + * crypto operations to the cryptodev. After the crypto completions, the + * adapter enqueues events to the event device. + * Application can use this mode, when ingress packet ordering is needed. + * Events dequeued from the adapter will be treated as forwarded events. 
+ * In this mode, the application needs to specify the cryptodev ID
+ * and queue pair ID (request information) needed to enqueue a crypto
+ * operation in addition to the event information (response information)
+ * needed to enqueue an event after the crypto operation has completed.
+ *
+ *
+ * The event crypto adapter provides common APIs to configure the packet flow
+ * from the crypto device to event devices for both SW and HW based transfers.
+ * The crypto event adapter's functions are:
+ * - rte_event_crypto_adapter_create_ext()
+ * - rte_event_crypto_adapter_create()
+ * - rte_event_crypto_adapter_free()
+ * - rte_event_crypto_adapter_queue_pair_add()
+ * - rte_event_crypto_adapter_queue_pair_del()
+ * - rte_event_crypto_adapter_start()
+ * - rte_event_crypto_adapter_stop()
+ * - rte_event_crypto_adapter_stats_get()
+ * - rte_event_crypto_adapter_stats_reset()
+ *
+ * The application creates an instance using rte_event_crypto_adapter_create()
+ * or rte_event_crypto_adapter_create_ext().
+ *
+ * Cryptodev queue pair addition/deletion is done using the
+ * rte_event_crypto_adapter_queue_pair_xxx() APIs. If HW supports the
+ * RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND capability, event
+ * information must be passed to the add API.
+ *
+ * The SW adapter or HW PMD uses rte_crypto_op::sess_type to decide whether
+ * request/response (private) data is located in the crypto/security session
+ * or at an offset in the rte_crypto_op.
+ *
+ * For session-based operations, the set and get API provides a mechanism for
+ * an application to store and retrieve the data information stored
+ * along with the crypto session.
+ * The RTE_EVENT_CRYPTO_ADAPTER_CAP_SESSION_PRIVATE_DATA capability indicates
+ * whether HW or SW supports this feature.
+ *
+ * For session-less mode, the adapter gets the private data information placed
+ * along with the ``struct rte_crypto_op``.
+ * The rte_crypto_op::private_data_offset provides an offset to locate the
+ * request/response information in the rte_crypto_op. This offset is counted
+ * from the start of the rte_crypto_op including initialization vector (IV).
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "rte_eventdev.h"
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this enum may change without prior notice
+ *
+ * Crypto event adapter mode
+ */
+enum rte_event_crypto_adapter_mode {
+	RTE_EVENT_CRYPTO_ADAPTER_OP_NEW,
+	/**< Start the crypto adapter in event new mode.
+	 * @see RTE_EVENT_OP_NEW.
+	 * Application submits crypto operations to the cryptodev.
+	 * Adapter only dequeues the crypto completions from cryptodev
+	 * and enqueues events to the eventdev.
+	 */
+	RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD,
+	/**< Start the crypto adapter in event forward mode.
+	 * @see RTE_EVENT_OP_FORWARD.
+	 * Application submits crypto requests as events to the crypto
+	 * adapter or crypto device based on the
+	 * RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD capability.
+	 * Crypto completions are enqueued back to the eventdev by
+	 * the crypto adapter.
+	 */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * Crypto event request structure will be filled by the application to
+ * provide event request information to the adapter.
+ */
+struct rte_event_crypto_request {
+	uint8_t resv[8];
+	/**< Overlaps with the first 8 bytes of struct rte_event
+	 * that encode the response event information. The application
+	 * is expected to fill in struct rte_event response_info.
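+	 *
+	 * An illustrative sketch (not part of the upstream documentation)
+	 * of filling the response information for a session based
+	 * operation; "sess", "ev_qid" and "flow" are placeholders:
+	 *
+	 *	union rte_event_crypto_metadata m_data;
+	 *
+	 *	memset(&m_data, 0, sizeof(m_data));
+	 *	m_data.response_info.queue_id = ev_qid;
+	 *	m_data.response_info.sched_type = RTE_SCHED_TYPE_ATOMIC;
+	 *	m_data.response_info.flow_id = flow;
+	 *	rte_cryptodev_sym_session_set_user_data(sess, &m_data,
+	 *						sizeof(m_data));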
+ */ + uint16_t cdev_id; + /**< cryptodev ID to be used */ + uint16_t queue_pair_id; + /**< cryptodev queue pair ID to be used */ + uint32_t resv1; + /**< Reserved bits */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * Crypto event metadata structure will be filled by application + * to provide crypto request and event response information. + * + * If crypto events are enqueued using a HW mechanism, the cryptodev + * PMD will use the event response information to set up the event + * that is enqueued back to eventdev after completion of the crypto + * operation. If the transfer is done by SW, event response information + * will be used by the adapter. + */ +union rte_event_crypto_metadata { + struct rte_event_crypto_request request_info; + /**< Request information to be filled in by application + * for RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode. + */ + struct rte_event response_info; + /**< Response information to be filled in by application + * for RTE_EVENT_CRYPTO_ADAPTER_OP_NEW and + * RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode. + */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * Adapter configuration structure that the adapter configuration callback + * function is expected to fill out + * @see rte_event_crypto_adapter_conf_cb + */ +struct rte_event_crypto_adapter_conf { + uint8_t event_port_id; + /**< Event port identifier, the adapter enqueues events to this + * port and dequeues crypto request events in + * RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode. + */ + uint32_t max_nb; + /**< The adapter can return early if it has processed at least + * max_nb crypto ops. This isn't treated as a requirement; batching + * may cause the adapter to process more than max_nb crypto ops. + */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Function type used for adapter configuration callback. The callback is + * used to fill in members of the struct rte_event_crypto_adapter_conf, this + * callback is invoked when creating a SW service for packet transfer from + * cryptodev queue pair to the event device. The SW service is created within + * the rte_event_crypto_adapter_queue_pair_add() function if SW based packet + * transfers from cryptodev queue pair to the event device are required. + * + * @param id + * Adapter identifier. + * + * @param dev_id + * Event device identifier. + * + * @param conf + * Structure that needs to be populated by this callback. + * + * @param arg + * Argument to the callback. This is the same as the conf_arg passed to the + * rte_event_crypto_adapter_create_ext(). + */ +typedef int (*rte_event_crypto_adapter_conf_cb) (uint8_t id, uint8_t dev_id, + struct rte_event_crypto_adapter_conf *conf, + void *arg); + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * A structure used to retrieve statistics for an event crypto adapter + * instance. 
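+ *
+ * Illustrative read of the counters (not part of the upstream
+ * documentation); "id" is a placeholder for a previously created adapter:
+ *
+ *	struct rte_event_crypto_adapter_stats stats;
+ *
+ *	if (rte_event_crypto_adapter_stats_get(id, &stats) == 0)
+ *		printf("enq %" PRIu64 " enq_fail %" PRIu64 "\n",
+ *			stats.crypto_enq_count, stats.crypto_enq_fail);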
+ */ + +struct rte_event_crypto_adapter_stats { + uint64_t event_poll_count; + /**< Event port poll count */ + uint64_t event_deq_count; + /**< Event dequeue count */ + uint64_t crypto_enq_count; + /**< Cryptodev enqueue count */ + uint64_t crypto_enq_fail; + /**< Cryptodev enqueue failed count */ + uint64_t crypto_deq_count; + /**< Cryptodev dequeue count */ + uint64_t event_enq_count; + /**< Event enqueue count */ + uint64_t event_enq_retry_count; + /**< Event enqueue retry count */ + uint64_t event_enq_fail_count; + /**< Event enqueue fail count */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create a new event crypto adapter with the specified identifier. + * + * @param id + * Adapter identifier. + * + * @param dev_id + * Event device identifier. + * + * @param conf_cb + * Callback function that fills in members of a + * struct rte_event_crypto_adapter_conf struct passed into + * it. + * + * @param mode + * Flag to indicate the mode of the adapter. + * @see rte_event_crypto_adapter_mode + * + * @param conf_arg + * Argument that is passed to the conf_cb function. + * + * @return + * - 0: Success + * - <0: Error code on failure + */ +int __rte_experimental +rte_event_crypto_adapter_create_ext(uint8_t id, uint8_t dev_id, + rte_event_crypto_adapter_conf_cb conf_cb, + enum rte_event_crypto_adapter_mode mode, + void *conf_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create a new event crypto adapter with the specified identifier. + * This function uses an internal configuration function that creates an event + * port. This default function reconfigures the event device with an + * additional event port and set up the event port using the port_config + * parameter passed into this function. In case the application needs more + * control in configuration of the service, it should use the + * rte_event_crypto_adapter_create_ext() version. + * + * @param id + * Adapter identifier. + * + * @param dev_id + * Event device identifier. + * + * @param port_config + * Argument of type *rte_event_port_conf* that is passed to the conf_cb + * function. + * + * @param mode + * Flag to indicate the mode of the adapter. + * @see rte_event_crypto_adapter_mode + * + * @return + * - 0: Success + * - <0: Error code on failure + */ +int __rte_experimental +rte_event_crypto_adapter_create(uint8_t id, uint8_t dev_id, + struct rte_event_port_conf *port_config, + enum rte_event_crypto_adapter_mode mode); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Free an event crypto adapter + * + * @param id + * Adapter identifier. + * + * @return + * - 0: Success + * - <0: Error code on failure, If the adapter still has queue pairs + * added to it, the function returns -EBUSY. + */ +int __rte_experimental +rte_event_crypto_adapter_free(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Add a queue pair to an event crypto adapter. + * + * @param id + * Adapter identifier. + * + * @param cdev_id + * Cryptodev identifier. + * + * @param queue_pair_id + * Cryptodev queue pair identifier. If queue_pair_id is set -1, + * adapter adds all the pre configured queue pairs to the instance. + * + * @param event + * if HW supports cryptodev queue pair to event queue binding, application is + * expected to fill in event information, else it will be NULL. 
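+ *
+ * An illustrative call (not part of the upstream documentation); "cap",
+ * "ev" and "ret" are placeholders, "ev" is a struct rte_event filled by
+ * the application and "cap" is assumed to have been queried earlier with
+ * rte_event_crypto_adapter_caps_get():
+ *
+ *	if (cap & RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND)
+ *		ret = rte_event_crypto_adapter_queue_pair_add(id, cdev_id,
+ *							      -1, &ev);
+ *	else
+ *		ret = rte_event_crypto_adapter_queue_pair_add(id, cdev_id,
+ *							      -1, NULL);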
+ * @see RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND + * + * @return + * - 0: Success, queue pair added correctly. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_crypto_adapter_queue_pair_add(uint8_t id, + uint8_t cdev_id, + int32_t queue_pair_id, + const struct rte_event *event); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Delete a queue pair from an event crypto adapter. + * + * @param id + * Adapter identifier. + * + * @param cdev_id + * Cryptodev identifier. + * + * @param queue_pair_id + * Cryptodev queue pair identifier. + * + * @return + * - 0: Success, queue pair deleted successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_crypto_adapter_queue_pair_del(uint8_t id, uint8_t cdev_id, + int32_t queue_pair_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Start event crypto adapter + * + * @param id + * Adapter identifier. + * + * + * @return + * - 0: Success, adapter started successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_crypto_adapter_start(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Stop event crypto adapter + * + * @param id + * Adapter identifier. + * + * @return + * - 0: Success, adapter stopped successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_crypto_adapter_stop(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve statistics for an adapter + * + * @param id + * Adapter identifier. + * + * @param [out] stats + * A pointer to structure used to retrieve statistics for an adapter. + * + * @return + * - 0: Success, retrieved successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_crypto_adapter_stats_get(uint8_t id, + struct rte_event_crypto_adapter_stats *stats); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reset statistics for an adapter. + * + * @param id + * Adapter identifier. + * + * @return + * - 0: Success, statistics reset successfully. + * - <0: Error code on failure. + */ +int __rte_experimental +rte_event_crypto_adapter_stats_reset(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the service ID of an adapter. If the adapter doesn't use + * a rte_service function, this function returns -ESRCH. + * + * @param id + * Adapter identifier. + * + * @param [out] service_id + * A pointer to a uint32_t, to be filled in with the service id. + * + * @return + * - 0: Success + * - <0: Error code on failure, if the adapter doesn't use a rte_service + * function, this function returns -ESRCH. + */ +int __rte_experimental +rte_event_crypto_adapter_service_id_get(uint8_t id, uint32_t *service_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the event port of an adapter. + * + * @param id + * Adapter identifier. + * + * @param [out] event_port_id + * Application links its event queue to this adapter port which is used + * in RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode. + * + * @return + * - 0: Success + * - <0: Error code on failure. 
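+ *
+ * An illustrative use in RTE_EVENT_CRYPTO_ADAPTER_OP_FORWARD mode (not
+ * part of the upstream documentation); "id", "evdev_id" and "qid" are
+ * placeholders chosen by the application:
+ *
+ *	uint8_t crypto_adapter_port;
+ *
+ *	if (rte_event_crypto_adapter_event_port_get(id,
+ *				&crypto_adapter_port) == 0)
+ *		rte_event_port_link(evdev_id, crypto_adapter_port,
+ *				    &qid, NULL, 1);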
+ */ +int __rte_experimental +rte_event_crypto_adapter_event_port_get(uint8_t id, uint8_t *event_port_id); + +#ifdef __cplusplus +} +#endif +#endif /* _RTE_EVENT_CRYPTO_ADAPTER_ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_eth_rx_adapter.c b/src/spdk/dpdk/lib/librte_eventdev/rte_event_eth_rx_adapter.c new file mode 100644 index 00000000..f5e5a0b5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_eth_rx_adapter.c @@ -0,0 +1,2430 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation. + * All rights reserved. + */ +#if defined(LINUX) +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_eventdev.h" +#include "rte_eventdev_pmd.h" +#include "rte_event_eth_rx_adapter.h" + +#define BATCH_SIZE 32 +#define BLOCK_CNT_THRESHOLD 10 +#define ETH_EVENT_BUFFER_SIZE (4*BATCH_SIZE) + +#define ETH_RX_ADAPTER_SERVICE_NAME_LEN 32 +#define ETH_RX_ADAPTER_MEM_NAME_LEN 32 + +#define RSS_KEY_SIZE 40 +/* value written to intr thread pipe to signal thread exit */ +#define ETH_BRIDGE_INTR_THREAD_EXIT 1 +/* Sentinel value to detect initialized file handle */ +#define INIT_FD -1 + +/* + * Used to store port and queue ID of interrupting Rx queue + */ +union queue_data { + RTE_STD_C11 + void *ptr; + struct { + uint16_t port; + uint16_t queue; + }; +}; + +/* + * There is an instance of this struct per polled Rx queue added to the + * adapter + */ +struct eth_rx_poll_entry { + /* Eth port to poll */ + uint16_t eth_dev_id; + /* Eth rx queue to poll */ + uint16_t eth_rx_qid; +}; + +/* Instance per adapter */ +struct rte_eth_event_enqueue_buffer { + /* Count of events in this buffer */ + uint16_t count; + /* Array of events in this buffer */ + struct rte_event events[ETH_EVENT_BUFFER_SIZE]; +}; + +struct rte_event_eth_rx_adapter { + /* RSS key */ + uint8_t rss_key_be[RSS_KEY_SIZE]; + /* Event device identifier */ + uint8_t eventdev_id; + /* Per ethernet device structure */ + struct eth_device_info *eth_devices; + /* Event port identifier */ + uint8_t event_port_id; + /* Lock to serialize config updates with service function */ + rte_spinlock_t rx_lock; + /* Max mbufs processed in any service function invocation */ + uint32_t max_nb_rx; + /* Receive queues that need to be polled */ + struct eth_rx_poll_entry *eth_rx_poll; + /* Size of the eth_rx_poll array */ + uint16_t num_rx_polled; + /* Weighted round robin schedule */ + uint32_t *wrr_sched; + /* wrr_sched[] size */ + uint32_t wrr_len; + /* Next entry in wrr[] to begin polling */ + uint32_t wrr_pos; + /* Event burst buffer */ + struct rte_eth_event_enqueue_buffer event_enqueue_buffer; + /* Per adapter stats */ + struct rte_event_eth_rx_adapter_stats stats; + /* Block count, counts up to BLOCK_CNT_THRESHOLD */ + uint16_t enq_block_count; + /* Block start ts */ + uint64_t rx_enq_block_start_ts; + /* epoll fd used to wait for Rx interrupts */ + int epd; + /* Num of interrupt driven interrupt queues */ + uint32_t num_rx_intr; + /* Used to send of interrupting Rx queues from + * the interrupt thread to the Rx thread + */ + struct rte_ring *intr_ring; + /* Rx Queue data (dev id, queue id) for the last non-empty + * queue polled + */ + union queue_data qd; + /* queue_data is valid */ + int qd_valid; + /* Interrupt ring lock, synchronizes Rx thread + * and interrupt thread + */ + rte_spinlock_t intr_ring_lock; + /* event array passed to rte_poll_wait */ + struct rte_epoll_event *epoll_events; + /* Count of interrupt vectors in use 
*/ + uint32_t num_intr_vec; + /* Thread blocked on Rx interrupts */ + pthread_t rx_intr_thread; + /* Configuration callback for rte_service configuration */ + rte_event_eth_rx_adapter_conf_cb conf_cb; + /* Configuration callback argument */ + void *conf_arg; + /* Set if default_cb is being used */ + int default_cb_arg; + /* Service initialization state */ + uint8_t service_inited; + /* Total count of Rx queues in adapter */ + uint32_t nb_queues; + /* Memory allocation name */ + char mem_name[ETH_RX_ADAPTER_MEM_NAME_LEN]; + /* Socket identifier cached from eventdev */ + int socket_id; + /* Per adapter EAL service */ + uint32_t service_id; + /* Adapter started flag */ + uint8_t rxa_started; + /* Adapter ID */ + uint8_t id; +} __rte_cache_aligned; + +/* Per eth device */ +struct eth_device_info { + struct rte_eth_dev *dev; + struct eth_rx_queue_info *rx_queue; + /* Rx callback */ + rte_event_eth_rx_adapter_cb_fn cb_fn; + /* Rx callback argument */ + void *cb_arg; + /* Set if ethdev->eventdev packet transfer uses a + * hardware mechanism + */ + uint8_t internal_event_port; + /* Set if the adapter is processing rx queues for + * this eth device and packet processing has been + * started, allows for the code to know if the PMD + * rx_adapter_stop callback needs to be invoked + */ + uint8_t dev_rx_started; + /* Number of queues added for this device */ + uint16_t nb_dev_queues; + /* Number of poll based queues + * If nb_rx_poll > 0, the start callback will + * be invoked if not already invoked + */ + uint16_t nb_rx_poll; + /* Number of interrupt based queues + * If nb_rx_intr > 0, the start callback will + * be invoked if not already invoked. + */ + uint16_t nb_rx_intr; + /* Number of queues that use the shared interrupt */ + uint16_t nb_shared_intr; + /* sum(wrr(q)) for all queues within the device + * useful when deleting all device queues + */ + uint32_t wrr_len; + /* Intr based queue index to start polling from, this is used + * if the number of shared interrupts is non-zero + */ + uint16_t next_q_idx; + /* Intr based queue indices */ + uint16_t *intr_queue; + /* device generates per Rx queue interrupt for queue index + * for queue indices < RTE_MAX_RXTX_INTR_VEC_ID - 1 + */ + int multi_intr_cap; + /* shared interrupt enabled */ + int shared_intr_enabled; +}; + +/* Per Rx queue */ +struct eth_rx_queue_info { + int queue_enabled; /* True if added */ + int intr_enabled; + uint16_t wt; /* Polling weight */ + uint8_t event_queue_id; /* Event queue to enqueue packets to */ + uint8_t sched_type; /* Sched type for events */ + uint8_t priority; /* Event priority */ + uint32_t flow_id; /* App provided flow identifier */ + uint32_t flow_id_mask; /* Set to ~0 if app provides flow id else 0 */ +}; + +static struct rte_event_eth_rx_adapter **event_eth_rx_adapter; + +static inline int +rxa_validate_id(uint8_t id) +{ + return id < RTE_EVENT_ETH_RX_ADAPTER_MAX_INSTANCE; +} + +#define RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, retval) do { \ + if (!rxa_validate_id(id)) { \ + RTE_EDEV_LOG_ERR("Invalid eth Rx adapter id = %d\n", id); \ + return retval; \ + } \ +} while (0) + +static inline int +rxa_sw_adapter_queue_count(struct rte_event_eth_rx_adapter *rx_adapter) +{ + return rx_adapter->num_rx_polled + rx_adapter->num_rx_intr; +} + +/* Greatest common divisor */ +static uint16_t rxa_gcd_u16(uint16_t a, uint16_t b) +{ + uint16_t r = a % b; + + return r ? 
rxa_gcd_u16(b, r) : b; +} + +/* Returns the next queue in the polling sequence + * + * http://kb.linuxvirtualserver.org/wiki/Weighted_Round-Robin_Scheduling + */ +static int +rxa_wrr_next(struct rte_event_eth_rx_adapter *rx_adapter, + unsigned int n, int *cw, + struct eth_rx_poll_entry *eth_rx_poll, uint16_t max_wt, + uint16_t gcd, int prev) +{ + int i = prev; + uint16_t w; + + while (1) { + uint16_t q; + uint16_t d; + + i = (i + 1) % n; + if (i == 0) { + *cw = *cw - gcd; + if (*cw <= 0) + *cw = max_wt; + } + + q = eth_rx_poll[i].eth_rx_qid; + d = eth_rx_poll[i].eth_dev_id; + w = rx_adapter->eth_devices[d].rx_queue[q].wt; + + if ((int)w >= *cw) + return i; + } +} + +static inline int +rxa_shared_intr(struct eth_device_info *dev_info, + int rx_queue_id) +{ + int multi_intr_cap; + + if (dev_info->dev->intr_handle == NULL) + return 0; + + multi_intr_cap = rte_intr_cap_multiple(dev_info->dev->intr_handle); + return !multi_intr_cap || + rx_queue_id >= RTE_MAX_RXTX_INTR_VEC_ID - 1; +} + +static inline int +rxa_intr_queue(struct eth_device_info *dev_info, + int rx_queue_id) +{ + struct eth_rx_queue_info *queue_info; + + queue_info = &dev_info->rx_queue[rx_queue_id]; + return dev_info->rx_queue && + !dev_info->internal_event_port && + queue_info->queue_enabled && queue_info->wt == 0; +} + +static inline int +rxa_polled_queue(struct eth_device_info *dev_info, + int rx_queue_id) +{ + struct eth_rx_queue_info *queue_info; + + queue_info = &dev_info->rx_queue[rx_queue_id]; + return !dev_info->internal_event_port && + dev_info->rx_queue && + queue_info->queue_enabled && queue_info->wt != 0; +} + +/* Calculate change in number of vectors after Rx queue ID is add/deleted */ +static int +rxa_nb_intr_vect(struct eth_device_info *dev_info, int rx_queue_id, int add) +{ + uint16_t i; + int n, s; + uint16_t nbq; + + nbq = dev_info->dev->data->nb_rx_queues; + n = 0; /* non shared count */ + s = 0; /* shared count */ + + if (rx_queue_id == -1) { + for (i = 0; i < nbq; i++) { + if (!rxa_shared_intr(dev_info, i)) + n += add ? !rxa_intr_queue(dev_info, i) : + rxa_intr_queue(dev_info, i); + else + s += add ? !rxa_intr_queue(dev_info, i) : + rxa_intr_queue(dev_info, i); + } + + if (s > 0) { + if ((add && dev_info->nb_shared_intr == 0) || + (!add && dev_info->nb_shared_intr)) + n += 1; + } + } else { + if (!rxa_shared_intr(dev_info, rx_queue_id)) + n = add ? !rxa_intr_queue(dev_info, rx_queue_id) : + rxa_intr_queue(dev_info, rx_queue_id); + else + n = add ? !dev_info->nb_shared_intr : + dev_info->nb_shared_intr == 1; + } + + return add ? 
n : -n; +} + +/* Calculate nb_rx_intr after deleting interrupt mode rx queues + */ +static void +rxa_calc_nb_post_intr_del(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint32_t *nb_rx_intr) +{ + uint32_t intr_diff; + + if (rx_queue_id == -1) + intr_diff = dev_info->nb_rx_intr; + else + intr_diff = rxa_intr_queue(dev_info, rx_queue_id); + + *nb_rx_intr = rx_adapter->num_rx_intr - intr_diff; +} + +/* Calculate nb_rx_* after adding interrupt mode rx queues, newly added + * interrupt queues could currently be poll mode Rx queues + */ +static void +rxa_calc_nb_post_add_intr(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint32_t *nb_rx_poll, + uint32_t *nb_rx_intr, + uint32_t *nb_wrr) +{ + uint32_t intr_diff; + uint32_t poll_diff; + uint32_t wrr_len_diff; + + if (rx_queue_id == -1) { + intr_diff = dev_info->dev->data->nb_rx_queues - + dev_info->nb_rx_intr; + poll_diff = dev_info->nb_rx_poll; + wrr_len_diff = dev_info->wrr_len; + } else { + intr_diff = !rxa_intr_queue(dev_info, rx_queue_id); + poll_diff = rxa_polled_queue(dev_info, rx_queue_id); + wrr_len_diff = poll_diff ? dev_info->rx_queue[rx_queue_id].wt : + 0; + } + + *nb_rx_intr = rx_adapter->num_rx_intr + intr_diff; + *nb_rx_poll = rx_adapter->num_rx_polled - poll_diff; + *nb_wrr = rx_adapter->wrr_len - wrr_len_diff; +} + +/* Calculate size of the eth_rx_poll and wrr_sched arrays + * after deleting poll mode rx queues + */ +static void +rxa_calc_nb_post_poll_del(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint32_t *nb_rx_poll, + uint32_t *nb_wrr) +{ + uint32_t poll_diff; + uint32_t wrr_len_diff; + + if (rx_queue_id == -1) { + poll_diff = dev_info->nb_rx_poll; + wrr_len_diff = dev_info->wrr_len; + } else { + poll_diff = rxa_polled_queue(dev_info, rx_queue_id); + wrr_len_diff = poll_diff ? dev_info->rx_queue[rx_queue_id].wt : + 0; + } + + *nb_rx_poll = rx_adapter->num_rx_polled - poll_diff; + *nb_wrr = rx_adapter->wrr_len - wrr_len_diff; +} + +/* Calculate nb_rx_* after adding poll mode rx queues + */ +static void +rxa_calc_nb_post_add_poll(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint16_t wt, + uint32_t *nb_rx_poll, + uint32_t *nb_rx_intr, + uint32_t *nb_wrr) +{ + uint32_t intr_diff; + uint32_t poll_diff; + uint32_t wrr_len_diff; + + if (rx_queue_id == -1) { + intr_diff = dev_info->nb_rx_intr; + poll_diff = dev_info->dev->data->nb_rx_queues - + dev_info->nb_rx_poll; + wrr_len_diff = wt*dev_info->dev->data->nb_rx_queues + - dev_info->wrr_len; + } else { + intr_diff = rxa_intr_queue(dev_info, rx_queue_id); + poll_diff = !rxa_polled_queue(dev_info, rx_queue_id); + wrr_len_diff = rxa_polled_queue(dev_info, rx_queue_id) ? 
+ wt - dev_info->rx_queue[rx_queue_id].wt : + wt; + } + + *nb_rx_poll = rx_adapter->num_rx_polled + poll_diff; + *nb_rx_intr = rx_adapter->num_rx_intr - intr_diff; + *nb_wrr = rx_adapter->wrr_len + wrr_len_diff; +} + +/* Calculate nb_rx_* after adding rx_queue_id */ +static void +rxa_calc_nb_post_add(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint16_t wt, + uint32_t *nb_rx_poll, + uint32_t *nb_rx_intr, + uint32_t *nb_wrr) +{ + if (wt != 0) + rxa_calc_nb_post_add_poll(rx_adapter, dev_info, rx_queue_id, + wt, nb_rx_poll, nb_rx_intr, nb_wrr); + else + rxa_calc_nb_post_add_intr(rx_adapter, dev_info, rx_queue_id, + nb_rx_poll, nb_rx_intr, nb_wrr); +} + +/* Calculate nb_rx_* after deleting rx_queue_id */ +static void +rxa_calc_nb_post_del(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id, + uint32_t *nb_rx_poll, + uint32_t *nb_rx_intr, + uint32_t *nb_wrr) +{ + rxa_calc_nb_post_poll_del(rx_adapter, dev_info, rx_queue_id, nb_rx_poll, + nb_wrr); + rxa_calc_nb_post_intr_del(rx_adapter, dev_info, rx_queue_id, + nb_rx_intr); +} + +/* + * Allocate the rx_poll array + */ +static struct eth_rx_poll_entry * +rxa_alloc_poll(struct rte_event_eth_rx_adapter *rx_adapter, + uint32_t num_rx_polled) +{ + size_t len; + + len = RTE_ALIGN(num_rx_polled * sizeof(*rx_adapter->eth_rx_poll), + RTE_CACHE_LINE_SIZE); + return rte_zmalloc_socket(rx_adapter->mem_name, + len, + RTE_CACHE_LINE_SIZE, + rx_adapter->socket_id); +} + +/* + * Allocate the WRR array + */ +static uint32_t * +rxa_alloc_wrr(struct rte_event_eth_rx_adapter *rx_adapter, int nb_wrr) +{ + size_t len; + + len = RTE_ALIGN(nb_wrr * sizeof(*rx_adapter->wrr_sched), + RTE_CACHE_LINE_SIZE); + return rte_zmalloc_socket(rx_adapter->mem_name, + len, + RTE_CACHE_LINE_SIZE, + rx_adapter->socket_id); +} + +static int +rxa_alloc_poll_arrays(struct rte_event_eth_rx_adapter *rx_adapter, + uint32_t nb_poll, + uint32_t nb_wrr, + struct eth_rx_poll_entry **rx_poll, + uint32_t **wrr_sched) +{ + + if (nb_poll == 0) { + *rx_poll = NULL; + *wrr_sched = NULL; + return 0; + } + + *rx_poll = rxa_alloc_poll(rx_adapter, nb_poll); + if (*rx_poll == NULL) { + *wrr_sched = NULL; + return -ENOMEM; + } + + *wrr_sched = rxa_alloc_wrr(rx_adapter, nb_wrr); + if (*wrr_sched == NULL) { + rte_free(*rx_poll); + return -ENOMEM; + } + return 0; +} + +/* Precalculate WRR polling sequence for all queues in rx_adapter */ +static void +rxa_calc_wrr_sequence(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_rx_poll_entry *rx_poll, + uint32_t *rx_wrr) +{ + uint16_t d; + uint16_t q; + unsigned int i; + int prev = -1; + int cw = -1; + + /* Initialize variables for calculation of wrr schedule */ + uint16_t max_wrr_pos = 0; + unsigned int poll_q = 0; + uint16_t max_wt = 0; + uint16_t gcd = 0; + + if (rx_poll == NULL) + return; + + /* Generate array of all queues to poll, the size of this + * array is poll_q + */ + RTE_ETH_FOREACH_DEV(d) { + uint16_t nb_rx_queues; + struct eth_device_info *dev_info = + &rx_adapter->eth_devices[d]; + nb_rx_queues = dev_info->dev->data->nb_rx_queues; + if (dev_info->rx_queue == NULL) + continue; + if (dev_info->internal_event_port) + continue; + dev_info->wrr_len = 0; + for (q = 0; q < nb_rx_queues; q++) { + struct eth_rx_queue_info *queue_info = + &dev_info->rx_queue[q]; + uint16_t wt; + + if (!rxa_polled_queue(dev_info, q)) + continue; + wt = queue_info->wt; + rx_poll[poll_q].eth_dev_id = d; + rx_poll[poll_q].eth_rx_qid = q; + max_wrr_pos += wt; + 
dev_info->wrr_len += wt; + max_wt = RTE_MAX(max_wt, wt); + gcd = (gcd) ? rxa_gcd_u16(gcd, wt) : wt; + poll_q++; + } + } + + /* Generate polling sequence based on weights */ + prev = -1; + cw = -1; + for (i = 0; i < max_wrr_pos; i++) { + rx_wrr[i] = rxa_wrr_next(rx_adapter, poll_q, &cw, + rx_poll, max_wt, gcd, prev); + prev = rx_wrr[i]; + } +} + +static inline void +rxa_mtoip(struct rte_mbuf *m, struct ipv4_hdr **ipv4_hdr, + struct ipv6_hdr **ipv6_hdr) +{ + struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + struct vlan_hdr *vlan_hdr; + + *ipv4_hdr = NULL; + *ipv6_hdr = NULL; + + switch (eth_hdr->ether_type) { + case RTE_BE16(ETHER_TYPE_IPv4): + *ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1); + break; + + case RTE_BE16(ETHER_TYPE_IPv6): + *ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1); + break; + + case RTE_BE16(ETHER_TYPE_VLAN): + vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + switch (vlan_hdr->eth_proto) { + case RTE_BE16(ETHER_TYPE_IPv4): + *ipv4_hdr = (struct ipv4_hdr *)(vlan_hdr + 1); + break; + case RTE_BE16(ETHER_TYPE_IPv6): + *ipv6_hdr = (struct ipv6_hdr *)(vlan_hdr + 1); + break; + default: + break; + } + break; + + default: + break; + } +} + +/* Calculate RSS hash for IPv4/6 */ +static inline uint32_t +rxa_do_softrss(struct rte_mbuf *m, const uint8_t *rss_key_be) +{ + uint32_t input_len; + void *tuple; + struct rte_ipv4_tuple ipv4_tuple; + struct rte_ipv6_tuple ipv6_tuple; + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + + rxa_mtoip(m, &ipv4_hdr, &ipv6_hdr); + + if (ipv4_hdr) { + ipv4_tuple.src_addr = rte_be_to_cpu_32(ipv4_hdr->src_addr); + ipv4_tuple.dst_addr = rte_be_to_cpu_32(ipv4_hdr->dst_addr); + tuple = &ipv4_tuple; + input_len = RTE_THASH_V4_L3_LEN; + } else if (ipv6_hdr) { + rte_thash_load_v6_addrs(ipv6_hdr, + (union rte_thash_tuple *)&ipv6_tuple); + tuple = &ipv6_tuple; + input_len = RTE_THASH_V6_L3_LEN; + } else + return 0; + + return rte_softrss_be(tuple, input_len, rss_key_be); +} + +static inline int +rxa_enq_blocked(struct rte_event_eth_rx_adapter *rx_adapter) +{ + return !!rx_adapter->enq_block_count; +} + +static inline void +rxa_enq_block_start_ts(struct rte_event_eth_rx_adapter *rx_adapter) +{ + if (rx_adapter->rx_enq_block_start_ts) + return; + + rx_adapter->enq_block_count++; + if (rx_adapter->enq_block_count < BLOCK_CNT_THRESHOLD) + return; + + rx_adapter->rx_enq_block_start_ts = rte_get_tsc_cycles(); +} + +static inline void +rxa_enq_block_end_ts(struct rte_event_eth_rx_adapter *rx_adapter, + struct rte_event_eth_rx_adapter_stats *stats) +{ + if (unlikely(!stats->rx_enq_start_ts)) + stats->rx_enq_start_ts = rte_get_tsc_cycles(); + + if (likely(!rxa_enq_blocked(rx_adapter))) + return; + + rx_adapter->enq_block_count = 0; + if (rx_adapter->rx_enq_block_start_ts) { + stats->rx_enq_end_ts = rte_get_tsc_cycles(); + stats->rx_enq_block_cycles += stats->rx_enq_end_ts - + rx_adapter->rx_enq_block_start_ts; + rx_adapter->rx_enq_block_start_ts = 0; + } +} + +/* Add event to buffer, free space check is done prior to calling + * this function + */ +static inline void +rxa_buffer_event(struct rte_event_eth_rx_adapter *rx_adapter, + struct rte_event *ev) +{ + struct rte_eth_event_enqueue_buffer *buf = + &rx_adapter->event_enqueue_buffer; + rte_memcpy(&buf->events[buf->count++], ev, sizeof(struct rte_event)); +} + +/* Enqueue buffered events to event device */ +static inline uint16_t +rxa_flush_event_buffer(struct rte_event_eth_rx_adapter *rx_adapter) +{ + struct rte_eth_event_enqueue_buffer *buf = + &rx_adapter->event_enqueue_buffer; + struct 
rte_event_eth_rx_adapter_stats *stats = &rx_adapter->stats; + + uint16_t n = rte_event_enqueue_new_burst(rx_adapter->eventdev_id, + rx_adapter->event_port_id, + buf->events, + buf->count); + if (n != buf->count) { + memmove(buf->events, + &buf->events[n], + (buf->count - n) * sizeof(struct rte_event)); + stats->rx_enq_retry++; + } + + n ? rxa_enq_block_end_ts(rx_adapter, stats) : + rxa_enq_block_start_ts(rx_adapter); + + buf->count -= n; + stats->rx_enq_count += n; + + return n; +} + +static inline void +rxa_buffer_mbufs(struct rte_event_eth_rx_adapter *rx_adapter, + uint16_t eth_dev_id, + uint16_t rx_queue_id, + struct rte_mbuf **mbufs, + uint16_t num) +{ + uint32_t i; + struct eth_device_info *dev_info = + &rx_adapter->eth_devices[eth_dev_id]; + struct eth_rx_queue_info *eth_rx_queue_info = + &dev_info->rx_queue[rx_queue_id]; + struct rte_eth_event_enqueue_buffer *buf = + &rx_adapter->event_enqueue_buffer; + int32_t qid = eth_rx_queue_info->event_queue_id; + uint8_t sched_type = eth_rx_queue_info->sched_type; + uint8_t priority = eth_rx_queue_info->priority; + uint32_t flow_id; + struct rte_event events[BATCH_SIZE]; + struct rte_mbuf *m = mbufs[0]; + uint32_t rss_mask; + uint32_t rss; + int do_rss; + uint64_t ts; + struct rte_mbuf *cb_mbufs[BATCH_SIZE]; + uint16_t nb_cb; + + /* 0xffff ffff if PKT_RX_RSS_HASH is set, otherwise 0 */ + rss_mask = ~(((m->ol_flags & PKT_RX_RSS_HASH) != 0) - 1); + do_rss = !rss_mask && !eth_rx_queue_info->flow_id_mask; + + if ((m->ol_flags & PKT_RX_TIMESTAMP) == 0) { + ts = rte_get_tsc_cycles(); + for (i = 0; i < num; i++) { + m = mbufs[i]; + + m->timestamp = ts; + m->ol_flags |= PKT_RX_TIMESTAMP; + } + } + + + nb_cb = dev_info->cb_fn ? dev_info->cb_fn(eth_dev_id, rx_queue_id, + ETH_EVENT_BUFFER_SIZE, + buf->count, mbufs, + num, + dev_info->cb_arg, + cb_mbufs) : + num; + if (nb_cb < num) { + mbufs = cb_mbufs; + num = nb_cb; + } + + for (i = 0; i < num; i++) { + m = mbufs[i]; + struct rte_event *ev = &events[i]; + + rss = do_rss ? + rxa_do_softrss(m, rx_adapter->rss_key_be) : + m->hash.rss; + flow_id = + eth_rx_queue_info->flow_id & + eth_rx_queue_info->flow_id_mask; + flow_id |= rss & ~eth_rx_queue_info->flow_id_mask; + ev->flow_id = flow_id; + ev->op = RTE_EVENT_OP_NEW; + ev->sched_type = sched_type; + ev->queue_id = qid; + ev->event_type = RTE_EVENT_TYPE_ETH_RX_ADAPTER; + ev->sub_event_type = 0; + ev->priority = priority; + ev->mbuf = m; + + rxa_buffer_event(rx_adapter, ev); + } +} + +/* Enqueue packets from to event buffer */ +static inline uint32_t +rxa_eth_rx(struct rte_event_eth_rx_adapter *rx_adapter, + uint16_t port_id, + uint16_t queue_id, + uint32_t rx_count, + uint32_t max_rx, + int *rxq_empty) +{ + struct rte_mbuf *mbufs[BATCH_SIZE]; + struct rte_eth_event_enqueue_buffer *buf = + &rx_adapter->event_enqueue_buffer; + struct rte_event_eth_rx_adapter_stats *stats = + &rx_adapter->stats; + uint16_t n; + uint32_t nb_rx = 0; + + if (rxq_empty) + *rxq_empty = 0; + /* Don't do a batch dequeue from the rx queue if there isn't + * enough space in the enqueue buffer. 
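+	 * With the defaults above (BATCH_SIZE 32, ETH_EVENT_BUFFER_SIZE
+	 * 4 * 32 = 128 events) a burst is read only while at least 32
+	 * slots are free in the buffer.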
+ */ + while (BATCH_SIZE <= (RTE_DIM(buf->events) - buf->count)) { + if (buf->count >= BATCH_SIZE) + rxa_flush_event_buffer(rx_adapter); + + stats->rx_poll_count++; + n = rte_eth_rx_burst(port_id, queue_id, mbufs, BATCH_SIZE); + if (unlikely(!n)) { + if (rxq_empty) + *rxq_empty = 1; + break; + } + rxa_buffer_mbufs(rx_adapter, port_id, queue_id, mbufs, n); + nb_rx += n; + if (rx_count + nb_rx > max_rx) + break; + } + + if (buf->count >= BATCH_SIZE) + rxa_flush_event_buffer(rx_adapter); + + return nb_rx; +} + +static inline void +rxa_intr_ring_enqueue(struct rte_event_eth_rx_adapter *rx_adapter, + void *data) +{ + uint16_t port_id; + uint16_t queue; + int err; + union queue_data qd; + struct eth_device_info *dev_info; + struct eth_rx_queue_info *queue_info; + int *intr_enabled; + + qd.ptr = data; + port_id = qd.port; + queue = qd.queue; + + dev_info = &rx_adapter->eth_devices[port_id]; + queue_info = &dev_info->rx_queue[queue]; + rte_spinlock_lock(&rx_adapter->intr_ring_lock); + if (rxa_shared_intr(dev_info, queue)) + intr_enabled = &dev_info->shared_intr_enabled; + else + intr_enabled = &queue_info->intr_enabled; + + if (*intr_enabled) { + *intr_enabled = 0; + err = rte_ring_enqueue(rx_adapter->intr_ring, data); + /* Entry should always be available. + * The ring size equals the maximum number of interrupt + * vectors supported (an interrupt vector is shared in + * case of shared interrupts) + */ + if (err) + RTE_EDEV_LOG_ERR("Failed to enqueue interrupt" + " to ring: %s", strerror(err)); + else + rte_eth_dev_rx_intr_disable(port_id, queue); + } + rte_spinlock_unlock(&rx_adapter->intr_ring_lock); +} + +static int +rxa_intr_ring_check_avail(struct rte_event_eth_rx_adapter *rx_adapter, + uint32_t num_intr_vec) +{ + if (rx_adapter->num_intr_vec + num_intr_vec > + RTE_EVENT_ETH_INTR_RING_SIZE) { + RTE_EDEV_LOG_ERR("Exceeded intr ring slots current" + " %d needed %d limit %d", rx_adapter->num_intr_vec, + num_intr_vec, RTE_EVENT_ETH_INTR_RING_SIZE); + return -ENOSPC; + } + + return 0; +} + +/* Delete entries for (dev, queue) from the interrupt ring */ +static void +rxa_intr_ring_del_entries(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + uint16_t rx_queue_id) +{ + int i, n; + union queue_data qd; + + rte_spinlock_lock(&rx_adapter->intr_ring_lock); + + n = rte_ring_count(rx_adapter->intr_ring); + for (i = 0; i < n; i++) { + rte_ring_dequeue(rx_adapter->intr_ring, &qd.ptr); + if (!rxa_shared_intr(dev_info, rx_queue_id)) { + if (qd.port == dev_info->dev->data->port_id && + qd.queue == rx_queue_id) + continue; + } else { + if (qd.port == dev_info->dev->data->port_id) + continue; + } + rte_ring_enqueue(rx_adapter->intr_ring, qd.ptr); + } + + rte_spinlock_unlock(&rx_adapter->intr_ring_lock); +} + +/* pthread callback handling interrupt mode receive queues + * After receiving an Rx interrupt, it enqueues the port id and queue id of the + * interrupting queue to the adapter's ring buffer for interrupt events. + * These events are picked up by rxa_intr_ring_dequeue() which is invoked from + * the adapter service function. 
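+ *
+ * The thread blocks in rte_epoll_wait() with an infinite timeout and is
+ * terminated via pthread_cancel()/pthread_join() when the adapter's
+ * interrupt resources are freed (see rxa_destroy_intr_thread()).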
+ */ +static void * +rxa_intr_thread(void *arg) +{ + struct rte_event_eth_rx_adapter *rx_adapter = arg; + struct rte_epoll_event *epoll_events = rx_adapter->epoll_events; + int n, i; + + while (1) { + n = rte_epoll_wait(rx_adapter->epd, epoll_events, + RTE_EVENT_ETH_INTR_RING_SIZE, -1); + if (unlikely(n < 0)) + RTE_EDEV_LOG_ERR("rte_epoll_wait returned error %d", + n); + for (i = 0; i < n; i++) { + rxa_intr_ring_enqueue(rx_adapter, + epoll_events[i].epdata.data); + } + } + + return NULL; +} + +/* Dequeue from interrupt ring and enqueue received + * mbufs to eventdev + */ +static inline uint32_t +rxa_intr_ring_dequeue(struct rte_event_eth_rx_adapter *rx_adapter) +{ + uint32_t n; + uint32_t nb_rx = 0; + int rxq_empty; + struct rte_eth_event_enqueue_buffer *buf; + rte_spinlock_t *ring_lock; + uint8_t max_done = 0; + + if (rx_adapter->num_rx_intr == 0) + return 0; + + if (rte_ring_count(rx_adapter->intr_ring) == 0 + && !rx_adapter->qd_valid) + return 0; + + buf = &rx_adapter->event_enqueue_buffer; + ring_lock = &rx_adapter->intr_ring_lock; + + if (buf->count >= BATCH_SIZE) + rxa_flush_event_buffer(rx_adapter); + + while (BATCH_SIZE <= (RTE_DIM(buf->events) - buf->count)) { + struct eth_device_info *dev_info; + uint16_t port; + uint16_t queue; + union queue_data qd = rx_adapter->qd; + int err; + + if (!rx_adapter->qd_valid) { + struct eth_rx_queue_info *queue_info; + + rte_spinlock_lock(ring_lock); + err = rte_ring_dequeue(rx_adapter->intr_ring, &qd.ptr); + if (err) { + rte_spinlock_unlock(ring_lock); + break; + } + + port = qd.port; + queue = qd.queue; + rx_adapter->qd = qd; + rx_adapter->qd_valid = 1; + dev_info = &rx_adapter->eth_devices[port]; + if (rxa_shared_intr(dev_info, queue)) + dev_info->shared_intr_enabled = 1; + else { + queue_info = &dev_info->rx_queue[queue]; + queue_info->intr_enabled = 1; + } + rte_eth_dev_rx_intr_enable(port, queue); + rte_spinlock_unlock(ring_lock); + } else { + port = qd.port; + queue = qd.queue; + + dev_info = &rx_adapter->eth_devices[port]; + } + + if (rxa_shared_intr(dev_info, queue)) { + uint16_t i; + uint16_t nb_queues; + + nb_queues = dev_info->dev->data->nb_rx_queues; + n = 0; + for (i = dev_info->next_q_idx; i < nb_queues; i++) { + uint8_t enq_buffer_full; + + if (!rxa_intr_queue(dev_info, i)) + continue; + n = rxa_eth_rx(rx_adapter, port, i, nb_rx, + rx_adapter->max_nb_rx, + &rxq_empty); + nb_rx += n; + + enq_buffer_full = !rxq_empty && n == 0; + max_done = nb_rx > rx_adapter->max_nb_rx; + + if (enq_buffer_full || max_done) { + dev_info->next_q_idx = i; + goto done; + } + } + + rx_adapter->qd_valid = 0; + + /* Reinitialize for next interrupt */ + dev_info->next_q_idx = dev_info->multi_intr_cap ? + RTE_MAX_RXTX_INTR_VEC_ID - 1 : + 0; + } else { + n = rxa_eth_rx(rx_adapter, port, queue, nb_rx, + rx_adapter->max_nb_rx, + &rxq_empty); + rx_adapter->qd_valid = !rxq_empty; + nb_rx += n; + if (nb_rx > rx_adapter->max_nb_rx) + break; + } + } + +done: + rx_adapter->stats.rx_intr_packets += nb_rx; + return nb_rx; +} + +/* + * Polls receive queues added to the event adapter and enqueues received + * packets to the event device. 
+ * + * The receive code enqueues initially to a temporary buffer, the + * temporary buffer is drained anytime it holds >= BATCH_SIZE packets + * + * If there isn't space available in the temporary buffer, packets from the + * Rx queue aren't dequeued from the eth device, this back pressures the + * eth device, in virtual device environments this back pressure is relayed to + * the hypervisor's switching layer where adjustments can be made to deal with + * it. + */ +static inline uint32_t +rxa_poll(struct rte_event_eth_rx_adapter *rx_adapter) +{ + uint32_t num_queue; + uint32_t nb_rx = 0; + struct rte_eth_event_enqueue_buffer *buf; + uint32_t wrr_pos; + uint32_t max_nb_rx; + + wrr_pos = rx_adapter->wrr_pos; + max_nb_rx = rx_adapter->max_nb_rx; + buf = &rx_adapter->event_enqueue_buffer; + stats = &rx_adapter->stats; + + /* Iterate through a WRR sequence */ + for (num_queue = 0; num_queue < rx_adapter->wrr_len; num_queue++) { + unsigned int poll_idx = rx_adapter->wrr_sched[wrr_pos]; + uint16_t qid = rx_adapter->eth_rx_poll[poll_idx].eth_rx_qid; + uint16_t d = rx_adapter->eth_rx_poll[poll_idx].eth_dev_id; + + /* Don't do a batch dequeue from the rx queue if there isn't + * enough space in the enqueue buffer. + */ + if (buf->count >= BATCH_SIZE) + rxa_flush_event_buffer(rx_adapter); + if (BATCH_SIZE > (ETH_EVENT_BUFFER_SIZE - buf->count)) { + rx_adapter->wrr_pos = wrr_pos; + return nb_rx; + } + + nb_rx += rxa_eth_rx(rx_adapter, d, qid, nb_rx, max_nb_rx, + NULL); + if (nb_rx > max_nb_rx) { + rx_adapter->wrr_pos = + (wrr_pos + 1) % rx_adapter->wrr_len; + break; + } + + if (++wrr_pos == rx_adapter->wrr_len) + wrr_pos = 0; + } + return nb_rx; +} + +static int +rxa_service_func(void *args) +{ + struct rte_event_eth_rx_adapter *rx_adapter = args; + struct rte_event_eth_rx_adapter_stats *stats; + + if (rte_spinlock_trylock(&rx_adapter->rx_lock) == 0) + return 0; + if (!rx_adapter->rxa_started) { + return 0; + rte_spinlock_unlock(&rx_adapter->rx_lock); + } + + stats = &rx_adapter->stats; + stats->rx_packets += rxa_intr_ring_dequeue(rx_adapter); + stats->rx_packets += rxa_poll(rx_adapter); + rte_spinlock_unlock(&rx_adapter->rx_lock); + return 0; +} + +static int +rte_event_eth_rx_adapter_init(void) +{ + const char *name = "rte_event_eth_rx_adapter_array"; + const struct rte_memzone *mz; + unsigned int sz; + + sz = sizeof(*event_eth_rx_adapter) * + RTE_EVENT_ETH_RX_ADAPTER_MAX_INSTANCE; + sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE); + + mz = rte_memzone_lookup(name); + if (mz == NULL) { + mz = rte_memzone_reserve_aligned(name, sz, rte_socket_id(), 0, + RTE_CACHE_LINE_SIZE); + if (mz == NULL) { + RTE_EDEV_LOG_ERR("failed to reserve memzone err = %" + PRId32, rte_errno); + return -rte_errno; + } + } + + event_eth_rx_adapter = mz->addr; + return 0; +} + +static inline struct rte_event_eth_rx_adapter * +rxa_id_to_adapter(uint8_t id) +{ + return event_eth_rx_adapter ? 
+ event_eth_rx_adapter[id] : NULL; +} + +static int +rxa_default_conf_cb(uint8_t id, uint8_t dev_id, + struct rte_event_eth_rx_adapter_conf *conf, void *arg) +{ + int ret; + struct rte_eventdev *dev; + struct rte_event_dev_config dev_conf; + int started; + uint8_t port_id; + struct rte_event_port_conf *port_conf = arg; + struct rte_event_eth_rx_adapter *rx_adapter = rxa_id_to_adapter(id); + + dev = &rte_eventdevs[rx_adapter->eventdev_id]; + dev_conf = dev->data->dev_conf; + + started = dev->data->dev_started; + if (started) + rte_event_dev_stop(dev_id); + port_id = dev_conf.nb_event_ports; + dev_conf.nb_event_ports += 1; + ret = rte_event_dev_configure(dev_id, &dev_conf); + if (ret) { + RTE_EDEV_LOG_ERR("failed to configure event dev %u\n", + dev_id); + if (started) { + if (rte_event_dev_start(dev_id)) + return -EIO; + } + return ret; + } + + ret = rte_event_port_setup(dev_id, port_id, port_conf); + if (ret) { + RTE_EDEV_LOG_ERR("failed to setup event port %u\n", + port_id); + return ret; + } + + conf->event_port_id = port_id; + conf->max_nb_rx = 128; + if (started) + ret = rte_event_dev_start(dev_id); + rx_adapter->default_cb_arg = 1; + return ret; +} + +static int +rxa_epoll_create1(void) +{ +#if defined(LINUX) + int fd; + fd = epoll_create1(EPOLL_CLOEXEC); + return fd < 0 ? -errno : fd; +#elif defined(BSD) + return -ENOTSUP; +#endif +} + +static int +rxa_init_epd(struct rte_event_eth_rx_adapter *rx_adapter) +{ + if (rx_adapter->epd != INIT_FD) + return 0; + + rx_adapter->epd = rxa_epoll_create1(); + if (rx_adapter->epd < 0) { + int err = rx_adapter->epd; + rx_adapter->epd = INIT_FD; + RTE_EDEV_LOG_ERR("epoll_create1() failed, err %d", err); + return err; + } + + return 0; +} + +static int +rxa_create_intr_thread(struct rte_event_eth_rx_adapter *rx_adapter) +{ + int err; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + if (rx_adapter->intr_ring) + return 0; + + rx_adapter->intr_ring = rte_ring_create("intr_ring", + RTE_EVENT_ETH_INTR_RING_SIZE, + rte_socket_id(), 0); + if (!rx_adapter->intr_ring) + return -ENOMEM; + + rx_adapter->epoll_events = rte_zmalloc_socket(rx_adapter->mem_name, + RTE_EVENT_ETH_INTR_RING_SIZE * + sizeof(struct rte_epoll_event), + RTE_CACHE_LINE_SIZE, + rx_adapter->socket_id); + if (!rx_adapter->epoll_events) { + err = -ENOMEM; + goto error; + } + + rte_spinlock_init(&rx_adapter->intr_ring_lock); + + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, + "rx-intr-thread-%d", rx_adapter->id); + + err = rte_ctrl_thread_create(&rx_adapter->rx_intr_thread, thread_name, + NULL, rxa_intr_thread, rx_adapter); + if (!err) { + rte_thread_setname(rx_adapter->rx_intr_thread, thread_name); + return 0; + } + + RTE_EDEV_LOG_ERR("Failed to create interrupt thread err = %d\n", err); +error: + rte_ring_free(rx_adapter->intr_ring); + rx_adapter->intr_ring = NULL; + rx_adapter->epoll_events = NULL; + return err; +} + +static int +rxa_destroy_intr_thread(struct rte_event_eth_rx_adapter *rx_adapter) +{ + int err; + + err = pthread_cancel(rx_adapter->rx_intr_thread); + if (err) + RTE_EDEV_LOG_ERR("Can't cancel interrupt thread err = %d\n", + err); + + err = pthread_join(rx_adapter->rx_intr_thread, NULL); + if (err) + RTE_EDEV_LOG_ERR("Can't join interrupt thread err = %d\n", err); + + rte_free(rx_adapter->epoll_events); + rte_ring_free(rx_adapter->intr_ring); + rx_adapter->intr_ring = NULL; + rx_adapter->epoll_events = NULL; + return 0; +} + +static int +rxa_free_intr_resources(struct rte_event_eth_rx_adapter *rx_adapter) +{ + int ret; + + if (rx_adapter->num_rx_intr == 0) + return 0; + + ret 
= rxa_destroy_intr_thread(rx_adapter); + if (ret) + return ret; + + close(rx_adapter->epd); + rx_adapter->epd = INIT_FD; + + return ret; +} + +static int +rxa_disable_intr(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + uint16_t rx_queue_id) +{ + int err; + uint16_t eth_dev_id = dev_info->dev->data->port_id; + int sintr = rxa_shared_intr(dev_info, rx_queue_id); + + err = rte_eth_dev_rx_intr_disable(eth_dev_id, rx_queue_id); + if (err) { + RTE_EDEV_LOG_ERR("Could not disable interrupt for Rx queue %u", + rx_queue_id); + return err; + } + + err = rte_eth_dev_rx_intr_ctl_q(eth_dev_id, rx_queue_id, + rx_adapter->epd, + RTE_INTR_EVENT_DEL, + 0); + if (err) + RTE_EDEV_LOG_ERR("Interrupt event deletion failed %d", err); + + if (sintr) + dev_info->rx_queue[rx_queue_id].intr_enabled = 0; + else + dev_info->shared_intr_enabled = 0; + return err; +} + +static int +rxa_del_intr_queue(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id) +{ + int err; + int i; + int s; + + if (dev_info->nb_rx_intr == 0) + return 0; + + err = 0; + if (rx_queue_id == -1) { + s = dev_info->nb_shared_intr; + for (i = 0; i < dev_info->nb_rx_intr; i++) { + int sintr; + uint16_t q; + + q = dev_info->intr_queue[i]; + sintr = rxa_shared_intr(dev_info, q); + s -= sintr; + + if (!sintr || s == 0) { + + err = rxa_disable_intr(rx_adapter, dev_info, + q); + if (err) + return err; + rxa_intr_ring_del_entries(rx_adapter, dev_info, + q); + } + } + } else { + if (!rxa_intr_queue(dev_info, rx_queue_id)) + return 0; + if (!rxa_shared_intr(dev_info, rx_queue_id) || + dev_info->nb_shared_intr == 1) { + err = rxa_disable_intr(rx_adapter, dev_info, + rx_queue_id); + if (err) + return err; + rxa_intr_ring_del_entries(rx_adapter, dev_info, + rx_queue_id); + } + + for (i = 0; i < dev_info->nb_rx_intr; i++) { + if (dev_info->intr_queue[i] == rx_queue_id) { + for (; i < dev_info->nb_rx_intr - 1; i++) + dev_info->intr_queue[i] = + dev_info->intr_queue[i + 1]; + break; + } + } + } + + return err; +} + +static int +rxa_config_intr(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + uint16_t rx_queue_id) +{ + int err, err1; + uint16_t eth_dev_id = dev_info->dev->data->port_id; + union queue_data qd; + int init_fd; + uint16_t *intr_queue; + int sintr = rxa_shared_intr(dev_info, rx_queue_id); + + if (rxa_intr_queue(dev_info, rx_queue_id)) + return 0; + + intr_queue = dev_info->intr_queue; + if (dev_info->intr_queue == NULL) { + size_t len = + dev_info->dev->data->nb_rx_queues * sizeof(uint16_t); + dev_info->intr_queue = + rte_zmalloc_socket( + rx_adapter->mem_name, + len, + 0, + rx_adapter->socket_id); + if (dev_info->intr_queue == NULL) + return -ENOMEM; + } + + init_fd = rx_adapter->epd; + err = rxa_init_epd(rx_adapter); + if (err) + goto err_free_queue; + + qd.port = eth_dev_id; + qd.queue = rx_queue_id; + + err = rte_eth_dev_rx_intr_ctl_q(eth_dev_id, rx_queue_id, + rx_adapter->epd, + RTE_INTR_EVENT_ADD, + qd.ptr); + if (err) { + RTE_EDEV_LOG_ERR("Failed to add interrupt event for" + " Rx Queue %u err %d", rx_queue_id, err); + goto err_del_fd; + } + + err = rte_eth_dev_rx_intr_enable(eth_dev_id, rx_queue_id); + if (err) { + RTE_EDEV_LOG_ERR("Could not enable interrupt for" + " Rx Queue %u err %d", rx_queue_id, err); + + goto err_del_event; + } + + err = rxa_create_intr_thread(rx_adapter); + if (!err) { + if (sintr) + dev_info->shared_intr_enabled = 1; + else + dev_info->rx_queue[rx_queue_id].intr_enabled = 1; + return 0; + } + + + err = 
rte_eth_dev_rx_intr_disable(eth_dev_id, rx_queue_id); + if (err) + RTE_EDEV_LOG_ERR("Could not disable interrupt for" + " Rx Queue %u err %d", rx_queue_id, err); +err_del_event: + err1 = rte_eth_dev_rx_intr_ctl_q(eth_dev_id, rx_queue_id, + rx_adapter->epd, + RTE_INTR_EVENT_DEL, + 0); + if (err1) { + RTE_EDEV_LOG_ERR("Could not delete event for" + " Rx Queue %u err %d", rx_queue_id, err1); + } +err_del_fd: + if (init_fd == INIT_FD) { + close(rx_adapter->epd); + rx_adapter->epd = -1; + } +err_free_queue: + if (intr_queue == NULL) + rte_free(dev_info->intr_queue); + + return err; +} + +static int +rxa_add_intr_queue(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int rx_queue_id) + +{ + int i, j, err; + int si = -1; + int shared_done = (dev_info->nb_shared_intr > 0); + + if (rx_queue_id != -1) { + if (rxa_shared_intr(dev_info, rx_queue_id) && shared_done) + return 0; + return rxa_config_intr(rx_adapter, dev_info, rx_queue_id); + } + + err = 0; + for (i = 0; i < dev_info->dev->data->nb_rx_queues; i++) { + + if (rxa_shared_intr(dev_info, i) && shared_done) + continue; + + err = rxa_config_intr(rx_adapter, dev_info, i); + + shared_done = err == 0 && rxa_shared_intr(dev_info, i); + if (shared_done) { + si = i; + dev_info->shared_intr_enabled = 1; + } + if (err) + break; + } + + if (err == 0) + return 0; + + shared_done = (dev_info->nb_shared_intr > 0); + for (j = 0; j < i; j++) { + if (rxa_intr_queue(dev_info, j)) + continue; + if (rxa_shared_intr(dev_info, j) && si != j) + continue; + err = rxa_disable_intr(rx_adapter, dev_info, j); + if (err) + break; + + } + + return err; +} + + +static int +rxa_init_service(struct rte_event_eth_rx_adapter *rx_adapter, uint8_t id) +{ + int ret; + struct rte_service_spec service; + struct rte_event_eth_rx_adapter_conf rx_adapter_conf; + + if (rx_adapter->service_inited) + return 0; + + memset(&service, 0, sizeof(service)); + snprintf(service.name, ETH_RX_ADAPTER_SERVICE_NAME_LEN, + "rte_event_eth_rx_adapter_%d", id); + service.socket_id = rx_adapter->socket_id; + service.callback = rxa_service_func; + service.callback_userdata = rx_adapter; + /* Service function handles locking for queue add/del updates */ + service.capabilities = RTE_SERVICE_CAP_MT_SAFE; + ret = rte_service_component_register(&service, &rx_adapter->service_id); + if (ret) { + RTE_EDEV_LOG_ERR("failed to register service %s err = %" PRId32, + service.name, ret); + return ret; + } + + ret = rx_adapter->conf_cb(id, rx_adapter->eventdev_id, + &rx_adapter_conf, rx_adapter->conf_arg); + if (ret) { + RTE_EDEV_LOG_ERR("configuration callback failed err = %" PRId32, + ret); + goto err_done; + } + rx_adapter->event_port_id = rx_adapter_conf.event_port_id; + rx_adapter->max_nb_rx = rx_adapter_conf.max_nb_rx; + rx_adapter->service_inited = 1; + rx_adapter->epd = INIT_FD; + return 0; + +err_done: + rte_service_component_unregister(rx_adapter->service_id); + return ret; +} + +static void +rxa_update_queue(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int32_t rx_queue_id, + uint8_t add) +{ + struct eth_rx_queue_info *queue_info; + int enabled; + uint16_t i; + + if (dev_info->rx_queue == NULL) + return; + + if (rx_queue_id == -1) { + for (i = 0; i < dev_info->dev->data->nb_rx_queues; i++) + rxa_update_queue(rx_adapter, dev_info, i, add); + } else { + queue_info = &dev_info->rx_queue[rx_queue_id]; + enabled = queue_info->queue_enabled; + if (add) { + rx_adapter->nb_queues += !enabled; + dev_info->nb_dev_queues += !enabled; + } else { + 
rx_adapter->nb_queues -= enabled; + dev_info->nb_dev_queues -= enabled; + } + queue_info->queue_enabled = !!add; + } +} + +static void +rxa_sw_del(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int32_t rx_queue_id) +{ + int pollq; + int intrq; + int sintrq; + + + if (rx_adapter->nb_queues == 0) + return; + + if (rx_queue_id == -1) { + uint16_t nb_rx_queues; + uint16_t i; + + nb_rx_queues = dev_info->dev->data->nb_rx_queues; + for (i = 0; i < nb_rx_queues; i++) + rxa_sw_del(rx_adapter, dev_info, i); + return; + } + + pollq = rxa_polled_queue(dev_info, rx_queue_id); + intrq = rxa_intr_queue(dev_info, rx_queue_id); + sintrq = rxa_shared_intr(dev_info, rx_queue_id); + rxa_update_queue(rx_adapter, dev_info, rx_queue_id, 0); + rx_adapter->num_rx_polled -= pollq; + dev_info->nb_rx_poll -= pollq; + rx_adapter->num_rx_intr -= intrq; + dev_info->nb_rx_intr -= intrq; + dev_info->nb_shared_intr -= intrq && sintrq; +} + +static void +rxa_add_queue(struct rte_event_eth_rx_adapter *rx_adapter, + struct eth_device_info *dev_info, + int32_t rx_queue_id, + const struct rte_event_eth_rx_adapter_queue_conf *conf) +{ + struct eth_rx_queue_info *queue_info; + const struct rte_event *ev = &conf->ev; + int pollq; + int intrq; + int sintrq; + + if (rx_queue_id == -1) { + uint16_t nb_rx_queues; + uint16_t i; + + nb_rx_queues = dev_info->dev->data->nb_rx_queues; + for (i = 0; i < nb_rx_queues; i++) + rxa_add_queue(rx_adapter, dev_info, i, conf); + return; + } + + pollq = rxa_polled_queue(dev_info, rx_queue_id); + intrq = rxa_intr_queue(dev_info, rx_queue_id); + sintrq = rxa_shared_intr(dev_info, rx_queue_id); + + queue_info = &dev_info->rx_queue[rx_queue_id]; + queue_info->event_queue_id = ev->queue_id; + queue_info->sched_type = ev->sched_type; + queue_info->priority = ev->priority; + queue_info->wt = conf->servicing_weight; + + if (conf->rx_queue_flags & + RTE_EVENT_ETH_RX_ADAPTER_QUEUE_FLOW_ID_VALID) { + queue_info->flow_id = ev->flow_id; + queue_info->flow_id_mask = ~0; + } + + rxa_update_queue(rx_adapter, dev_info, rx_queue_id, 1); + if (rxa_polled_queue(dev_info, rx_queue_id)) { + rx_adapter->num_rx_polled += !pollq; + dev_info->nb_rx_poll += !pollq; + rx_adapter->num_rx_intr -= intrq; + dev_info->nb_rx_intr -= intrq; + dev_info->nb_shared_intr -= intrq && sintrq; + } + + if (rxa_intr_queue(dev_info, rx_queue_id)) { + rx_adapter->num_rx_polled -= pollq; + dev_info->nb_rx_poll -= pollq; + rx_adapter->num_rx_intr += !intrq; + dev_info->nb_rx_intr += !intrq; + dev_info->nb_shared_intr += !intrq && sintrq; + if (dev_info->nb_shared_intr == 1) { + if (dev_info->multi_intr_cap) + dev_info->next_q_idx = + RTE_MAX_RXTX_INTR_VEC_ID - 1; + else + dev_info->next_q_idx = 0; + } + } +} + +static int rxa_sw_add(struct rte_event_eth_rx_adapter *rx_adapter, + uint16_t eth_dev_id, + int rx_queue_id, + const struct rte_event_eth_rx_adapter_queue_conf *queue_conf) +{ + struct eth_device_info *dev_info = &rx_adapter->eth_devices[eth_dev_id]; + struct rte_event_eth_rx_adapter_queue_conf temp_conf; + int ret; + struct eth_rx_poll_entry *rx_poll; + struct eth_rx_queue_info *rx_queue; + uint32_t *rx_wrr; + uint16_t nb_rx_queues; + uint32_t nb_rx_poll, nb_wrr; + uint32_t nb_rx_intr; + int num_intr_vec; + uint16_t wt; + + if (queue_conf->servicing_weight == 0) { + struct rte_eth_dev_data *data = dev_info->dev->data; + + temp_conf = *queue_conf; + if (!data->dev_conf.intr_conf.rxq) { + /* If Rx interrupts are disabled set wt = 1 */ + temp_conf.servicing_weight = 1; + } + queue_conf = &temp_conf; + } + 
+ nb_rx_queues = dev_info->dev->data->nb_rx_queues; + rx_queue = dev_info->rx_queue; + wt = queue_conf->servicing_weight; + + if (dev_info->rx_queue == NULL) { + dev_info->rx_queue = + rte_zmalloc_socket(rx_adapter->mem_name, + nb_rx_queues * + sizeof(struct eth_rx_queue_info), 0, + rx_adapter->socket_id); + if (dev_info->rx_queue == NULL) + return -ENOMEM; + } + rx_wrr = NULL; + rx_poll = NULL; + + rxa_calc_nb_post_add(rx_adapter, dev_info, rx_queue_id, + queue_conf->servicing_weight, + &nb_rx_poll, &nb_rx_intr, &nb_wrr); + + if (dev_info->dev->intr_handle) + dev_info->multi_intr_cap = + rte_intr_cap_multiple(dev_info->dev->intr_handle); + + ret = rxa_alloc_poll_arrays(rx_adapter, nb_rx_poll, nb_wrr, + &rx_poll, &rx_wrr); + if (ret) + goto err_free_rxqueue; + + if (wt == 0) { + num_intr_vec = rxa_nb_intr_vect(dev_info, rx_queue_id, 1); + + ret = rxa_intr_ring_check_avail(rx_adapter, num_intr_vec); + if (ret) + goto err_free_rxqueue; + + ret = rxa_add_intr_queue(rx_adapter, dev_info, rx_queue_id); + if (ret) + goto err_free_rxqueue; + } else { + + num_intr_vec = 0; + if (rx_adapter->num_rx_intr > nb_rx_intr) { + num_intr_vec = rxa_nb_intr_vect(dev_info, + rx_queue_id, 0); + /* interrupt based queues are being converted to + * poll mode queues, delete the interrupt configuration + * for those. + */ + ret = rxa_del_intr_queue(rx_adapter, + dev_info, rx_queue_id); + if (ret) + goto err_free_rxqueue; + } + } + + if (nb_rx_intr == 0) { + ret = rxa_free_intr_resources(rx_adapter); + if (ret) + goto err_free_rxqueue; + } + + if (wt == 0) { + uint16_t i; + + if (rx_queue_id == -1) { + for (i = 0; i < dev_info->dev->data->nb_rx_queues; i++) + dev_info->intr_queue[i] = i; + } else { + if (!rxa_intr_queue(dev_info, rx_queue_id)) + dev_info->intr_queue[nb_rx_intr - 1] = + rx_queue_id; + } + } + + + + rxa_add_queue(rx_adapter, dev_info, rx_queue_id, queue_conf); + rxa_calc_wrr_sequence(rx_adapter, rx_poll, rx_wrr); + + rte_free(rx_adapter->eth_rx_poll); + rte_free(rx_adapter->wrr_sched); + + rx_adapter->eth_rx_poll = rx_poll; + rx_adapter->wrr_sched = rx_wrr; + rx_adapter->wrr_len = nb_wrr; + rx_adapter->num_intr_vec += num_intr_vec; + return 0; + +err_free_rxqueue: + if (rx_queue == NULL) { + rte_free(dev_info->rx_queue); + dev_info->rx_queue = NULL; + } + + rte_free(rx_poll); + rte_free(rx_wrr); + + return 0; +} + +static int +rxa_ctrl(uint8_t id, int start) +{ + struct rte_event_eth_rx_adapter *rx_adapter; + struct rte_eventdev *dev; + struct eth_device_info *dev_info; + uint32_t i; + int use_service = 0; + int stop = !start; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter == NULL) + return -EINVAL; + + dev = &rte_eventdevs[rx_adapter->eventdev_id]; + + RTE_ETH_FOREACH_DEV(i) { + dev_info = &rx_adapter->eth_devices[i]; + /* if start check for num dev queues */ + if (start && !dev_info->nb_dev_queues) + continue; + /* if stop check if dev has been started */ + if (stop && !dev_info->dev_rx_started) + continue; + use_service |= !dev_info->internal_event_port; + dev_info->dev_rx_started = start; + if (dev_info->internal_event_port == 0) + continue; + start ? 
(*dev->dev_ops->eth_rx_adapter_start)(dev, + &rte_eth_devices[i]) : + (*dev->dev_ops->eth_rx_adapter_stop)(dev, + &rte_eth_devices[i]); + } + + if (use_service) { + rte_spinlock_lock(&rx_adapter->rx_lock); + rx_adapter->rxa_started = start; + rte_service_runstate_set(rx_adapter->service_id, start); + rte_spinlock_unlock(&rx_adapter->rx_lock); + } + + return 0; +} + +int +rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id, + rte_event_eth_rx_adapter_conf_cb conf_cb, + void *conf_arg) +{ + struct rte_event_eth_rx_adapter *rx_adapter; + int ret; + int socket_id; + uint16_t i; + char mem_name[ETH_RX_ADAPTER_SERVICE_NAME_LEN]; + const uint8_t default_rss_key[] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, + }; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + if (conf_cb == NULL) + return -EINVAL; + + if (event_eth_rx_adapter == NULL) { + ret = rte_event_eth_rx_adapter_init(); + if (ret) + return ret; + } + + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter != NULL) { + RTE_EDEV_LOG_ERR("Eth Rx adapter exists id = %" PRIu8, id); + return -EEXIST; + } + + socket_id = rte_event_dev_socket_id(dev_id); + snprintf(mem_name, ETH_RX_ADAPTER_MEM_NAME_LEN, + "rte_event_eth_rx_adapter_%d", + id); + + rx_adapter = rte_zmalloc_socket(mem_name, sizeof(*rx_adapter), + RTE_CACHE_LINE_SIZE, socket_id); + if (rx_adapter == NULL) { + RTE_EDEV_LOG_ERR("failed to get mem for rx adapter"); + return -ENOMEM; + } + + rx_adapter->eventdev_id = dev_id; + rx_adapter->socket_id = socket_id; + rx_adapter->conf_cb = conf_cb; + rx_adapter->conf_arg = conf_arg; + rx_adapter->id = id; + strcpy(rx_adapter->mem_name, mem_name); + rx_adapter->eth_devices = rte_zmalloc_socket(rx_adapter->mem_name, + /* FIXME: incompatible with hotplug */ + rte_eth_dev_count_total() * + sizeof(struct eth_device_info), 0, + socket_id); + rte_convert_rss_key((const uint32_t *)default_rss_key, + (uint32_t *)rx_adapter->rss_key_be, + RTE_DIM(default_rss_key)); + + if (rx_adapter->eth_devices == NULL) { + RTE_EDEV_LOG_ERR("failed to get mem for eth devices\n"); + rte_free(rx_adapter); + return -ENOMEM; + } + rte_spinlock_init(&rx_adapter->rx_lock); + RTE_ETH_FOREACH_DEV(i) + rx_adapter->eth_devices[i].dev = &rte_eth_devices[i]; + + event_eth_rx_adapter[id] = rx_adapter; + if (conf_cb == rxa_default_conf_cb) + rx_adapter->default_cb_arg = 1; + return 0; +} + +int +rte_event_eth_rx_adapter_create(uint8_t id, uint8_t dev_id, + struct rte_event_port_conf *port_config) +{ + struct rte_event_port_conf *pc; + int ret; + + if (port_config == NULL) + return -EINVAL; + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + pc = rte_malloc(NULL, sizeof(*pc), 0); + if (pc == NULL) + return -ENOMEM; + *pc = *port_config; + ret = rte_event_eth_rx_adapter_create_ext(id, dev_id, + rxa_default_conf_cb, + pc); + if (ret) + rte_free(pc); + return ret; +} + +int +rte_event_eth_rx_adapter_free(uint8_t id) +{ + struct rte_event_eth_rx_adapter *rx_adapter; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter == NULL) + return -EINVAL; + + if (rx_adapter->nb_queues) { + RTE_EDEV_LOG_ERR("%" PRIu16 " Rx queues not deleted", + rx_adapter->nb_queues); + return -EBUSY; + } + + if (rx_adapter->default_cb_arg) + 
rte_free(rx_adapter->conf_arg); + rte_free(rx_adapter->eth_devices); + rte_free(rx_adapter); + event_eth_rx_adapter[id] = NULL; + + return 0; +} + +int +rte_event_eth_rx_adapter_queue_add(uint8_t id, + uint16_t eth_dev_id, + int32_t rx_queue_id, + const struct rte_event_eth_rx_adapter_queue_conf *queue_conf) +{ + int ret; + uint32_t cap; + struct rte_event_eth_rx_adapter *rx_adapter; + struct rte_eventdev *dev; + struct eth_device_info *dev_info; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL); + + rx_adapter = rxa_id_to_adapter(id); + if ((rx_adapter == NULL) || (queue_conf == NULL)) + return -EINVAL; + + dev = &rte_eventdevs[rx_adapter->eventdev_id]; + ret = rte_event_eth_rx_adapter_caps_get(rx_adapter->eventdev_id, + eth_dev_id, + &cap); + if (ret) { + RTE_EDEV_LOG_ERR("Failed to get adapter caps edev %" PRIu8 + "eth port %" PRIu16, id, eth_dev_id); + return ret; + } + + if ((cap & RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID) == 0 + && (queue_conf->rx_queue_flags & + RTE_EVENT_ETH_RX_ADAPTER_QUEUE_FLOW_ID_VALID)) { + RTE_EDEV_LOG_ERR("Flow ID override is not supported," + " eth port: %" PRIu16 " adapter id: %" PRIu8, + eth_dev_id, id); + return -EINVAL; + } + + if ((cap & RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ) == 0 && + (rx_queue_id != -1)) { + RTE_EDEV_LOG_ERR("Rx queues can only be connected to single " + "event queue, eth port: %" PRIu16 " adapter id: %" + PRIu8, eth_dev_id, id); + return -EINVAL; + } + + if (rx_queue_id != -1 && (uint16_t)rx_queue_id >= + rte_eth_devices[eth_dev_id].data->nb_rx_queues) { + RTE_EDEV_LOG_ERR("Invalid rx queue_id %" PRIu16, + (uint16_t)rx_queue_id); + return -EINVAL; + } + + dev_info = &rx_adapter->eth_devices[eth_dev_id]; + + if (cap & RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->eth_rx_adapter_queue_add, + -ENOTSUP); + if (dev_info->rx_queue == NULL) { + dev_info->rx_queue = + rte_zmalloc_socket(rx_adapter->mem_name, + dev_info->dev->data->nb_rx_queues * + sizeof(struct eth_rx_queue_info), 0, + rx_adapter->socket_id); + if (dev_info->rx_queue == NULL) + return -ENOMEM; + } + + ret = (*dev->dev_ops->eth_rx_adapter_queue_add)(dev, + &rte_eth_devices[eth_dev_id], + rx_queue_id, queue_conf); + if (ret == 0) { + dev_info->internal_event_port = 1; + rxa_update_queue(rx_adapter, + &rx_adapter->eth_devices[eth_dev_id], + rx_queue_id, + 1); + } + } else { + rte_spinlock_lock(&rx_adapter->rx_lock); + dev_info->internal_event_port = 0; + ret = rxa_init_service(rx_adapter, id); + if (ret == 0) { + uint32_t service_id = rx_adapter->service_id; + ret = rxa_sw_add(rx_adapter, eth_dev_id, rx_queue_id, + queue_conf); + rte_service_component_runstate_set(service_id, + rxa_sw_adapter_queue_count(rx_adapter)); + } + rte_spinlock_unlock(&rx_adapter->rx_lock); + } + + if (ret) + return ret; + + return 0; +} + +int +rte_event_eth_rx_adapter_queue_del(uint8_t id, uint16_t eth_dev_id, + int32_t rx_queue_id) +{ + int ret = 0; + struct rte_eventdev *dev; + struct rte_event_eth_rx_adapter *rx_adapter; + struct eth_device_info *dev_info; + uint32_t cap; + uint32_t nb_rx_poll = 0; + uint32_t nb_wrr = 0; + uint32_t nb_rx_intr; + struct eth_rx_poll_entry *rx_poll = NULL; + uint32_t *rx_wrr = NULL; + int num_intr_vec; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL); + + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter == NULL) + return -EINVAL; + + dev = &rte_eventdevs[rx_adapter->eventdev_id]; 
+ ret = rte_event_eth_rx_adapter_caps_get(rx_adapter->eventdev_id, + eth_dev_id, + &cap); + if (ret) + return ret; + + if (rx_queue_id != -1 && (uint16_t)rx_queue_id >= + rte_eth_devices[eth_dev_id].data->nb_rx_queues) { + RTE_EDEV_LOG_ERR("Invalid rx queue_id %" PRIu16, + (uint16_t)rx_queue_id); + return -EINVAL; + } + + dev_info = &rx_adapter->eth_devices[eth_dev_id]; + + if (cap & RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->eth_rx_adapter_queue_del, + -ENOTSUP); + ret = (*dev->dev_ops->eth_rx_adapter_queue_del)(dev, + &rte_eth_devices[eth_dev_id], + rx_queue_id); + if (ret == 0) { + rxa_update_queue(rx_adapter, + &rx_adapter->eth_devices[eth_dev_id], + rx_queue_id, + 0); + if (dev_info->nb_dev_queues == 0) { + rte_free(dev_info->rx_queue); + dev_info->rx_queue = NULL; + } + } + } else { + rxa_calc_nb_post_del(rx_adapter, dev_info, rx_queue_id, + &nb_rx_poll, &nb_rx_intr, &nb_wrr); + + ret = rxa_alloc_poll_arrays(rx_adapter, nb_rx_poll, nb_wrr, + &rx_poll, &rx_wrr); + if (ret) + return ret; + + rte_spinlock_lock(&rx_adapter->rx_lock); + + num_intr_vec = 0; + if (rx_adapter->num_rx_intr > nb_rx_intr) { + + num_intr_vec = rxa_nb_intr_vect(dev_info, + rx_queue_id, 0); + ret = rxa_del_intr_queue(rx_adapter, dev_info, + rx_queue_id); + if (ret) + goto unlock_ret; + } + + if (nb_rx_intr == 0) { + ret = rxa_free_intr_resources(rx_adapter); + if (ret) + goto unlock_ret; + } + + rxa_sw_del(rx_adapter, dev_info, rx_queue_id); + rxa_calc_wrr_sequence(rx_adapter, rx_poll, rx_wrr); + + rte_free(rx_adapter->eth_rx_poll); + rte_free(rx_adapter->wrr_sched); + + if (nb_rx_intr == 0) { + rte_free(dev_info->intr_queue); + dev_info->intr_queue = NULL; + } + + rx_adapter->eth_rx_poll = rx_poll; + rx_adapter->wrr_sched = rx_wrr; + rx_adapter->wrr_len = nb_wrr; + rx_adapter->num_intr_vec += num_intr_vec; + + if (dev_info->nb_dev_queues == 0) { + rte_free(dev_info->rx_queue); + dev_info->rx_queue = NULL; + } +unlock_ret: + rte_spinlock_unlock(&rx_adapter->rx_lock); + if (ret) { + rte_free(rx_poll); + rte_free(rx_wrr); + return ret; + } + + rte_service_component_runstate_set(rx_adapter->service_id, + rxa_sw_adapter_queue_count(rx_adapter)); + } + + return ret; +} + +int +rte_event_eth_rx_adapter_start(uint8_t id) +{ + return rxa_ctrl(id, 1); +} + +int +rte_event_eth_rx_adapter_stop(uint8_t id) +{ + return rxa_ctrl(id, 0); +} + +int +rte_event_eth_rx_adapter_stats_get(uint8_t id, + struct rte_event_eth_rx_adapter_stats *stats) +{ + struct rte_event_eth_rx_adapter *rx_adapter; + struct rte_event_eth_rx_adapter_stats dev_stats_sum = { 0 }; + struct rte_event_eth_rx_adapter_stats dev_stats; + struct rte_eventdev *dev; + struct eth_device_info *dev_info; + uint32_t i; + int ret; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter == NULL || stats == NULL) + return -EINVAL; + + dev = &rte_eventdevs[rx_adapter->eventdev_id]; + memset(stats, 0, sizeof(*stats)); + RTE_ETH_FOREACH_DEV(i) { + dev_info = &rx_adapter->eth_devices[i]; + if (dev_info->internal_event_port == 0 || + dev->dev_ops->eth_rx_adapter_stats_get == NULL) + continue; + ret = (*dev->dev_ops->eth_rx_adapter_stats_get)(dev, + &rte_eth_devices[i], + &dev_stats); + if (ret) + continue; + dev_stats_sum.rx_packets += dev_stats.rx_packets; + dev_stats_sum.rx_enq_count += dev_stats.rx_enq_count; + } + + if (rx_adapter->service_inited) + *stats = rx_adapter->stats; + + stats->rx_packets += dev_stats_sum.rx_packets; + stats->rx_enq_count += 
dev_stats_sum.rx_enq_count; + return 0; +} + +int +rte_event_eth_rx_adapter_stats_reset(uint8_t id) +{ + struct rte_event_eth_rx_adapter *rx_adapter; + struct rte_eventdev *dev; + struct eth_device_info *dev_info; + uint32_t i; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter == NULL) + return -EINVAL; + + dev = &rte_eventdevs[rx_adapter->eventdev_id]; + RTE_ETH_FOREACH_DEV(i) { + dev_info = &rx_adapter->eth_devices[i]; + if (dev_info->internal_event_port == 0 || + dev->dev_ops->eth_rx_adapter_stats_reset == NULL) + continue; + (*dev->dev_ops->eth_rx_adapter_stats_reset)(dev, + &rte_eth_devices[i]); + } + + memset(&rx_adapter->stats, 0, sizeof(rx_adapter->stats)); + return 0; +} + +int +rte_event_eth_rx_adapter_service_id_get(uint8_t id, uint32_t *service_id) +{ + struct rte_event_eth_rx_adapter *rx_adapter; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter == NULL || service_id == NULL) + return -EINVAL; + + if (rx_adapter->service_inited) + *service_id = rx_adapter->service_id; + + return rx_adapter->service_inited ? 0 : -ESRCH; +} + +int rte_event_eth_rx_adapter_cb_register(uint8_t id, + uint16_t eth_dev_id, + rte_event_eth_rx_adapter_cb_fn cb_fn, + void *cb_arg) +{ + struct rte_event_eth_rx_adapter *rx_adapter; + struct eth_device_info *dev_info; + uint32_t cap; + int ret; + + RTE_EVENT_ETH_RX_ADAPTER_ID_VALID_OR_ERR_RET(id, -EINVAL); + RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_dev_id, -EINVAL); + + rx_adapter = rxa_id_to_adapter(id); + if (rx_adapter == NULL) + return -EINVAL; + + dev_info = &rx_adapter->eth_devices[eth_dev_id]; + if (dev_info->rx_queue == NULL) + return -EINVAL; + + ret = rte_event_eth_rx_adapter_caps_get(rx_adapter->eventdev_id, + eth_dev_id, + &cap); + if (ret) { + RTE_EDEV_LOG_ERR("Failed to get adapter caps edev %" PRIu8 + "eth port %" PRIu16, id, eth_dev_id); + return ret; + } + + if (cap & RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT) { + RTE_EDEV_LOG_ERR("Rx callback not supported for eth port %" + PRIu16, eth_dev_id); + return -EINVAL; + } + + rte_spinlock_lock(&rx_adapter->rx_lock); + dev_info->cb_fn = cb_fn; + dev_info->cb_arg = cb_arg; + rte_spinlock_unlock(&rx_adapter->rx_lock); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_eth_rx_adapter.h b/src/spdk/dpdk/lib/librte_eventdev/rte_event_eth_rx_adapter.h new file mode 100644 index 00000000..332ee216 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_eth_rx_adapter.h @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation. + * All rights reserved. + */ + +#ifndef _RTE_EVENT_ETH_RX_ADAPTER_ +#define _RTE_EVENT_ETH_RX_ADAPTER_ + +/** + * @file + * + * RTE Event Ethernet Rx Adapter + * + * An eventdev-based packet processing application enqueues/dequeues mbufs + * to/from the event device. Packet flow from the ethernet device to the event + * device can be accomplished using either HW or SW mechanisms depending on the + * platform and the particular combination of ethernet and event devices. The + * event ethernet Rx adapter provides common APIs to configure the packet flow + * from the ethernet devices to event devices across both these transfer + * mechanisms. + * + * The adapter uses a EAL service core function for SW based packet transfer + * and uses the eventdev PMD functions to configure HW based packet transfer + * between the ethernet device and the event device. 
For SW based packet + * transfer, if the mbuf does not have a timestamp set, the adapter adds a + * timestamp to the mbuf using rte_get_tsc_cycles(), this provides a more + * accurate timestamp as compared to if the application were to set the time + * stamp since it avoids event device schedule latency. + * + * The ethernet Rx event adapter's functions are: + * - rte_event_eth_rx_adapter_create_ext() + * - rte_event_eth_rx_adapter_create() + * - rte_event_eth_rx_adapter_free() + * - rte_event_eth_rx_adapter_queue_add() + * - rte_event_eth_rx_adapter_queue_del() + * - rte_event_eth_rx_adapter_start() + * - rte_event_eth_rx_adapter_stop() + * - rte_event_eth_rx_adapter_stats_get() + * - rte_event_eth_rx_adapter_stats_reset() + * + * The application creates an ethernet to event adapter using + * rte_event_eth_rx_adapter_create_ext() or rte_event_eth_rx_adapter_create() + * functions. + * The adapter needs to know which ethernet rx queues to poll for mbufs as well + * as event device parameters such as the event queue identifier, event + * priority and scheduling type that the adapter should use when constructing + * events. The rte_event_eth_rx_adapter_queue_add() function is provided for + * this purpose. + * The servicing weight parameter in the rte_event_eth_rx_adapter_queue_conf + * is applicable when the Rx adapter uses a service core function and is + * intended to provide application control of the frequency of polling ethernet + * device receive queues, for example, the application may want to poll higher + * priority queues with a higher frequency but at the same time not starve + * lower priority queues completely. If this parameter is zero and the receive + * interrupt is enabled when configuring the device, the receive queue is + * interrupt driven; else, the queue is assigned a servicing weight of one. + * + * The application can start/stop the adapter using the + * rte_event_eth_rx_adapter_start() and the rte_event_eth_rx_adapter_stop() + * functions. If the adapter uses a rte_service function, then the application + * is also required to assign a core to the service function and control the + * service core using the rte_service APIs. The + * rte_event_eth_rx_adapter_service_id_get() function can be used to retrieve + * the service function ID of the adapter in this case. + * + * For SW based packet transfers, i.e., when the + * RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT is not set in the adapter's + * capabilities flags for a particular ethernet device, the service function + * temporarily enqueues mbufs to an event buffer before batch enqueueing these + * to the event device. If the buffer fills up, the service function stops + * dequeueing packets from the ethernet device. The application may want to + * monitor the buffer fill level and instruct the service function to + * selectively buffer packets. The application may also use some other + * criteria to decide which packets should enter the event device even when + * the event buffer fill level is low. The + * rte_event_eth_rx_adapter_cb_register() function allows the + * application to register a callback that selects which packets to enqueue + * to the event device. + * + * Note: + * 1) Devices created after an instance of rte_event_eth_rx_adapter_create + * should be added to a new instance of the rx adapter. 
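The flow described above (create the adapter, add Rx queues together with their event parameters, map the adapter's service to a core, then start it) can be summarized in a short sketch. It assumes the event device ev_id and ethernet port eth_port are already configured and started; the function name, ids and port limits are illustrative and error handling is reduced to early returns:

	#include <string.h>
	#include <rte_eventdev.h>
	#include <rte_event_eth_rx_adapter.h>
	#include <rte_service.h>

	static int
	rx_adapter_setup_sketch(uint8_t id, uint8_t ev_id, uint16_t eth_port,
				uint8_t ev_queue_id, uint32_t service_lcore)
	{
		struct rte_event_port_conf port_conf;
		struct rte_event_eth_rx_adapter_queue_conf qconf;
		uint32_t service_id;
		int ret;

		/* Illustrative limits for the event port the adapter creates. */
		memset(&port_conf, 0, sizeof(port_conf));
		port_conf.new_event_threshold = 1024;
		port_conf.dequeue_depth = 16;
		port_conf.enqueue_depth = 16;

		/* Default conf_cb: the adapter sets up its own event port. */
		ret = rte_event_eth_rx_adapter_create(id, ev_id, &port_conf);
		if (ret)
			return ret;

		/* Event parameters used for packets from these Rx queues. */
		memset(&qconf, 0, sizeof(qconf));
		qconf.servicing_weight = 1;
		qconf.ev.queue_id = ev_queue_id;
		qconf.ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
		qconf.ev.priority = RTE_EVENT_DEV_PRIORITY_NORMAL;

		/* rx_queue_id of -1 adds every Rx queue configured on the port. */
		ret = rte_event_eth_rx_adapter_queue_add(id, eth_port, -1, &qconf);
		if (ret)
			return ret;

		/* SW transfers only: run the adapter's service on a service
		 * lcore (assumed already enabled with rte_service_lcore_add()
		 * and rte_service_lcore_start()). The adapter itself sets the
		 * service run state when it is started.
		 */
		if (rte_event_eth_rx_adapter_service_id_get(id, &service_id) == 0)
			rte_service_map_lcore_set(service_id, service_lcore, 1);

		return rte_event_eth_rx_adapter_start(id);
	}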
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include + +#include "rte_eventdev.h" + +#define RTE_EVENT_ETH_RX_ADAPTER_MAX_INSTANCE 32 + +/* struct rte_event_eth_rx_adapter_queue_conf flags definitions */ +#define RTE_EVENT_ETH_RX_ADAPTER_QUEUE_FLOW_ID_VALID 0x1 +/**< This flag indicates the flow identifier is valid + * @see rte_event_eth_rx_adapter_queue_conf::rx_queue_flags + */ + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Adapter configuration structure that the adapter configuration callback + * function is expected to fill out + * @see rte_event_eth_rx_adapter_conf_cb + */ +struct rte_event_eth_rx_adapter_conf { + uint8_t event_port_id; + /**< Event port identifier, the adapter enqueues mbuf events to this + * port. + */ + uint32_t max_nb_rx; + /**< The adapter can return early if it has processed at least + * max_nb_rx mbufs. This isn't treated as a requirement; batching may + * cause the adapter to process more than max_nb_rx mbufs. + */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Function type used for adapter configuration callback. The callback is + * used to fill in members of the struct rte_event_eth_rx_adapter_conf, this + * callback is invoked when creating a SW service for packet transfer from + * ethdev queues to the event device. The SW service is created within the + * rte_event_eth_rx_adapter_queue_add() function if SW based packet transfers + * from ethdev queues to the event device are required. + * + * @param id + * Adapter identifier. + * + * @param dev_id + * Event device identifier. + * + * @param [out] conf + * Structure that needs to be populated by this callback. + * + * @param arg + * Argument to the callback. This is the same as the conf_arg passed to the + * rte_event_eth_rx_adapter_create_ext(). + */ +typedef int (*rte_event_eth_rx_adapter_conf_cb) (uint8_t id, uint8_t dev_id, + struct rte_event_eth_rx_adapter_conf *conf, + void *arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Rx queue configuration structure + */ +struct rte_event_eth_rx_adapter_queue_conf { + uint32_t rx_queue_flags; + /**< Flags for handling received packets + * @see RTE_EVENT_ETH_RX_ADAPTER_QUEUE_FLOW_ID_VALID + */ + uint16_t servicing_weight; + /**< Relative polling frequency of ethernet receive queue when the + * adapter uses a service core function for ethernet to event device + * transfers. If it is set to zero, the Rx queue is interrupt driven + * (unless rx queue interrupts are not enabled for the ethernet + * device). + */ + struct rte_event ev; + /**< + * The values from the following event fields will be used when + * queuing mbuf events: + * - event_queue_id: Targeted event queue ID for received packets. + * - event_priority: Event priority of packets from this Rx queue in + * the event queue relative to other events. + * - sched_type: Scheduling type for packets from this Rx queue. + * - flow_id: If the RTE_ETH_RX_EVENT_ADAPTER_QUEUE_FLOW_ID_VALID bit + * is set in rx_queue_flags, this flow_id is used for all + * packets received from this queue. Otherwise the flow ID + * is set to the RSS hash of the src and dst IPv4/6 + * addresses. + * + * The event adapter sets ev.event_type to RTE_EVENT_TYPE_ETHDEV in the + * enqueued event. + */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * A structure used to retrieve statistics for an eth rx adapter instance. 
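Two queue-level options from the configuration structure above are easy to miss: a fixed flow id, which is only honoured when the FLOW_ID_VALID flag is set and the eventdev reports the flow-id override capability, and a servicing weight of zero, which selects interrupt mode when Rx interrupts are enabled on the ethdev. A hedged sketch with illustrative ids and values:

	#include <string.h>
	#include <rte_eventdev.h>
	#include <rte_event_eth_rx_adapter.h>

	static int
	rx_queue_conf_sketch(uint8_t id, uint16_t eth_port)
	{
		struct rte_event_eth_rx_adapter_queue_conf qc;
		int ret;

		/* Rx queue 0: polled with weight 2, every packet forced onto
		 * one flow id. Requires the eventdev to advertise
		 * RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID, otherwise
		 * queue_add rejects the flag with -EINVAL.
		 */
		memset(&qc, 0, sizeof(qc));
		qc.rx_queue_flags = RTE_EVENT_ETH_RX_ADAPTER_QUEUE_FLOW_ID_VALID;
		qc.servicing_weight = 2;
		qc.ev.queue_id = 0;
		qc.ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
		qc.ev.flow_id = 0x1234;		/* used instead of the RSS hash */
		ret = rte_event_eth_rx_adapter_queue_add(id, eth_port, 0, &qc);
		if (ret)
			return ret;

		/* Rx queue 1: servicing weight 0 selects interrupt mode,
		 * provided rxq interrupts were enabled when the ethdev was
		 * configured; otherwise the adapter falls back to weight 1.
		 */
		memset(&qc, 0, sizeof(qc));
		qc.servicing_weight = 0;
		qc.ev.queue_id = 1;
		qc.ev.sched_type = RTE_SCHED_TYPE_PARALLEL;
		return rte_event_eth_rx_adapter_queue_add(id, eth_port, 1, &qc);
	}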
+ */ +struct rte_event_eth_rx_adapter_stats { + uint64_t rx_poll_count; + /**< Receive queue poll count */ + uint64_t rx_packets; + /**< Received packet count */ + uint64_t rx_enq_count; + /**< Eventdev enqueue count */ + uint64_t rx_enq_retry; + /**< Eventdev enqueue retry count */ + uint64_t rx_enq_start_ts; + /**< Rx enqueue start timestamp */ + uint64_t rx_enq_block_cycles; + /**< Cycles for which the service is blocked by the event device, + * i.e, the service fails to enqueue to the event device. + */ + uint64_t rx_enq_end_ts; + /**< Latest timestamp at which the service is unblocked + * by the event device. The start, end timestamps and + * block cycles can be used to compute the percentage of + * cycles the service is blocked by the event device. + */ + uint64_t rx_intr_packets; + /**< Received packet count for interrupt mode Rx queues */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Callback function invoked by the SW adapter before it continues + * to process packets. The callback is passed the size of the enqueue + * buffer in the SW adapter and the occupancy of the buffer. The + * callback can use these values to decide which mbufs should be + * enqueued to the event device. If the return value of the callback + * is less than nb_mbuf then the SW adapter uses the return value to + * enqueue enq_mbuf[] to the event device. + * + * @param eth_dev_id + * Port identifier of the Ethernet device. + * @param queue_id + * Receive queue index. + * @param enqueue_buf_size + * Total enqueue buffer size. + * @param enqueue_buf_count + * mbuf count in enqueue buffer. + * @param mbuf + * mbuf array. + * @param nb_mbuf + * mbuf count. + * @param cb_arg + * Callback argument. + * @param[out] enq_mbuf + * The adapter enqueues enq_mbuf[] if the return value of the + * callback is less than nb_mbuf + * @return + * Returns the number of mbufs should be enqueued to eventdev + */ +typedef uint16_t (*rte_event_eth_rx_adapter_cb_fn)(uint16_t eth_dev_id, + uint16_t queue_id, + uint32_t enqueue_buf_size, + uint32_t enqueue_buf_count, + struct rte_mbuf **mbuf, + uint16_t nb_mbuf, + void *cb_arg, + struct rte_mbuf **enq_buf); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create a new ethernet Rx event adapter with the specified identifier. + * + * @param id + * The identifier of the ethernet Rx event adapter. + * + * @param dev_id + * The identifier of the device to configure. + * + * @param conf_cb + * Callback function that fills in members of a + * struct rte_event_eth_rx_adapter_conf struct passed into + * it. + * + * @param conf_arg + * Argument that is passed to the conf_cb function. + * + * @return + * - 0: Success + * - <0: Error code on failure + */ +int rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id, + rte_event_eth_rx_adapter_conf_cb conf_cb, + void *conf_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create a new ethernet Rx event adapter with the specified identifier. + * This function uses an internal configuration function that creates an event + * port. This default function reconfigures the event device with an + * additional event port and setups up the event port using the port_config + * parameter passed into this function. In case the application needs more + * control in configuration of the service, it should use the + * rte_event_eth_rx_adapter_create_ext() version. 
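Where the default behaviour of rte_event_eth_rx_adapter_create() (reconfiguring the event device with one extra port) is not acceptable, the application can pass its own configuration callback to rte_event_eth_rx_adapter_create_ext() and hand the adapter a pre-allocated event port. A minimal sketch; APP_RX_ADAPTER_PORT and the function names are hypothetical and assume the application reserved that event port when it configured the eventdev:

	#include <rte_common.h>
	#include <rte_event_eth_rx_adapter.h>

	#define APP_RX_ADAPTER_PORT 3	/* illustrative, reserved at configure time */

	static int
	app_rx_adapter_conf_cb(uint8_t id, uint8_t dev_id,
			       struct rte_event_eth_rx_adapter_conf *conf,
			       void *arg)
	{
		RTE_SET_USED(id);
		RTE_SET_USED(dev_id);
		RTE_SET_USED(arg);

		conf->event_port_id = APP_RX_ADAPTER_PORT;
		conf->max_nb_rx = 128;	/* soft cap per service invocation */
		return 0;
	}

	static int
	app_rx_adapter_create(uint8_t id, uint8_t dev_id)
	{
		return rte_event_eth_rx_adapter_create_ext(id, dev_id,
							   app_rx_adapter_conf_cb,
							   NULL);
	}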
+ * + * @param id + * The identifier of the ethernet Rx event adapter. + * + * @param dev_id + * The identifier of the device to configure. + * + * @param port_config + * Argument of type *rte_event_port_conf* that is passed to the conf_cb + * function. + * + * @return + * - 0: Success + * - <0: Error code on failure + */ +int rte_event_eth_rx_adapter_create(uint8_t id, uint8_t dev_id, + struct rte_event_port_conf *port_config); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Free an event adapter + * + * @param id + * Adapter identifier. + * + * @return + * - 0: Success + * - <0: Error code on failure, If the adapter still has Rx queues + * added to it, the function returns -EBUSY. + */ +int rte_event_eth_rx_adapter_free(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Add receive queue to an event adapter. After a queue has been + * added to the event adapter, the result of the application calling + * rte_eth_rx_burst(eth_dev_id, rx_queue_id, ..) is undefined. + * + * @param id + * Adapter identifier. + * + * @param eth_dev_id + * Port identifier of Ethernet device. + * + * @param rx_queue_id + * Ethernet device receive queue index. + * If rx_queue_id is -1, then all Rx queues configured for + * the device are added. If the ethdev Rx queues can only be + * connected to a single event queue then rx_queue_id is + * required to be -1. + * @see RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ + * + * @param conf + * Additional configuration structure of type *rte_event_eth_rx_adapter_conf* + * + * @return + * - 0: Success, Receive queue added correctly. + * - <0: Error code on failure. + * - (-EIO) device reconfiguration and restart error. The adapter reconfigures + * the event device with an additional port if it is required to use a service + * function for packet transfer from the ethernet device to the event device. + * If the device had been started before this call, this error code indicates + * an error in restart following an error in reconfiguration, i.e., a + * combination of the two error codes. + */ +int rte_event_eth_rx_adapter_queue_add(uint8_t id, + uint16_t eth_dev_id, + int32_t rx_queue_id, + const struct rte_event_eth_rx_adapter_queue_conf *conf); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Delete receive queue from an event adapter. + * + * @param id + * Adapter identifier. + * + * @param eth_dev_id + * Port identifier of Ethernet device. + * + * @param rx_queue_id + * Ethernet device receive queue index. + * If rx_queue_id is -1, then all Rx queues configured for + * the device are deleted. If the ethdev Rx queues can only be + * connected to a single event queue then rx_queue_id is + * required to be -1. + * @see RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ + * + * @return + * - 0: Success, Receive queue deleted correctly. + * - <0: Error code on failure. + */ +int rte_event_eth_rx_adapter_queue_del(uint8_t id, uint16_t eth_dev_id, + int32_t rx_queue_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Start ethernet Rx event adapter + * + * @param id + * Adapter identifier. + * + * @return + * - 0: Success, Adapter started correctly. + * - <0: Error code on failure. + */ +int rte_event_eth_rx_adapter_start(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Stop ethernet Rx event adapter + * + * @param id + * Adapter identifier. 
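Teardown mirrors setup: stop the adapter, delete the queues that were added (a rx_queue_id of -1 removes them all), then free it; as noted above, rte_event_eth_rx_adapter_free() returns -EBUSY while Rx queues are still attached. A short sketch with an illustrative function name:

	#include <rte_event_eth_rx_adapter.h>

	static int
	rx_adapter_teardown_sketch(uint8_t id, uint16_t eth_port)
	{
		int ret;

		ret = rte_event_eth_rx_adapter_stop(id);
		if (ret)
			return ret;

		/* -1 deletes every Rx queue of eth_port previously added. */
		ret = rte_event_eth_rx_adapter_queue_del(id, eth_port, -1);
		if (ret)
			return ret;

		/* Fails with -EBUSY if any Rx queue is still attached. */
		return rte_event_eth_rx_adapter_free(id);
	}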
+ * + * @return + * - 0: Success, Adapter started correctly. + * - <0: Error code on failure. + */ +int rte_event_eth_rx_adapter_stop(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve statistics for an adapter + * + * @param id + * Adapter identifier. + * + * @param [out] stats + * A pointer to structure used to retrieve statistics for an adapter. + * + * @return + * - 0: Success, retrieved successfully. + * - <0: Error code on failure. + */ +int rte_event_eth_rx_adapter_stats_get(uint8_t id, + struct rte_event_eth_rx_adapter_stats *stats); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reset statistics for an adapter. + * + * @param id + * Adapter identifier. + * + * @return + * - 0: Success, statistics reset successfully. + * - <0: Error code on failure. + */ +int rte_event_eth_rx_adapter_stats_reset(uint8_t id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the service ID of an adapter. If the adapter doesn't use + * a rte_service function, this function returns -ESRCH. + * + * @param id + * Adapter identifier. + * + * @param [out] service_id + * A pointer to a uint32_t, to be filled in with the service id. + * + * @return + * - 0: Success + * - <0: Error code on failure, if the adapter doesn't use a rte_service + * function, this function returns -ESRCH. + */ +int rte_event_eth_rx_adapter_service_id_get(uint8_t id, uint32_t *service_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register callback to process Rx packets, this is supported for + * SW based packet transfers. + * @see rte_event_eth_rx_cb_fn + * + * @param id + * Adapter identifier. + * @param eth_dev_id + * Port identifier of Ethernet device. + * @param cb_fn + * Callback function. + * @param cb_arg + * Callback arg. + * @return + * - 0: Success + * - <0: Error code on failure. 
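To make the filtering hook described earlier concrete, here is a hedged sketch of a callback that sheds load when the adapter's enqueue buffer is filling up, together with its registration. The names are illustrative; whether mbufs that are not passed on through enq_buf must be freed by the callback is an assumption noted in the comment, since the API description above leaves that to the application:

	#include <rte_common.h>
	#include <rte_mbuf.h>
	#include <rte_event_eth_rx_adapter.h>

	/* Keep every packet while the enqueue buffer has head room; once it
	 * is more than half full, keep only half of the new burst.
	 * NOTE: this sketch assumes packets not forwarded via enq_buf are
	 * the callback's responsibility to free.
	 */
	static uint16_t
	app_rx_cb(uint16_t eth_dev_id, uint16_t queue_id,
		  uint32_t enqueue_buf_size, uint32_t enqueue_buf_count,
		  struct rte_mbuf **mbuf, uint16_t nb_mbuf,
		  void *cb_arg, struct rte_mbuf **enq_buf)
	{
		uint16_t keep = nb_mbuf;
		uint16_t i;

		RTE_SET_USED(eth_dev_id);
		RTE_SET_USED(queue_id);
		RTE_SET_USED(cb_arg);

		if (enqueue_buf_count > enqueue_buf_size / 2)
			keep = nb_mbuf / 2;

		for (i = 0; i < keep; i++)
			enq_buf[i] = mbuf[i];
		for (; i < nb_mbuf; i++)
			rte_pktmbuf_free(mbuf[i]);

		return keep;
	}

	/* Registration; only valid for SW transfers (no internal port). */
	static int
	app_rx_cb_register(uint8_t id, uint16_t eth_port)
	{
		return rte_event_eth_rx_adapter_cb_register(id, eth_port,
							    app_rx_cb, NULL);
	}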
+ */ +int __rte_experimental +rte_event_eth_rx_adapter_cb_register(uint8_t id, + uint16_t eth_dev_id, + rte_event_eth_rx_adapter_cb_fn cb_fn, + void *cb_arg); + +#ifdef __cplusplus +} +#endif +#endif /* _RTE_EVENT_ETH_RX_ADAPTER_ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.c b/src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.c new file mode 100644 index 00000000..16d02a95 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.c @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include "rte_event_ring.h" + +TAILQ_HEAD(rte_event_ring_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_event_ring_tailq = { + .name = RTE_TAILQ_EVENT_RING_NAME, +}; +EAL_REGISTER_TAILQ(rte_event_ring_tailq) + +int +rte_event_ring_init(struct rte_event_ring *r, const char *name, + unsigned int count, unsigned int flags) +{ + /* compilation-time checks */ + RTE_BUILD_BUG_ON((sizeof(struct rte_event_ring) & + RTE_CACHE_LINE_MASK) != 0); + + /* init the ring structure */ + return rte_ring_init(&r->r, name, count, flags); +} + +/* create the ring */ +struct rte_event_ring * +rte_event_ring_create(const char *name, unsigned int count, int socket_id, + unsigned int flags) +{ + char mz_name[RTE_MEMZONE_NAMESIZE]; + struct rte_event_ring *r; + struct rte_tailq_entry *te; + const struct rte_memzone *mz; + ssize_t ring_size; + int mz_flags = 0; + struct rte_event_ring_list *ring_list = NULL; + const unsigned int requested_count = count; + int ret; + + ring_list = RTE_TAILQ_CAST(rte_event_ring_tailq.head, + rte_event_ring_list); + + /* for an exact size ring, round up from count to a power of two */ + if (flags & RING_F_EXACT_SZ) + count = rte_align32pow2(count + 1); + else if (!rte_is_power_of_2(count)) { + rte_errno = EINVAL; + return NULL; + } + + ring_size = sizeof(*r) + (count * sizeof(struct rte_event)); + + ret = snprintf(mz_name, sizeof(mz_name), "%s%s", + RTE_RING_MZ_PREFIX, name); + if (ret < 0 || ret >= (int)sizeof(mz_name)) { + rte_errno = ENAMETOOLONG; + return NULL; + } + + te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, RING, "Cannot reserve memory for tailq\n"); + rte_errno = ENOMEM; + return NULL; + } + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* + * reserve a memory zone for this ring. 
If we can't get rte_config or + * we are secondary process, the memzone_reserve function will set + * rte_errno for us appropriately - hence no check in this this function + */ + mz = rte_memzone_reserve(mz_name, ring_size, socket_id, mz_flags); + if (mz != NULL) { + r = mz->addr; + /* Check return value in case rte_ring_init() fails on size */ + int err = rte_event_ring_init(r, name, requested_count, flags); + if (err) { + RTE_LOG(ERR, RING, "Ring init failed\n"); + if (rte_memzone_free(mz) != 0) + RTE_LOG(ERR, RING, "Cannot free memzone\n"); + rte_free(te); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return NULL; + } + + te->data = (void *) r; + r->r.memzone = mz; + + TAILQ_INSERT_TAIL(ring_list, te, next); + } else { + r = NULL; + RTE_LOG(ERR, RING, "Cannot reserve memory\n"); + rte_free(te); + } + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return r; +} + + +struct rte_event_ring * +rte_event_ring_lookup(const char *name) +{ + struct rte_tailq_entry *te; + struct rte_event_ring *r = NULL; + struct rte_event_ring_list *ring_list; + + ring_list = RTE_TAILQ_CAST(rte_event_ring_tailq.head, + rte_event_ring_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + + TAILQ_FOREACH(te, ring_list, next) { + r = (struct rte_event_ring *) te->data; + if (strncmp(name, r->r.name, RTE_RING_NAMESIZE) == 0) + break; + } + + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return r; +} + +/* free the ring */ +void +rte_event_ring_free(struct rte_event_ring *r) +{ + struct rte_event_ring_list *ring_list = NULL; + struct rte_tailq_entry *te; + + if (r == NULL) + return; + + /* + * Ring was not created with rte_event_ring_create, + * therefore, there is no memzone to free. + */ + if (r->r.memzone == NULL) { + RTE_LOG(ERR, RING, + "Cannot free ring (not created with rte_event_ring_create()"); + return; + } + + if (rte_memzone_free(r->r.memzone) != 0) { + RTE_LOG(ERR, RING, "Cannot free memory\n"); + return; + } + + ring_list = RTE_TAILQ_CAST(rte_event_ring_tailq.head, + rte_event_ring_list); + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find out tailq entry */ + TAILQ_FOREACH(te, ring_list, next) { + if (te->data == (void *) r) + break; + } + + if (te == NULL) { + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return; + } + + TAILQ_REMOVE(ring_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_free(te); +} diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.h b/src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.h new file mode 100644 index 00000000..827a3209 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_ring.h @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2017 Intel Corporation + */ + +/** + * @file + * RTE Event Ring + * + * This provides a ring implementation for passing rte_event structures + * from one core to another. + */ + +#ifndef _RTE_EVENT_RING_ +#define _RTE_EVENT_RING_ + +#include + +#include +#include +#include +#include +#include "rte_eventdev.h" + +#define RTE_TAILQ_EVENT_RING_NAME "RTE_EVENT_RING" + +/** + * Generic ring structure for passing rte_event objects from core to core. + * + * Based on the primitives given in the rte_ring library. Designed to be + * used inside software eventdev implementations and by applications + * directly as needed. 
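As a quick orientation before the individual functions below, a hedged sketch of the ring lifecycle: create a named ring (single producer/consumer here), find it again by name from elsewhere in the application, and free it. The ring name, size and function name are illustrative:

	#include <rte_memory.h>
	#include <rte_ring.h>
	#include <rte_event_ring.h>

	static struct rte_event_ring *
	evring_lifecycle_sketch(void)
	{
		struct rte_event_ring *r, *found;

		r = rte_event_ring_create("app_evring", 4096, SOCKET_ID_ANY,
					  RING_F_SP_ENQ | RING_F_SC_DEQ);
		if (r == NULL)
			return NULL;	/* rte_errno says why */

		/* Lookup by the same name returns the same object. */
		found = rte_event_ring_lookup("app_evring");
		if (found != r) {
			rte_event_ring_free(r);
			return NULL;
		}

		return r;	/* caller frees with rte_event_ring_free() */
	}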
+ */ +struct rte_event_ring { + struct rte_ring r; +}; + +/** + * Returns the number of events in the ring + * + * @param r + * pointer to the event ring + * @return + * the number of events in the ring + */ +static __rte_always_inline unsigned int +rte_event_ring_count(const struct rte_event_ring *r) +{ + return rte_ring_count(&r->r); +} + +/** + * Returns the amount of free space in the ring + * + * @param r + * pointer to the event ring + * @return + * the number of free slots in the ring, i.e. the number of events that + * can be successfully enqueued before dequeue must be called + */ +static __rte_always_inline unsigned int +rte_event_ring_free_count(const struct rte_event_ring *r) +{ + return rte_ring_free_count(&r->r); +} + +/** + * Enqueue a set of events onto a ring + * + * Note: this API enqueues by copying the events themselves onto the ring, + * rather than just placing a pointer to each event onto the ring. This + * means that statically-allocated events can safely be enqueued by this + * API. + * + * @param r + * pointer to the event ring + * @param events + * pointer to an array of struct rte_event objects + * @param n + * number of events in the array to enqueue + * @param free_space + * if non-null, is updated to indicate the amount of free space in the + * ring once the enqueue has completed. + * @return + * the number of elements, n', enqueued to the ring, 0 <= n' <= n + */ +static __rte_always_inline unsigned int +rte_event_ring_enqueue_burst(struct rte_event_ring *r, + const struct rte_event *events, + unsigned int n, uint16_t *free_space) +{ + uint32_t prod_head, prod_next; + uint32_t free_entries; + + n = __rte_ring_move_prod_head(&r->r, r->r.prod.single, n, + RTE_RING_QUEUE_VARIABLE, + &prod_head, &prod_next, &free_entries); + if (n == 0) + goto end; + + ENQUEUE_PTRS(&r->r, &r[1], prod_head, events, n, struct rte_event); + + update_tail(&r->r.prod, prod_head, prod_next, r->r.prod.single, 1); +end: + if (free_space != NULL) + *free_space = free_entries - n; + return n; +} + +/** + * Dequeue a set of events from a ring + * + * Note: this API does not work with pointers to events, rather it copies + * the events themselves to the destination ``events`` buffer. + * + * @param r + * pointer to the event ring + * @param events + * pointer to an array to hold the struct rte_event objects + * @param n + * number of events that can be held in the ``events`` array + * @param available + * if non-null, is updated to indicate the number of events remaining in + * the ring once the dequeue has completed + * @return + * the number of elements, n', dequeued from the ring, 0 <= n' <= n + */ +static __rte_always_inline unsigned int +rte_event_ring_dequeue_burst(struct rte_event_ring *r, + struct rte_event *events, + unsigned int n, uint16_t *available) +{ + uint32_t cons_head, cons_next; + uint32_t entries; + + n = __rte_ring_move_cons_head(&r->r, r->r.cons.single, n, + RTE_RING_QUEUE_VARIABLE, + &cons_head, &cons_next, &entries); + if (n == 0) + goto end; + + DEQUEUE_PTRS(&r->r, &r[1], cons_head, events, n, struct rte_event); + + update_tail(&r->r.cons, cons_head, cons_next, r->r.cons.single, 0); + +end: + if (available != NULL) + *available = entries - n; + return n; +} + +/* + * Initializes an already-allocated ring structure + * + * @param r + * pointer to the ring memory to be initialized + * @param name + * name to be given to the ring + * @param count + * the number of elements to be stored in the ring. 
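The copy semantics called out above (events are copied into and out of the ring rather than referenced) mean that short-lived, stack-allocated rte_event arrays are safe on both sides. A small producer/consumer sketch, assuming r was created as shown earlier; payload values are illustrative:

	#include <string.h>
	#include <rte_common.h>
	#include <rte_eventdev.h>
	#include <rte_event_ring.h>

	static void
	evring_burst_sketch(struct rte_event_ring *r)
	{
		struct rte_event tx[8], rx[8];	/* stack storage is fine: copied */
		uint16_t space, avail;
		unsigned int i, n;

		memset(tx, 0, sizeof(tx));
		for (i = 0; i < RTE_DIM(tx); i++) {
			tx[i].queue_id = 0;
			tx[i].sched_type = RTE_SCHED_TYPE_ATOMIC;
			tx[i].u64 = i;		/* any 64-bit payload */
		}

		/* n may be < 8 if the ring was nearly full; space reports
		 * the free slots remaining after the enqueue.
		 */
		n = rte_event_ring_enqueue_burst(r, tx, RTE_DIM(tx), &space);

		n = rte_event_ring_dequeue_burst(r, rx, RTE_DIM(rx), &avail);
		for (i = 0; i < n; i++)
			(void)rx[i].u64;	/* process the copied-out events */
	}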
If the flag + * ``RING_F_EXACT_SZ`` is not set, this must be a power of 2, and the actual + * usable space in the ring will be ``count - 1`` entries. If the flag + * ``RING_F_EXACT_SZ`` is set, the this can be any value up to the ring size + * limit - 1, and the usable space will be exactly that requested. + * @param flags + * An OR of the following: + * - RING_F_SP_ENQ: If this flag is set, the default behavior when + * using ``rte_ring_enqueue()`` or ``rte_ring_enqueue_bulk()`` + * is "single-producer". Otherwise, it is "multi-producers". + * - RING_F_SC_DEQ: If this flag is set, the default behavior when + * using ``rte_ring_dequeue()`` or ``rte_ring_dequeue_bulk()`` + * is "single-consumer". Otherwise, it is "multi-consumers". + * - RING_F_EXACT_SZ: If this flag is set, the ``count`` parameter is to + * be taken as the exact usable size of the ring, and as such does not + * need to be a power of 2. The underlying ring memory should be a + * power-of-2 size greater than the count value. + * @return + * 0 on success, or a negative value on error. + */ +int +rte_event_ring_init(struct rte_event_ring *r, const char *name, + unsigned int count, unsigned int flags); + +/* + * Create an event ring structure + * + * This function allocates memory and initializes an event ring inside that + * memory. + * + * @param name + * name to be given to the ring + * @param count + * the number of elements to be stored in the ring. If the flag + * ``RING_F_EXACT_SZ`` is not set, this must be a power of 2, and the actual + * usable space in the ring will be ``count - 1`` entries. If the flag + * ``RING_F_EXACT_SZ`` is set, the this can be any value up to the ring size + * limit - 1, and the usable space will be exactly that requested. + * @param socket_id + * The *socket_id* argument is the socket identifier in case of + * NUMA. The value can be *SOCKET_ID_ANY* if there is no NUMA + * constraint for the reserved zone. + * @param flags + * An OR of the following: + * - RING_F_SP_ENQ: If this flag is set, the default behavior when + * using ``rte_ring_enqueue()`` or ``rte_ring_enqueue_bulk()`` + * is "single-producer". Otherwise, it is "multi-producers". + * - RING_F_SC_DEQ: If this flag is set, the default behavior when + * using ``rte_ring_dequeue()`` or ``rte_ring_dequeue_bulk()`` + * is "single-consumer". Otherwise, it is "multi-consumers". + * - RING_F_EXACT_SZ: If this flag is set, the ``count`` parameter is to + * be taken as the exact usable size of the ring, and as such does not + * need to be a power of 2. The underlying ring memory should be a + * power-of-2 size greater than the count value. + * @return + * On success, the pointer to the new allocated ring. NULL on error with + * rte_errno set appropriately. Possible errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - count provided is not a power of 2 + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_event_ring * +rte_event_ring_create(const char *name, unsigned int count, int socket_id, + unsigned int flags); + +/** + * Search for an event ring based on its name + * + * @param name + * The name of the ring. + * @return + * The pointer to the ring matching the name, or NULL if not found, + * with rte_errno set appropriately. 
Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_event_ring * +rte_event_ring_lookup(const char *name); + +/** + * De-allocate all memory used by the ring. + * + * @param r + * Ring to free + */ +void +rte_event_ring_free(struct rte_event_ring *r); + +/** + * Return the size of the event ring. + * + * @param r + * A pointer to the ring structure. + * @return + * The size of the data store used by the ring. + * NOTE: this is not the same as the usable space in the ring. To query that + * use ``rte_ring_get_capacity()``. + */ +static inline unsigned int +rte_event_ring_get_size(const struct rte_event_ring *r) +{ + return rte_ring_get_size(&r->r); +} + +/** + * Return the number of elements which can be stored in the event ring. + * + * @param r + * A pointer to the ring structure. + * @return + * The usable size of the ring. + */ +static inline unsigned int +rte_event_ring_get_capacity(const struct rte_event_ring *r) +{ + return rte_ring_get_capacity(&r->r); +} +#endif diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter.c b/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter.c new file mode 100644 index 00000000..79070d48 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter.c @@ -0,0 +1,1299 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation. + * All rights reserved. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_eventdev.h" +#include "rte_eventdev_pmd.h" +#include "rte_event_timer_adapter.h" +#include "rte_event_timer_adapter_pmd.h" + +#define DATA_MZ_NAME_MAX_LEN 64 +#define DATA_MZ_NAME_FORMAT "rte_event_timer_adapter_data_%d" + +static int evtim_logtype; +static int evtim_svc_logtype; +static int evtim_buffer_logtype; + +static struct rte_event_timer_adapter adapters[RTE_EVENT_TIMER_ADAPTER_NUM_MAX]; + +static const struct rte_event_timer_adapter_ops sw_event_adapter_timer_ops; + +#define EVTIM_LOG(level, logtype, ...) \ + rte_log(RTE_LOG_ ## level, logtype, \ + RTE_FMT("EVTIMER: %s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) \ + "\n", __func__, __LINE__, RTE_FMT_TAIL(__VA_ARGS__,))) + +#define EVTIM_LOG_ERR(...) EVTIM_LOG(ERR, evtim_logtype, __VA_ARGS__) + +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG +#define EVTIM_LOG_DBG(...) \ + EVTIM_LOG(DEBUG, evtim_logtype, __VA_ARGS__) +#define EVTIM_BUF_LOG_DBG(...) \ + EVTIM_LOG(DEBUG, evtim_buffer_logtype, __VA_ARGS__) +#define EVTIM_SVC_LOG_DBG(...) \ + EVTIM_LOG(DEBUG, evtim_svc_logtype, __VA_ARGS__) +#else +#define EVTIM_LOG_DBG(...) (void)0 +#define EVTIM_BUF_LOG_DBG(...) (void)0 +#define EVTIM_SVC_LOG_DBG(...) 
(void)0 +#endif + +static int +default_port_conf_cb(uint16_t id, uint8_t event_dev_id, uint8_t *event_port_id, + void *conf_arg) +{ + struct rte_event_timer_adapter *adapter; + struct rte_eventdev *dev; + struct rte_event_dev_config dev_conf; + struct rte_event_port_conf *port_conf, def_port_conf = {0}; + int started; + uint8_t port_id; + uint8_t dev_id; + int ret; + + RTE_SET_USED(event_dev_id); + + adapter = &adapters[id]; + dev = &rte_eventdevs[adapter->data->event_dev_id]; + dev_id = dev->data->dev_id; + dev_conf = dev->data->dev_conf; + + started = dev->data->dev_started; + if (started) + rte_event_dev_stop(dev_id); + + port_id = dev_conf.nb_event_ports; + dev_conf.nb_event_ports += 1; + ret = rte_event_dev_configure(dev_id, &dev_conf); + if (ret < 0) { + EVTIM_LOG_ERR("failed to configure event dev %u\n", dev_id); + if (started) + if (rte_event_dev_start(dev_id)) + return -EIO; + + return ret; + } + + if (conf_arg != NULL) + port_conf = conf_arg; + else { + port_conf = &def_port_conf; + ret = rte_event_port_default_conf_get(dev_id, port_id, + port_conf); + if (ret < 0) + return ret; + } + + ret = rte_event_port_setup(dev_id, port_id, port_conf); + if (ret < 0) { + EVTIM_LOG_ERR("failed to setup event port %u on event dev %u\n", + port_id, dev_id); + return ret; + } + + *event_port_id = port_id; + + if (started) + ret = rte_event_dev_start(dev_id); + + return ret; +} + +struct rte_event_timer_adapter * __rte_experimental +rte_event_timer_adapter_create(const struct rte_event_timer_adapter_conf *conf) +{ + return rte_event_timer_adapter_create_ext(conf, default_port_conf_cb, + NULL); +} + +struct rte_event_timer_adapter * __rte_experimental +rte_event_timer_adapter_create_ext( + const struct rte_event_timer_adapter_conf *conf, + rte_event_timer_adapter_port_conf_cb_t conf_cb, + void *conf_arg) +{ + uint16_t adapter_id; + struct rte_event_timer_adapter *adapter; + const struct rte_memzone *mz; + char mz_name[DATA_MZ_NAME_MAX_LEN]; + int n, ret; + struct rte_eventdev *dev; + + if (conf == NULL) { + rte_errno = EINVAL; + return NULL; + } + + /* Check eventdev ID */ + if (!rte_event_pmd_is_valid_dev(conf->event_dev_id)) { + rte_errno = EINVAL; + return NULL; + } + dev = &rte_eventdevs[conf->event_dev_id]; + + adapter_id = conf->timer_adapter_id; + + /* Check that adapter_id is in range */ + if (adapter_id >= RTE_EVENT_TIMER_ADAPTER_NUM_MAX) { + rte_errno = EINVAL; + return NULL; + } + + /* Check adapter ID not already allocated */ + adapter = &adapters[adapter_id]; + if (adapter->allocated) { + rte_errno = EEXIST; + return NULL; + } + + /* Create shared data area. 
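+	 * The memzone name encodes the adapter id (DATA_MZ_NAME_FORMAT),
+	 * which is what lets rte_event_timer_adapter_lookup() find this
+	 * shared data again, e.g. from a secondary process.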
*/ + n = snprintf(mz_name, sizeof(mz_name), DATA_MZ_NAME_FORMAT, adapter_id); + if (n >= (int)sizeof(mz_name)) { + rte_errno = EINVAL; + return NULL; + } + mz = rte_memzone_reserve(mz_name, + sizeof(struct rte_event_timer_adapter_data), + conf->socket_id, 0); + if (mz == NULL) + /* rte_errno set by rte_memzone_reserve */ + return NULL; + + adapter->data = mz->addr; + memset(adapter->data, 0, sizeof(struct rte_event_timer_adapter_data)); + + adapter->data->mz = mz; + adapter->data->event_dev_id = conf->event_dev_id; + adapter->data->id = adapter_id; + adapter->data->socket_id = conf->socket_id; + adapter->data->conf = *conf; /* copy conf structure */ + + /* Query eventdev PMD for timer adapter capabilities and ops */ + ret = dev->dev_ops->timer_adapter_caps_get(dev, + adapter->data->conf.flags, + &adapter->data->caps, + &adapter->ops); + if (ret < 0) { + rte_errno = ret; + goto free_memzone; + } + + if (!(adapter->data->caps & + RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT)) { + FUNC_PTR_OR_NULL_RET_WITH_ERRNO(conf_cb, -EINVAL); + ret = conf_cb(adapter->data->id, adapter->data->event_dev_id, + &adapter->data->event_port_id, conf_arg); + if (ret < 0) { + rte_errno = ret; + goto free_memzone; + } + } + + /* If eventdev PMD did not provide ops, use default software + * implementation. + */ + if (adapter->ops == NULL) + adapter->ops = &sw_event_adapter_timer_ops; + + /* Allow driver to do some setup */ + FUNC_PTR_OR_NULL_RET_WITH_ERRNO(adapter->ops->init, -ENOTSUP); + ret = adapter->ops->init(adapter); + if (ret < 0) { + rte_errno = ret; + goto free_memzone; + } + + /* Set fast-path function pointers */ + adapter->arm_burst = adapter->ops->arm_burst; + adapter->arm_tmo_tick_burst = adapter->ops->arm_tmo_tick_burst; + adapter->cancel_burst = adapter->ops->cancel_burst; + + adapter->allocated = 1; + + return adapter; + +free_memzone: + rte_memzone_free(adapter->data->mz); + return NULL; +} + +int __rte_experimental +rte_event_timer_adapter_get_info(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_info *adapter_info) +{ + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + + if (adapter->ops->get_info) + /* let driver set values it knows */ + adapter->ops->get_info(adapter, adapter_info); + + /* Set common values */ + adapter_info->conf = adapter->data->conf; + adapter_info->event_dev_port_id = adapter->data->event_port_id; + adapter_info->caps = adapter->data->caps; + + return 0; +} + +int __rte_experimental +rte_event_timer_adapter_start(const struct rte_event_timer_adapter *adapter) +{ + int ret; + + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + FUNC_PTR_OR_ERR_RET(adapter->ops->start, -EINVAL); + + ret = adapter->ops->start(adapter); + if (ret < 0) + return ret; + + adapter->data->started = 1; + + return 0; +} + +int __rte_experimental +rte_event_timer_adapter_stop(const struct rte_event_timer_adapter *adapter) +{ + int ret; + + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + FUNC_PTR_OR_ERR_RET(adapter->ops->stop, -EINVAL); + + if (adapter->data->started == 0) { + EVTIM_LOG_ERR("event timer adapter %"PRIu8" already stopped", + adapter->data->id); + return 0; + } + + ret = adapter->ops->stop(adapter); + if (ret < 0) + return ret; + + adapter->data->started = 0; + + return 0; +} + +struct rte_event_timer_adapter * __rte_experimental +rte_event_timer_adapter_lookup(uint16_t adapter_id) +{ + char name[DATA_MZ_NAME_MAX_LEN]; + const struct rte_memzone *mz; + struct rte_event_timer_adapter_data *data; + struct rte_event_timer_adapter *adapter; + int ret; + struct rte_eventdev 
*dev; + + if (adapters[adapter_id].allocated) + return &adapters[adapter_id]; /* Adapter is already loaded */ + + snprintf(name, DATA_MZ_NAME_MAX_LEN, DATA_MZ_NAME_FORMAT, adapter_id); + mz = rte_memzone_lookup(name); + if (mz == NULL) { + rte_errno = ENOENT; + return NULL; + } + + data = mz->addr; + + adapter = &adapters[data->id]; + adapter->data = data; + + dev = &rte_eventdevs[adapter->data->event_dev_id]; + + /* Query eventdev PMD for timer adapter capabilities and ops */ + ret = dev->dev_ops->timer_adapter_caps_get(dev, + adapter->data->conf.flags, + &adapter->data->caps, + &adapter->ops); + if (ret < 0) { + rte_errno = EINVAL; + return NULL; + } + + /* If eventdev PMD did not provide ops, use default software + * implementation. + */ + if (adapter->ops == NULL) + adapter->ops = &sw_event_adapter_timer_ops; + + /* Set fast-path function pointers */ + adapter->arm_burst = adapter->ops->arm_burst; + adapter->arm_tmo_tick_burst = adapter->ops->arm_tmo_tick_burst; + adapter->cancel_burst = adapter->ops->cancel_burst; + + adapter->allocated = 1; + + return adapter; +} + +int __rte_experimental +rte_event_timer_adapter_free(struct rte_event_timer_adapter *adapter) +{ + int ret; + + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + FUNC_PTR_OR_ERR_RET(adapter->ops->uninit, -EINVAL); + + if (adapter->data->started == 1) { + EVTIM_LOG_ERR("event timer adapter %"PRIu8" must be stopped " + "before freeing", adapter->data->id); + return -EBUSY; + } + + /* free impl priv data */ + ret = adapter->ops->uninit(adapter); + if (ret < 0) + return ret; + + /* free shared data area */ + ret = rte_memzone_free(adapter->data->mz); + if (ret < 0) + return ret; + + adapter->data = NULL; + adapter->allocated = 0; + + return 0; +} + +int __rte_experimental +rte_event_timer_adapter_service_id_get(struct rte_event_timer_adapter *adapter, + uint32_t *service_id) +{ + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + + if (adapter->data->service_inited && service_id != NULL) + *service_id = adapter->data->service_id; + + return adapter->data->service_inited ? 0 : -ESRCH; +} + +int __rte_experimental +rte_event_timer_adapter_stats_get(struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_stats *stats) +{ + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + FUNC_PTR_OR_ERR_RET(adapter->ops->stats_get, -EINVAL); + if (stats == NULL) + return -EINVAL; + + return adapter->ops->stats_get(adapter, stats); +} + +int __rte_experimental +rte_event_timer_adapter_stats_reset(struct rte_event_timer_adapter *adapter) +{ + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + FUNC_PTR_OR_ERR_RET(adapter->ops->stats_reset, -EINVAL); + return adapter->ops->stats_reset(adapter); +} + +/* + * Software event timer adapter buffer helper functions + */ + +#define NSECPERSEC 1E9 + +/* Optimizations used to index into the buffer require that the buffer size + * be a power of 2. 
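+ * With EVENT_BUFFER_SZ = 4096, EVENT_BUFFER_MASK is 0xfff, so
+ * (head & EVENT_BUFFER_MASK) selects the same slot as head % 4096 without
+ * a divide; head and tail are free-running counters and only their masked
+ * values are used to index the events array.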
+ */ +#define EVENT_BUFFER_SZ 4096 +#define EVENT_BUFFER_BATCHSZ 32 +#define EVENT_BUFFER_MASK (EVENT_BUFFER_SZ - 1) + +struct event_buffer { + uint16_t head; + uint16_t tail; + struct rte_event events[EVENT_BUFFER_SZ]; +} __rte_cache_aligned; + +static inline bool +event_buffer_full(struct event_buffer *bufp) +{ + return (bufp->head - bufp->tail) == EVENT_BUFFER_SZ; +} + +static inline bool +event_buffer_batch_ready(struct event_buffer *bufp) +{ + return (bufp->head - bufp->tail) >= EVENT_BUFFER_BATCHSZ; +} + +static void +event_buffer_init(struct event_buffer *bufp) +{ + bufp->head = bufp->tail = 0; + memset(&bufp->events, 0, sizeof(struct rte_event) * EVENT_BUFFER_SZ); +} + +static int +event_buffer_add(struct event_buffer *bufp, struct rte_event *eventp) +{ + uint16_t head_idx; + struct rte_event *buf_eventp; + + if (event_buffer_full(bufp)) + return -1; + + /* Instead of modulus, bitwise AND with mask to get head_idx. */ + head_idx = bufp->head & EVENT_BUFFER_MASK; + buf_eventp = &bufp->events[head_idx]; + rte_memcpy(buf_eventp, eventp, sizeof(struct rte_event)); + + /* Wrap automatically when overflow occurs. */ + bufp->head++; + + return 0; +} + +static void +event_buffer_flush(struct event_buffer *bufp, uint8_t dev_id, uint8_t port_id, + uint16_t *nb_events_flushed, + uint16_t *nb_events_inv) +{ + uint16_t head_idx, tail_idx, n = 0; + struct rte_event *events = bufp->events; + + /* Instead of modulus, bitwise AND with mask to get index. */ + head_idx = bufp->head & EVENT_BUFFER_MASK; + tail_idx = bufp->tail & EVENT_BUFFER_MASK; + + /* Determine the largest contigous run we can attempt to enqueue to the + * event device. + */ + if (head_idx > tail_idx) + n = head_idx - tail_idx; + else if (head_idx < tail_idx) + n = EVENT_BUFFER_SZ - tail_idx; + else { + *nb_events_flushed = 0; + return; + } + + *nb_events_inv = 0; + *nb_events_flushed = rte_event_enqueue_burst(dev_id, port_id, + &events[tail_idx], n); + if (*nb_events_flushed != n && rte_errno == -EINVAL) { + EVTIM_LOG_ERR("failed to enqueue invalid event - dropping it"); + (*nb_events_inv)++; + } + + bufp->tail = bufp->tail + *nb_events_flushed + *nb_events_inv; +} + +/* + * Software event timer adapter implementation + */ + +struct rte_event_timer_adapter_sw_data { + /* List of messages for outstanding timers */ + TAILQ_HEAD(, msg) msgs_tailq_head; + /* Lock to guard tailq and armed count */ + rte_spinlock_t msgs_tailq_sl; + /* Identifier of service executing timer management logic. */ + uint32_t service_id; + /* The cycle count at which the adapter should next tick */ + uint64_t next_tick_cycles; + /* Incremented as the service moves through phases of an iteration */ + volatile int service_phase; + /* The tick resolution used by adapter instance. May have been + * adjusted from what user requested + */ + uint64_t timer_tick_ns; + /* Maximum timeout in nanoseconds allowed by adapter instance. */ + uint64_t max_tmo_ns; + /* Ring containing messages to arm or cancel event timers */ + struct rte_ring *msg_ring; + /* Mempool containing msg objects */ + struct rte_mempool *msg_pool; + /* Buffered timer expiry events to be enqueued to an event device. 
*/ + struct event_buffer buffer; + /* Statistics */ + struct rte_event_timer_adapter_stats stats; + /* The number of threads currently adding to the message ring */ + rte_atomic16_t message_producer_count; +}; + +enum msg_type {MSG_TYPE_ARM, MSG_TYPE_CANCEL}; + +struct msg { + enum msg_type type; + struct rte_event_timer *evtim; + struct rte_timer tim; + TAILQ_ENTRY(msg) msgs; +}; + +static void +sw_event_timer_cb(struct rte_timer *tim, void *arg) +{ + int ret; + uint16_t nb_evs_flushed = 0; + uint16_t nb_evs_invalid = 0; + uint64_t opaque; + struct rte_event_timer *evtim; + struct rte_event_timer_adapter *adapter; + struct rte_event_timer_adapter_sw_data *sw_data; + + evtim = arg; + opaque = evtim->impl_opaque[1]; + adapter = (struct rte_event_timer_adapter *)(uintptr_t)opaque; + sw_data = adapter->data->adapter_priv; + + ret = event_buffer_add(&sw_data->buffer, &evtim->ev); + if (ret < 0) { + /* If event buffer is full, put timer back in list with + * immediate expiry value, so that we process it again on the + * next iteration. + */ + rte_timer_reset_sync(tim, 0, SINGLE, rte_lcore_id(), + sw_event_timer_cb, evtim); + + sw_data->stats.evtim_retry_count++; + EVTIM_LOG_DBG("event buffer full, resetting rte_timer with " + "immediate expiry value"); + } else { + struct msg *m = container_of(tim, struct msg, tim); + TAILQ_REMOVE(&sw_data->msgs_tailq_head, m, msgs); + EVTIM_BUF_LOG_DBG("buffered an event timer expiry event"); + evtim->state = RTE_EVENT_TIMER_NOT_ARMED; + + /* Free the msg object containing the rte_timer now that + * we've buffered its event successfully. + */ + rte_mempool_put(sw_data->msg_pool, m); + + /* Bump the count when we successfully add an expiry event to + * the buffer. + */ + sw_data->stats.evtim_exp_count++; + } + + if (event_buffer_batch_ready(&sw_data->buffer)) { + event_buffer_flush(&sw_data->buffer, + adapter->data->event_dev_id, + adapter->data->event_port_id, + &nb_evs_flushed, + &nb_evs_invalid); + + sw_data->stats.ev_enq_count += nb_evs_flushed; + sw_data->stats.ev_inv_count += nb_evs_invalid; + } +} + +static __rte_always_inline uint64_t +get_timeout_cycles(struct rte_event_timer *evtim, + struct rte_event_timer_adapter *adapter) +{ + uint64_t timeout_ns; + struct rte_event_timer_adapter_sw_data *sw_data; + + sw_data = adapter->data->adapter_priv; + timeout_ns = evtim->timeout_ticks * sw_data->timer_tick_ns; + return timeout_ns * rte_get_timer_hz() / NSECPERSEC; + +} + +/* This function returns true if one or more (adapter) ticks have occurred since + * the last time it was called. + */ +static inline bool +adapter_did_tick(struct rte_event_timer_adapter *adapter) +{ + uint64_t cycles_per_adapter_tick, start_cycles; + uint64_t *next_tick_cyclesp; + struct rte_event_timer_adapter_sw_data *sw_data; + + sw_data = adapter->data->adapter_priv; + next_tick_cyclesp = &sw_data->next_tick_cycles; + + cycles_per_adapter_tick = sw_data->timer_tick_ns * + (rte_get_timer_hz() / NSECPERSEC); + + start_cycles = rte_get_timer_cycles(); + + /* Note: initially, *next_tick_cyclesp == 0, so the clause below will + * execute, and set things going. + */ + + if (start_cycles >= *next_tick_cyclesp) { + /* Snap the current cycle count to the preceding adapter tick + * boundary. 
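+		 * For example, with cycles_per_adapter_tick = 1000 and
+		 * start_cycles = 2750, start_cycles is snapped to 2000 and
+		 * *next_tick_cyclesp becomes 3000.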
+ */ + start_cycles -= start_cycles % cycles_per_adapter_tick; + + *next_tick_cyclesp = start_cycles + cycles_per_adapter_tick; + + return true; + } + + return false; +} + +/* Check that event timer timeout value is in range */ +static __rte_always_inline int +check_timeout(struct rte_event_timer *evtim, + const struct rte_event_timer_adapter *adapter) +{ + uint64_t tmo_nsec; + struct rte_event_timer_adapter_sw_data *sw_data; + + sw_data = adapter->data->adapter_priv; + tmo_nsec = evtim->timeout_ticks * sw_data->timer_tick_ns; + + if (tmo_nsec > sw_data->max_tmo_ns) + return -1; + + if (tmo_nsec < sw_data->timer_tick_ns) + return -2; + + return 0; +} + +/* Check that event timer event queue sched type matches destination event queue + * sched type + */ +static __rte_always_inline int +check_destination_event_queue(struct rte_event_timer *evtim, + const struct rte_event_timer_adapter *adapter) +{ + int ret; + uint32_t sched_type; + + ret = rte_event_queue_attr_get(adapter->data->event_dev_id, + evtim->ev.queue_id, + RTE_EVENT_QUEUE_ATTR_SCHEDULE_TYPE, + &sched_type); + + if ((ret < 0 && ret != -EOVERFLOW) || + evtim->ev.sched_type != sched_type) + return -1; + + return 0; +} + +#define NB_OBJS 32 +static int +sw_event_timer_adapter_service_func(void *arg) +{ + int i, num_msgs; + uint64_t cycles, opaque; + uint16_t nb_evs_flushed = 0; + uint16_t nb_evs_invalid = 0; + struct rte_event_timer_adapter *adapter; + struct rte_event_timer_adapter_sw_data *sw_data; + struct rte_event_timer *evtim = NULL; + struct rte_timer *tim = NULL; + struct msg *msg, *msgs[NB_OBJS]; + + adapter = arg; + sw_data = adapter->data->adapter_priv; + + sw_data->service_phase = 1; + rte_smp_wmb(); + + while (rte_atomic16_read(&sw_data->message_producer_count) > 0 || + !rte_ring_empty(sw_data->msg_ring)) { + + num_msgs = rte_ring_dequeue_burst(sw_data->msg_ring, + (void **)msgs, NB_OBJS, NULL); + + for (i = 0; i < num_msgs; i++) { + int ret = 0; + + RTE_SET_USED(ret); + + msg = msgs[i]; + evtim = msg->evtim; + + switch (msg->type) { + case MSG_TYPE_ARM: + EVTIM_SVC_LOG_DBG("dequeued ARM message from " + "ring"); + tim = &msg->tim; + rte_timer_init(tim); + cycles = get_timeout_cycles(evtim, + adapter); + ret = rte_timer_reset(tim, cycles, SINGLE, + rte_lcore_id(), + sw_event_timer_cb, + evtim); + RTE_ASSERT(ret == 0); + + evtim->impl_opaque[0] = (uintptr_t)tim; + evtim->impl_opaque[1] = (uintptr_t)adapter; + + TAILQ_INSERT_TAIL(&sw_data->msgs_tailq_head, + msg, + msgs); + break; + case MSG_TYPE_CANCEL: + EVTIM_SVC_LOG_DBG("dequeued CANCEL message " + "from ring"); + opaque = evtim->impl_opaque[0]; + tim = (struct rte_timer *)(uintptr_t)opaque; + RTE_ASSERT(tim != NULL); + + ret = rte_timer_stop(tim); + RTE_ASSERT(ret == 0); + + /* Free the msg object for the original arm + * request. 
+ */ + struct msg *m; + m = container_of(tim, struct msg, tim); + TAILQ_REMOVE(&sw_data->msgs_tailq_head, m, + msgs); + rte_mempool_put(sw_data->msg_pool, m); + + /* Free the msg object for the current msg */ + rte_mempool_put(sw_data->msg_pool, msg); + + evtim->impl_opaque[0] = 0; + evtim->impl_opaque[1] = 0; + + break; + } + } + } + + sw_data->service_phase = 2; + rte_smp_wmb(); + + if (adapter_did_tick(adapter)) { + rte_timer_manage(); + + event_buffer_flush(&sw_data->buffer, + adapter->data->event_dev_id, + adapter->data->event_port_id, + &nb_evs_flushed, &nb_evs_invalid); + + sw_data->stats.ev_enq_count += nb_evs_flushed; + sw_data->stats.ev_inv_count += nb_evs_invalid; + sw_data->stats.adapter_tick_count++; + } + + sw_data->service_phase = 0; + rte_smp_wmb(); + + return 0; +} + +/* The adapter initialization function rounds the mempool size up to the next + * power of 2, so we can take the difference between that value and what the + * user requested, and use the space for caches. This avoids a scenario where a + * user can't arm the number of timers the adapter was configured with because + * mempool objects have been lost to caches. + * + * nb_actual should always be a power of 2, so we can iterate over the powers + * of 2 to see what the largest cache size we can use is. + */ +static int +compute_msg_mempool_cache_size(uint64_t nb_requested, uint64_t nb_actual) +{ + int i; + int size; + int cache_size = 0; + + for (i = 0; ; i++) { + size = 1 << i; + + if (RTE_MAX_LCORE * size < (int)(nb_actual - nb_requested) && + size < RTE_MEMPOOL_CACHE_MAX_SIZE && + size <= nb_actual / 1.5) + cache_size = size; + else + break; + } + + return cache_size; +} + +#define SW_MIN_INTERVAL 1E5 + +static int +sw_event_timer_adapter_init(struct rte_event_timer_adapter *adapter) +{ + int ret; + struct rte_event_timer_adapter_sw_data *sw_data; + uint64_t nb_timers; + unsigned int flags; + struct rte_service_spec service; + static bool timer_subsystem_inited; // static initialized to false + + /* Allocate storage for SW implementation data */ + char priv_data_name[RTE_RING_NAMESIZE]; + snprintf(priv_data_name, RTE_RING_NAMESIZE, "sw_evtim_adap_priv_%"PRIu8, + adapter->data->id); + adapter->data->adapter_priv = rte_zmalloc_socket( + priv_data_name, + sizeof(struct rte_event_timer_adapter_sw_data), + RTE_CACHE_LINE_SIZE, + adapter->data->socket_id); + if (adapter->data->adapter_priv == NULL) { + EVTIM_LOG_ERR("failed to allocate space for private data"); + rte_errno = ENOMEM; + return -1; + } + + if (adapter->data->conf.timer_tick_ns < SW_MIN_INTERVAL) { + EVTIM_LOG_ERR("failed to create adapter with requested tick " + "interval"); + rte_errno = EINVAL; + return -1; + } + + sw_data = adapter->data->adapter_priv; + + sw_data->timer_tick_ns = adapter->data->conf.timer_tick_ns; + sw_data->max_tmo_ns = adapter->data->conf.max_tmo_ns; + + TAILQ_INIT(&sw_data->msgs_tailq_head); + rte_spinlock_init(&sw_data->msgs_tailq_sl); + rte_atomic16_init(&sw_data->message_producer_count); + + /* Rings require power of 2, so round up to next such value */ + nb_timers = rte_align64pow2(adapter->data->conf.nb_timers); + + char msg_ring_name[RTE_RING_NAMESIZE]; + snprintf(msg_ring_name, RTE_RING_NAMESIZE, + "sw_evtim_adap_msg_ring_%"PRIu8, adapter->data->id); + flags = adapter->data->conf.flags & RTE_EVENT_TIMER_ADAPTER_F_SP_PUT ? 
+ RING_F_SP_ENQ | RING_F_SC_DEQ : + RING_F_SC_DEQ; + sw_data->msg_ring = rte_ring_create(msg_ring_name, nb_timers, + adapter->data->socket_id, flags); + if (sw_data->msg_ring == NULL) { + EVTIM_LOG_ERR("failed to create message ring"); + rte_errno = ENOMEM; + goto free_priv_data; + } + + char pool_name[RTE_RING_NAMESIZE]; + snprintf(pool_name, RTE_RING_NAMESIZE, "sw_evtim_adap_msg_pool_%"PRIu8, + adapter->data->id); + + /* Both the arming/canceling thread and the service thread will do puts + * to the mempool, but if the SP_PUT flag is enabled, we can specify + * single-consumer get for the mempool. + */ + flags = adapter->data->conf.flags & RTE_EVENT_TIMER_ADAPTER_F_SP_PUT ? + MEMPOOL_F_SC_GET : 0; + + /* The usable size of a ring is count - 1, so subtract one here to + * make the counts agree. + */ + int pool_size = nb_timers - 1; + int cache_size = compute_msg_mempool_cache_size( + adapter->data->conf.nb_timers, nb_timers); + sw_data->msg_pool = rte_mempool_create(pool_name, pool_size, + sizeof(struct msg), cache_size, + 0, NULL, NULL, NULL, NULL, + adapter->data->socket_id, flags); + if (sw_data->msg_pool == NULL) { + EVTIM_LOG_ERR("failed to create message object mempool"); + rte_errno = ENOMEM; + goto free_msg_ring; + } + + event_buffer_init(&sw_data->buffer); + + /* Register a service component to run adapter logic */ + memset(&service, 0, sizeof(service)); + snprintf(service.name, RTE_SERVICE_NAME_MAX, + "sw_evimer_adap_svc_%"PRIu8, adapter->data->id); + service.socket_id = adapter->data->socket_id; + service.callback = sw_event_timer_adapter_service_func; + service.callback_userdata = adapter; + service.capabilities &= ~(RTE_SERVICE_CAP_MT_SAFE); + ret = rte_service_component_register(&service, &sw_data->service_id); + if (ret < 0) { + EVTIM_LOG_ERR("failed to register service %s with id %"PRIu32 + ": err = %d", service.name, sw_data->service_id, + ret); + + rte_errno = ENOSPC; + goto free_msg_pool; + } + + EVTIM_LOG_DBG("registered service %s with id %"PRIu32, service.name, + sw_data->service_id); + + adapter->data->service_id = sw_data->service_id; + adapter->data->service_inited = 1; + + if (!timer_subsystem_inited) { + rte_timer_subsystem_init(); + timer_subsystem_inited = true; + } + + return 0; + +free_msg_pool: + rte_mempool_free(sw_data->msg_pool); +free_msg_ring: + rte_ring_free(sw_data->msg_ring); +free_priv_data: + rte_free(sw_data); + return -1; +} + +static int +sw_event_timer_adapter_uninit(struct rte_event_timer_adapter *adapter) +{ + int ret; + struct msg *m1, *m2; + struct rte_event_timer_adapter_sw_data *sw_data = + adapter->data->adapter_priv; + + rte_spinlock_lock(&sw_data->msgs_tailq_sl); + + /* Cancel outstanding rte_timers and free msg objects */ + m1 = TAILQ_FIRST(&sw_data->msgs_tailq_head); + while (m1 != NULL) { + EVTIM_LOG_DBG("freeing outstanding timer"); + m2 = TAILQ_NEXT(m1, msgs); + + rte_timer_stop_sync(&m1->tim); + rte_mempool_put(sw_data->msg_pool, m1); + + m1 = m2; + } + + rte_spinlock_unlock(&sw_data->msgs_tailq_sl); + + ret = rte_service_component_unregister(sw_data->service_id); + if (ret < 0) { + EVTIM_LOG_ERR("failed to unregister service component"); + return ret; + } + + rte_ring_free(sw_data->msg_ring); + rte_mempool_free(sw_data->msg_pool); + rte_free(adapter->data->adapter_priv); + + return 0; +} + +static inline int32_t +get_mapped_count_for_service(uint32_t service_id) +{ + int32_t core_count, i, mapped_count = 0; + uint32_t lcore_arr[RTE_MAX_LCORE]; + + core_count = rte_service_lcore_list(lcore_arr, RTE_MAX_LCORE); + + for (i = 0; i 
< core_count; i++) + if (rte_service_map_lcore_get(service_id, lcore_arr[i]) == 1) + mapped_count++; + + return mapped_count; +} + +static int +sw_event_timer_adapter_start(const struct rte_event_timer_adapter *adapter) +{ + int mapped_count; + struct rte_event_timer_adapter_sw_data *sw_data; + + sw_data = adapter->data->adapter_priv; + + /* Mapping the service to more than one service core can introduce + * delays while one thread is waiting to acquire a lock, so only allow + * one core to be mapped to the service. + */ + mapped_count = get_mapped_count_for_service(sw_data->service_id); + + if (mapped_count == 1) + return rte_service_component_runstate_set(sw_data->service_id, + 1); + + return mapped_count < 1 ? -ENOENT : -ENOTSUP; +} + +static int +sw_event_timer_adapter_stop(const struct rte_event_timer_adapter *adapter) +{ + int ret; + struct rte_event_timer_adapter_sw_data *sw_data = + adapter->data->adapter_priv; + + ret = rte_service_component_runstate_set(sw_data->service_id, 0); + if (ret < 0) + return ret; + + /* Wait for the service to complete its final iteration before + * stopping. + */ + while (sw_data->service_phase != 0) + rte_pause(); + + rte_smp_rmb(); + + return 0; +} + +static void +sw_event_timer_adapter_get_info(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_info *adapter_info) +{ + struct rte_event_timer_adapter_sw_data *sw_data; + sw_data = adapter->data->adapter_priv; + + adapter_info->min_resolution_ns = sw_data->timer_tick_ns; + adapter_info->max_tmo_ns = sw_data->max_tmo_ns; +} + +static int +sw_event_timer_adapter_stats_get(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_stats *stats) +{ + struct rte_event_timer_adapter_sw_data *sw_data; + sw_data = adapter->data->adapter_priv; + *stats = sw_data->stats; + return 0; +} + +static int +sw_event_timer_adapter_stats_reset( + const struct rte_event_timer_adapter *adapter) +{ + struct rte_event_timer_adapter_sw_data *sw_data; + sw_data = adapter->data->adapter_priv; + memset(&sw_data->stats, 0, sizeof(sw_data->stats)); + return 0; +} + +static __rte_always_inline uint16_t +__sw_event_timer_arm_burst(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **evtims, + uint16_t nb_evtims) +{ + uint16_t i; + int ret; + struct rte_event_timer_adapter_sw_data *sw_data; + struct msg *msgs[nb_evtims]; + +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG + /* Check that the service is running. 
*/ + if (rte_service_runstate_get(adapter->data->service_id) != 1) { + rte_errno = EINVAL; + return 0; + } +#endif + + sw_data = adapter->data->adapter_priv; + + ret = rte_mempool_get_bulk(sw_data->msg_pool, (void **)msgs, nb_evtims); + if (ret < 0) { + rte_errno = ENOSPC; + return 0; + } + + /* Let the service know we're producing messages for it to process */ + rte_atomic16_inc(&sw_data->message_producer_count); + + /* If the service is managing timers, wait for it to finish */ + while (sw_data->service_phase == 2) + rte_pause(); + + rte_smp_rmb(); + + for (i = 0; i < nb_evtims; i++) { + /* Don't modify the event timer state in these cases */ + if (evtims[i]->state == RTE_EVENT_TIMER_ARMED) { + rte_errno = EALREADY; + break; + } else if (!(evtims[i]->state == RTE_EVENT_TIMER_NOT_ARMED || + evtims[i]->state == RTE_EVENT_TIMER_CANCELED)) { + rte_errno = EINVAL; + break; + } + + ret = check_timeout(evtims[i], adapter); + if (ret == -1) { + evtims[i]->state = RTE_EVENT_TIMER_ERROR_TOOLATE; + rte_errno = EINVAL; + break; + } + if (ret == -2) { + evtims[i]->state = RTE_EVENT_TIMER_ERROR_TOOEARLY; + rte_errno = EINVAL; + break; + } + + if (check_destination_event_queue(evtims[i], adapter) < 0) { + evtims[i]->state = RTE_EVENT_TIMER_ERROR; + rte_errno = EINVAL; + break; + } + + /* Checks passed, set up a message to enqueue */ + msgs[i]->type = MSG_TYPE_ARM; + msgs[i]->evtim = evtims[i]; + + /* Set the payload pointer if not set. */ + if (evtims[i]->ev.event_ptr == NULL) + evtims[i]->ev.event_ptr = evtims[i]; + + /* msg objects that get enqueued successfully will be freed + * either by a future cancel operation or by the timer + * expiration callback. + */ + if (rte_ring_enqueue(sw_data->msg_ring, msgs[i]) < 0) { + rte_errno = ENOSPC; + break; + } + + EVTIM_LOG_DBG("enqueued ARM message to ring"); + + evtims[i]->state = RTE_EVENT_TIMER_ARMED; + } + + /* Let the service know we're done producing messages */ + rte_atomic16_dec(&sw_data->message_producer_count); + + if (i < nb_evtims) + rte_mempool_put_bulk(sw_data->msg_pool, (void **)&msgs[i], + nb_evtims - i); + + return i; +} + +static uint16_t +sw_event_timer_arm_burst(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **evtims, + uint16_t nb_evtims) +{ + return __sw_event_timer_arm_burst(adapter, evtims, nb_evtims); +} + +static uint16_t +sw_event_timer_cancel_burst(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **evtims, + uint16_t nb_evtims) +{ + uint16_t i; + int ret; + struct rte_event_timer_adapter_sw_data *sw_data; + struct msg *msgs[nb_evtims]; + +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG + /* Check that the service is running. 
*/ + if (rte_service_runstate_get(adapter->data->service_id) != 1) { + rte_errno = EINVAL; + return 0; + } +#endif + + sw_data = adapter->data->adapter_priv; + + ret = rte_mempool_get_bulk(sw_data->msg_pool, (void **)msgs, nb_evtims); + if (ret < 0) { + rte_errno = ENOSPC; + return 0; + } + + /* Let the service know we're producing messages for it to process */ + rte_atomic16_inc(&sw_data->message_producer_count); + + /* If the service could be modifying event timer states, wait */ + while (sw_data->service_phase == 2) + rte_pause(); + + rte_smp_rmb(); + + for (i = 0; i < nb_evtims; i++) { + /* Don't modify the event timer state in these cases */ + if (evtims[i]->state == RTE_EVENT_TIMER_CANCELED) { + rte_errno = EALREADY; + break; + } else if (evtims[i]->state != RTE_EVENT_TIMER_ARMED) { + rte_errno = EINVAL; + break; + } + + msgs[i]->type = MSG_TYPE_CANCEL; + msgs[i]->evtim = evtims[i]; + + if (rte_ring_enqueue(sw_data->msg_ring, msgs[i]) < 0) { + rte_errno = ENOSPC; + break; + } + + EVTIM_LOG_DBG("enqueued CANCEL message to ring"); + + evtims[i]->state = RTE_EVENT_TIMER_CANCELED; + } + + /* Let the service know we're done producing messages */ + rte_atomic16_dec(&sw_data->message_producer_count); + + if (i < nb_evtims) + rte_mempool_put_bulk(sw_data->msg_pool, (void **)&msgs[i], + nb_evtims - i); + + return i; +} + +static uint16_t +sw_event_timer_arm_tmo_tick_burst(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **evtims, + uint64_t timeout_ticks, + uint16_t nb_evtims) +{ + int i; + + for (i = 0; i < nb_evtims; i++) + evtims[i]->timeout_ticks = timeout_ticks; + + return __sw_event_timer_arm_burst(adapter, evtims, nb_evtims); +} + +static const struct rte_event_timer_adapter_ops sw_event_adapter_timer_ops = { + .init = sw_event_timer_adapter_init, + .uninit = sw_event_timer_adapter_uninit, + .start = sw_event_timer_adapter_start, + .stop = sw_event_timer_adapter_stop, + .get_info = sw_event_timer_adapter_get_info, + .stats_get = sw_event_timer_adapter_stats_get, + .stats_reset = sw_event_timer_adapter_stats_reset, + .arm_burst = sw_event_timer_arm_burst, + .arm_tmo_tick_burst = sw_event_timer_arm_tmo_tick_burst, + .cancel_burst = sw_event_timer_cancel_burst, +}; + +RTE_INIT(event_timer_adapter_init_log) +{ + evtim_logtype = rte_log_register("lib.eventdev.adapter.timer"); + if (evtim_logtype >= 0) + rte_log_set_level(evtim_logtype, RTE_LOG_NOTICE); + + evtim_buffer_logtype = rte_log_register("lib.eventdev.adapter.timer." + "buffer"); + if (evtim_buffer_logtype >= 0) + rte_log_set_level(evtim_buffer_logtype, RTE_LOG_NOTICE); + + evtim_svc_logtype = rte_log_register("lib.eventdev.adapter.timer.svc"); + if (evtim_svc_logtype >= 0) + rte_log_set_level(evtim_svc_logtype, RTE_LOG_NOTICE); +} diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter.h b/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter.h new file mode 100644 index 00000000..d4ea6f17 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter.h @@ -0,0 +1,766 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc. + * Copyright(c) 2017-2018 Intel Corporation. + * All rights reserved. 
+ */ + +#ifndef __RTE_EVENT_TIMER_ADAPTER_H__ +#define __RTE_EVENT_TIMER_ADAPTER_H__ + +/** + * @file + * + * RTE Event Timer Adapter + * + * An event timer adapter has the following abstract working model: + * + * timer_tick_ns + * + + * +-------+ | + * | | | + * +-------+ bkt 0 +----v---+ + * | | | | + * | +-------+ | + * +---+---+ +---+---+ +---+---+---+---+ + * | | | | | | | | | + * | bkt n | | bkt 1 |<-> t0| t1| t2| tn| + * | | | | | | | | | + * +---+---+ +---+---+ +---+---+---+---+ + * | Timer adapter | + * +---+---+ +---+---+ + * | | | | + * | bkt 4 | | bkt 2 |<--- Current bucket + * | | | | + * +---+---+ +---+---+ + * | +-------+ | + * | | | | + * +------+ bkt 3 +-------+ + * | | + * +-------+ + * + * - It has a virtual monotonically increasing 64-bit timer adapter clock based + * on *enum rte_event_timer_adapter_clk_src* clock source. The clock source + * could be a CPU clock, or a platform dependent external clock. + * + * - The application creates a timer adapter instance with given the clock + * source, the total number of event timers, and a resolution(expressed in ns) + * to traverse between the buckets. + * + * - Each timer adapter may have 0 to n buckets based on the configured + * max timeout(max_tmo_ns) and resolution(timer_tick_ns). Upon starting the + * timer adapter, the adapter starts ticking at *timer_tick_ns* resolution. + * + * - The application arms an event timer that will expire *timer_tick_ns* + * from now. + * + * - The application can cancel an armed timer and no timer expiry event will be + * generated. + * + * - If a timer expires then the library injects the timer expiry event in + * the designated event queue. + * + * - The timer expiry event will be received through *rte_event_dequeue_burst*. + * + * - The application frees the timer adapter instance. + * + * Multiple timer adapters can be created with a varying level of resolution + * for various expiry use cases that run in parallel. + * + * Before using the timer adapter, the application has to create and configure + * an event device along with the event port. Based on the event device + * capability it might require creating an additional event port to be used + * by the timer adapter. + * + * The application creates the event timer adapter using the + * ``rte_event_timer_adapter_create()``. The event device id is passed to this + * function, inside this function the event device capability is checked, + * and if an in-built port is absent the application uses the default + * function to create a new producer port. + * + * The application may also use the function + * ``rte_event_timer_adapter_create_ext()`` to have granular control over + * producer port creation in a case where the in-built port is absent. + * + * After creating the timer adapter, the application has to start it + * using ``rte_event_timer_adapter_start()``. The buckets are traversed from + * 0 to n; when the adapter ticks, the next bucket is visited. Each time, + * the list per bucket is processed, and timer expiry events are sent to the + * designated event queue. + * + * The application can arm one or more event timers using the + * ``rte_event_timer_arm_burst()``. The *timeout_ticks* represents the number + * of *timer_tick_ns* after which the timer has to expire. The timeout at + * which the timers expire can be grouped or be independent of each + * event timer instance. ``rte_event_timer_arm_tmo_tick_burst()`` addresses the + * former case and ``rte_event_timer_arm_burst()`` addresses the latter case. 
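+ *
+ * A minimal sketch of the flow above (illustrative only: the event device
+ * id, adapter id, tick interval, queue id and timeout values are arbitrary
+ * examples, the event device is assumed to be configured already, and a
+ * real application would typically allocate its rte_event_timer objects
+ * from a mempool rather than the stack):
+ *
+ * @code
+ * static int
+ * create_and_arm_example(void)
+ * {
+ *     const struct rte_event_timer_adapter_conf conf = {
+ *         .event_dev_id = 0,
+ *         .timer_adapter_id = 0,
+ *         .clk_src = RTE_EVENT_TIMER_ADAPTER_CPU_CLK,
+ *         .timer_tick_ns = 10 * 1000 * 1000,         // 10 ms resolution
+ *         .max_tmo_ns = 10ULL * 1000 * 1000 * 1000,  // 10 s max timeout
+ *         .nb_timers = 1024,
+ *         .flags = RTE_EVENT_TIMER_ADAPTER_F_ADJUST_RES,
+ *     };
+ *     struct rte_event_timer_adapter *adapter;
+ *     struct rte_event_timer evtim;
+ *     struct rte_event_timer *evtims[] = { &evtim };
+ *
+ *     adapter = rte_event_timer_adapter_create(&conf);
+ *     if (adapter == NULL)
+ *         return -rte_errno;
+ *     if (rte_event_timer_adapter_start(adapter) < 0)
+ *         return -1;
+ *
+ *     memset(&evtim, 0, sizeof(evtim));
+ *     evtim.ev.op = RTE_EVENT_OP_NEW;
+ *     evtim.ev.event_type = RTE_EVENT_TYPE_TIMER;
+ *     evtim.ev.queue_id = 0;
+ *     evtim.ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
+ *     evtim.state = RTE_EVENT_TIMER_NOT_ARMED;
+ *     evtim.timeout_ticks = 50;   // expires ~50 * timer_tick_ns from now
+ *
+ *     // on success, the expiry event is later received through
+ *     // rte_event_dequeue_burst() on the destination queue
+ *     return rte_event_timer_arm_burst(adapter, evtims, 1) == 1 ?
+ *             0 : -rte_errno;
+ * }
+ * @endcode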
+ * + * The application can cancel the timers from expiring using the + * ``rte_event_timer_cancel_burst()``. + * + * On the secondary process, ``rte_event_timer_adapter_lookup()`` can be used + * to get the timer adapter pointer from its id and use it to invoke fastpath + * operations such as arm and cancel. + * + * Some of the use cases of event timer adapter are Beacon Timers, + * Generic SW Timeout, Wireless MAC Scheduling, 3G Frame Protocols, + * Packet Scheduling, Protocol Retransmission Timers, Supervision Timers. + * All these use cases require high resolution and low time drift. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "rte_eventdev.h" + +/** + * @warning + * @b EXPERIMENTAL: this enum may change without prior notice + * + * Timer adapter clock source + */ +enum rte_event_timer_adapter_clk_src { + RTE_EVENT_TIMER_ADAPTER_CPU_CLK, + /**< Use CPU clock as the clock source. */ + RTE_EVENT_TIMER_ADAPTER_EXT_CLK0, + /**< Platform dependent external clock source 0. */ + RTE_EVENT_TIMER_ADAPTER_EXT_CLK1, + /**< Platform dependent external clock source 1. */ + RTE_EVENT_TIMER_ADAPTER_EXT_CLK2, + /**< Platform dependent external clock source 2. */ + RTE_EVENT_TIMER_ADAPTER_EXT_CLK3, + /**< Platform dependent external clock source 3. */ +}; + +#define RTE_EVENT_TIMER_ADAPTER_F_ADJUST_RES (1ULL << 0) +/**< The event timer adapter implementation may have constraints on the + * resolution (timer_tick_ns) and maximum timer expiry timeout(max_tmo_ns) + * based on the given timer adapter or system. If this flag is set, the + * implementation adjusts the resolution and maximum timeout to the best + * possible configuration. On successful timer adapter creation, the + * application can get the configured resolution and max timeout with + * ``rte_event_timer_adapter_get_info()``. + * + * @see struct rte_event_timer_adapter_info::min_resolution_ns + * @see struct rte_event_timer_adapter_info::max_tmo_ns + */ +#define RTE_EVENT_TIMER_ADAPTER_F_SP_PUT (1ULL << 1) +/**< ``rte_event_timer_arm_burst()`` API to be used in single producer mode. + * + * @see struct rte_event_timer_adapter_conf::flags + */ + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * Timer adapter configuration structure + */ +struct rte_event_timer_adapter_conf { + uint8_t event_dev_id; + /**< Event device identifier */ + uint16_t timer_adapter_id; + /**< Event timer adapter identifier */ + uint32_t socket_id; + /**< Identifier of socket from which to allocate memory for adapter */ + enum rte_event_timer_adapter_clk_src clk_src; + /**< Clock source for timer adapter */ + uint64_t timer_tick_ns; + /**< Timer adapter resolution in ns */ + uint64_t max_tmo_ns; + /**< Maximum timer timeout(expiry) in ns */ + uint64_t nb_timers; + /**< Total number of timers per adapter */ + uint64_t flags; + /**< Timer adapter config flags (RTE_EVENT_TIMER_ADAPTER_F_*) */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * Event timer adapter stats structure + */ +struct rte_event_timer_adapter_stats { + uint64_t evtim_exp_count; + /**< Number of event timers that have expired. 
*/ + uint64_t ev_enq_count; + /**< Eventdev enqueue count */ + uint64_t ev_inv_count; + /**< Invalid expiry event count */ + uint64_t evtim_retry_count; + /**< Event timer retry count */ + uint64_t adapter_tick_count; + /**< Tick count for the adapter, at its resolution */ +}; + +struct rte_event_timer_adapter; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Callback function type for producer port creation. + */ +typedef int (*rte_event_timer_adapter_port_conf_cb_t)(uint16_t id, + uint8_t event_dev_id, + uint8_t *event_port_id, + void *conf_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create an event timer adapter. + * + * This function must be invoked first before any other function in the API. + * + * @param conf + * The event timer adapter configuration structure. + * + * @return + * A pointer to the new allocated event timer adapter on success. + * NULL on error with rte_errno set appropriately. + * Possible rte_errno values include: + * - ERANGE: timer_tick_ns is not in supported range. + * - ENOMEM: unable to allocate sufficient memory for adapter instances + * - EINVAL: invalid event device identifier specified in config + * - ENOSPC: maximum number of adapters already created + * - EIO: event device reconfiguration and restart error. The adapter + * reconfigures the event device with an additional port by default if it is + * required to use a service to manage timers. If the device had been started + * before this call, this error code indicates an error in restart following + * an error in reconfiguration, i.e., a combination of the two error codes. + */ +struct rte_event_timer_adapter * __rte_experimental +rte_event_timer_adapter_create(const struct rte_event_timer_adapter_conf *conf); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create a timer adapter with the supplied callback. + * + * This function can be used to have a more granular control over the timer + * adapter creation. If a built-in port is absent, then the function uses the + * callback provided to create and get the port id to be used as a producer + * port. + * + * @param conf + * The timer adapter configuration structure + * @param conf_cb + * The port config callback function. + * @param conf_arg + * Opaque pointer to the argument for the callback function + * + * @return + * A pointer to the new allocated event timer adapter on success. + * NULL on error with rte_errno set appropriately. + * Possible rte_errno values include: + * - ERANGE: timer_tick_ns is not in supported range. + * - ENOMEM: unable to allocate sufficient memory for adapter instances + * - EINVAL: invalid event device identifier specified in config + * - ENOSPC: maximum number of adapters already created + */ +struct rte_event_timer_adapter * __rte_experimental +rte_event_timer_adapter_create_ext( + const struct rte_event_timer_adapter_conf *conf, + rte_event_timer_adapter_port_conf_cb_t conf_cb, + void *conf_arg); + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * Timer adapter info structure. 
+ */ +struct rte_event_timer_adapter_info { + uint64_t min_resolution_ns; + /**< Minimum timer adapter resolution in ns */ + uint64_t max_tmo_ns; + /**< Maximum timer timeout(expire) in ns */ + struct rte_event_timer_adapter_conf conf; + /**< Configured timer adapter attributes */ + uint32_t caps; + /**< Event timer adapter capabilities */ + int16_t event_dev_port_id; + /**< Event device port ID, if applicable */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the contextual information of an event timer adapter. + * + * @param adapter + * A pointer to the event timer adapter structure. + * + * @param[out] adapter_info + * A pointer to a structure of type *rte_event_timer_adapter_info* to be + * filled with the contextual information of the adapter. + * + * @return + * - 0: Success, driver updates the contextual information of the + * timer adapter + * - <0: Error code returned by the driver info get function. + * - -EINVAL: adapter identifier invalid + * + * @see RTE_EVENT_TIMER_ADAPTER_F_ADJUST_RES, + * struct rte_event_timer_adapter_info + * + */ +int __rte_experimental +rte_event_timer_adapter_get_info( + const struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_info *adapter_info); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Start a timer adapter. + * + * The adapter start step is the last one and consists of setting the timer + * adapter to start accepting the timers and schedules to event queues. + * + * On success, all basic functions exported by the API (timer arm, + * timer cancel and so on) can be invoked. + * + * @param adapter + * A pointer to the event timer adapter structure. + * + * @return + * - 0: Success, adapter started. + * - <0: Error code returned by the driver start function. + * - -EINVAL if adapter identifier invalid + * - -ENOENT if software adapter but no service core mapped + * - -ENOTSUP if software adapter and more than one service core mapped + */ +int __rte_experimental +rte_event_timer_adapter_start( + const struct rte_event_timer_adapter *adapter); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Stop an event timer adapter. + * + * The adapter can be restarted with a call to + * ``rte_event_timer_adapter_start()``. + * + * @param adapter + * A pointer to the event timer adapter structure. + * + * @return + * - 0: Success, adapter stopped. + * - <0: Error code returned by the driver stop function. + * - -EINVAL if adapter identifier invalid + */ +int __rte_experimental +rte_event_timer_adapter_stop(const struct rte_event_timer_adapter *adapter); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Lookup an event timer adapter using its identifier. + * + * If an event timer adapter was created in another process with the same + * identifier, this function will locate its state and set up access to it + * so that it can be used in this process. + * + * @param adapter_id + * The event timer adapter identifier. + * + * @return + * A pointer to the event timer adapter matching the identifier on success. + * NULL on error with rte_errno set appropriately. + * Possible rte_errno values include: + * - ENOENT - requested entry not available to return. 
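+ *
+ * A minimal sketch (assuming the primary process has already created an
+ * adapter with id 0):
+ *
+ * @code
+ * struct rte_event_timer_adapter *adapter;
+ *
+ * adapter = rte_event_timer_adapter_lookup(0);
+ * if (adapter == NULL)
+ *     // rte_errno is ENOENT if no adapter with this id exists
+ *     rte_exit(EXIT_FAILURE, "event timer adapter 0 not found\n");
+ * @endcode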
+ */ +struct rte_event_timer_adapter * __rte_experimental +rte_event_timer_adapter_lookup(uint16_t adapter_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Free an event timer adapter. + * + * Destroy an event timer adapter, freeing all resources. + * + * Before invoking this function, the application must wait for all the + * armed timers to expire or cancel the outstanding armed timers. + * + * @param adapter + * A pointer to an event timer adapter structure. + * + * @return + * - 0: Successfully freed the event timer adapter resources. + * - <0: Failed to free the event timer adapter resources. + * - -EAGAIN: adapter is busy; timers outstanding + * - -EBUSY: stop hasn't been called for this adapter yet + * - -EINVAL: adapter id invalid, or adapter invalid + */ +int __rte_experimental +rte_event_timer_adapter_free(struct rte_event_timer_adapter *adapter); + +/** + * Retrieve the service ID of the event timer adapter. If the adapter doesn't + * use an rte_service function, this function returns -ESRCH. + * + * @param adapter + * A pointer to an event timer adapter. + * + * @param [out] service_id + * A pointer to a uint32_t, to be filled in with the service id. + * + * @return + * - 0: Success + * - <0: Error code on failure + * - -ESRCH: the adapter does not require a service to operate + */ +int __rte_experimental +rte_event_timer_adapter_service_id_get(struct rte_event_timer_adapter *adapter, + uint32_t *service_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve statistics for an event timer adapter instance. + * + * @param adapter + * A pointer to an event timer adapter structure. + * @param[out] stats + * A pointer to a structure to fill with statistics. + * + * @return + * - 0: Successfully retrieved. + * - <0: Failure; error code returned. + */ +int __rte_experimental +rte_event_timer_adapter_stats_get(struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_stats *stats); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reset statistics for an event timer adapter instance. + * + * @param adapter + * A pointer to an event timer adapter structure. + * + * @return + * - 0: Successfully reset; + * - <0: Failure; error code returned. + */ +int __rte_experimental rte_event_timer_adapter_stats_reset( + struct rte_event_timer_adapter *adapter); + +/** + * Retrieve the service ID of the event timer adapter. If the adapter doesn't + * use an rte_service function, this function returns -ESRCH. + * + * @param adapter + * A pointer to an event timer adapter. + * + * @param [out] service_id + * A pointer to a uint32_t, to be filled in with the service id. + * + * @return + * - 0: Success + * - <0: Error code on failure, if the event dev doesn't use a rte_service + * function, this function returns -ESRCH. + */ +int +rte_event_timer_adapter_service_id_get(struct rte_event_timer_adapter *adapter, + uint32_t *service_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve statistics for an event timer adapter instance. + * + * @param adapter + * A pointer to an event timer adapter structure. + * @param[out] stats + * A pointer to a structure to fill with statistics. + * + * @return + * - 0: Successfully retrieved. + * - <0: Failure; error code returned. 
+ */ +int rte_event_timer_adapter_stats_get(struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_stats *stats); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reset statistics for an event timer adapter instance. + * + * @param adapter + * A pointer to an event timer adapter structure. + * + * @return + * - 0: Successfully reset; + * - <0: Failure; error code returned. + */ +int rte_event_timer_adapter_stats_reset( + struct rte_event_timer_adapter *adapter); + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * Event timer state. + */ +enum rte_event_timer_state { + RTE_EVENT_TIMER_NOT_ARMED = 0, + /**< Event timer not armed. */ + RTE_EVENT_TIMER_ARMED = 1, + /**< Event timer successfully armed. */ + RTE_EVENT_TIMER_CANCELED = 2, + /**< Event timer successfully canceled. */ + RTE_EVENT_TIMER_ERROR = -1, + /**< Generic event timer error. */ + RTE_EVENT_TIMER_ERROR_TOOEARLY = -2, + /**< Event timer timeout tick value is too small for the adapter to + * handle, given its configured resolution. + */ + RTE_EVENT_TIMER_ERROR_TOOLATE = -3, + /**< Event timer timeout tick is greater than the maximum timeout.*/ +}; + +/** + * @warning + * @b EXPERIMENTAL: this structure may change without prior notice + * + * The generic *rte_event_timer* structure to hold the event timer attributes + * for arm and cancel operations. + */ +RTE_STD_C11 +struct rte_event_timer { + struct rte_event ev; + /**< + * Expiry event attributes. On successful event timer timeout, + * the following attributes will be used to inject the expiry event to + * the eventdev: + * - event_queue_id: Targeted event queue id for expiry events. + * - event_priority: Event priority of the event expiry event in the + * event queue relative to other events. + * - sched_type: Scheduling type of the expiry event. + * - flow_id: Flow id of the expiry event. + * - op: RTE_EVENT_OP_NEW + * - event_type: RTE_EVENT_TYPE_TIMER + */ + volatile enum rte_event_timer_state state; + /**< State of the event timer. */ + uint64_t timeout_ticks; + /**< Expiry timer ticks expressed in number of *timer_ticks_ns* from + * now. + * @see struct rte_event_timer_adapter_info::adapter_conf::timer_tick_ns + */ + uint64_t impl_opaque[2]; + /**< Implementation-specific opaque data. + * An event timer adapter implementation use this field to hold + * implementation specific values to share between the arm and cancel + * operations. The application should not modify this field. + */ + uint8_t user_meta[0]; + /**< Memory to store user specific metadata. + * The event timer adapter implementation should not modify this area. + */ +} __rte_cache_aligned; + +typedef uint16_t (*rte_event_timer_arm_burst_t)( + const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **tims, + uint16_t nb_tims); +/**< @internal Enable event timers to enqueue timer events upon expiry */ +typedef uint16_t (*rte_event_timer_arm_tmo_tick_burst_t)( + const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **tims, + uint64_t timeout_tick, + uint16_t nb_tims); +/**< @internal Enable event timers with common expiration time */ +typedef uint16_t (*rte_event_timer_cancel_burst_t)( + const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **tims, + uint16_t nb_tims); +/**< @internal Prevent event timers from enqueuing timer events */ + +/** + * @internal Data structure associated with each event timer adapter. 
+ */ +struct rte_event_timer_adapter { + rte_event_timer_arm_burst_t arm_burst; + /**< Pointer to driver arm_burst function. */ + rte_event_timer_arm_tmo_tick_burst_t arm_tmo_tick_burst; + /**< Pointer to driver arm_tmo_tick_burst function. */ + rte_event_timer_cancel_burst_t cancel_burst; + /**< Pointer to driver cancel function. */ + struct rte_event_timer_adapter_data *data; + /**< Pointer to shared adapter data */ + const struct rte_event_timer_adapter_ops *ops; + /**< Functions exported by adapter driver */ + + RTE_STD_C11 + uint8_t allocated : 1; + /**< Flag to indicate that this adapter has been allocated */ +} __rte_cache_aligned; + +#define ADAPTER_VALID_OR_ERR_RET(adapter, retval) do { \ + if (adapter == NULL || !adapter->allocated) \ + return retval; \ +} while (0) + +#define FUNC_PTR_OR_ERR_RET(func, errval) do { \ + if ((func) == NULL) \ + return errval; \ +} while (0) + +#define FUNC_PTR_OR_NULL_RET_WITH_ERRNO(func, errval) do { \ + if ((func) == NULL) { \ + rte_errno = errval; \ + return NULL; \ + } \ +} while (0) + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Arm a burst of event timers with separate expiration timeout tick for each + * event timer. + * + * Before calling this function, the application allocates + * ``struct rte_event_timer`` objects from mempool or huge page backed + * application buffers of desired size. On successful allocation, + * application updates the `struct rte_event_timer`` attributes such as + * expiry event attributes, timeout ticks from now. + * This function submits the event timer arm requests to the event timer adapter + * and on expiry, the events will be injected to designated event queue. + * + * @param adapter + * A pointer to an event timer adapter structure. + * @param evtims + * Pointer to an array of objects of type *rte_event_timer* structure. + * @param nb_evtims + * Number of event timers in the supplied array. + * + * @return + * The number of successfully armed event timers. The return value can be less + * than the value of the *nb_evtims* parameter. If the return value is less + * than *nb_evtims*, the remaining event timers at the end of *evtims* + * are not consumed, and the caller has to take care of them, and rte_errno + * is set accordingly. Possible errno values include: + * - EINVAL Invalid timer adapter, expiry event queue ID is invalid, or an + * expiry event's sched type doesn't match the capabilities of the + * destination event queue. + * - EAGAIN Specified timer adapter is not running + * - EALREADY A timer was encountered that was already armed + */ +static inline uint16_t __rte_experimental +rte_event_timer_arm_burst(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **evtims, + uint16_t nb_evtims) +{ +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + FUNC_PTR_OR_ERR_RET(adapter->arm_burst, -EINVAL); +#endif + return adapter->arm_burst(adapter, evtims, nb_evtims); +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Arm a burst of event timers with same expiration timeout tick. + * + * Provides the same functionality as ``rte_event_timer_arm_burst()``, except + * that application can use this API when all the event timers have the + * same timeout expiration tick. This specialized function can provide the + * additional hint to the adapter implementation and optimize if possible. + * + * @param adapter + * A pointer to an event timer adapter structure. 
+ * @param evtims + * Points to an array of objects of type *rte_event_timer* structure. + * @param timeout_ticks + * The number of ticks in which the timers should expire. + * @param nb_evtims + * Number of event timers in the supplied array. + * + * @return + * The number of successfully armed event timers. The return value can be less + * than the value of the *nb_evtims* parameter. If the return value is less + * than *nb_evtims*, the remaining event timers at the end of *evtims* + * are not consumed, and the caller has to take care of them, and rte_errno + * is set accordingly. Possible errno values include: + * - EINVAL Invalid timer adapter, expiry event queue ID is invalid, or an + * expiry event's sched type doesn't match the capabilities of the + * destination event queue. + * - EAGAIN Specified event timer adapter is not running + * - EALREADY A timer was encountered that was already armed + */ +static inline uint16_t __rte_experimental +rte_event_timer_arm_tmo_tick_burst( + const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **evtims, + const uint64_t timeout_ticks, + const uint16_t nb_evtims) +{ +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + FUNC_PTR_OR_ERR_RET(adapter->arm_tmo_tick_burst, -EINVAL); +#endif + return adapter->arm_tmo_tick_burst(adapter, evtims, timeout_ticks, + nb_evtims); +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Cancel a burst of event timers from being scheduled to the event device. + * + * @param adapter + * A pointer to an event timer adapter structure. + * @param evtims + * Points to an array of objects of type *rte_event_timer* structure + * @param nb_evtims + * Number of event timer instances in the supplied array. + * + * @return + * The number of successfully canceled event timers. The return value can be + * less than the value of the *nb_evtims* parameter. If the return value is + * less than *nb_evtims*, the remaining event timers at the end of *evtims* + * are not consumed, and the caller has to take care of them, and rte_errno + * is set accordingly. Possible errno values include: + * - EINVAL Invalid timer adapter identifier + * - EAGAIN Specified timer adapter is not running + * - EALREADY A timer was encountered that was already canceled + */ +static inline uint16_t __rte_experimental +rte_event_timer_cancel_burst(const struct rte_event_timer_adapter *adapter, + struct rte_event_timer **evtims, + uint16_t nb_evtims) +{ +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG + ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL); + FUNC_PTR_OR_ERR_RET(adapter->cancel_burst, -EINVAL); +#endif + return adapter->cancel_burst(adapter, evtims, nb_evtims); +} + +#endif /* __RTE_EVENT_TIMER_ADAPTER_H__ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter_pmd.h b/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter_pmd.h new file mode 100644 index 00000000..cf3509dc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_event_timer_adapter_pmd.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation. + * All rights reserved. + */ + +#ifndef __RTE_EVENT_TIMER_ADAPTER_PMD_H__ +#define __RTE_EVENT_TIMER_ADAPTER_PMD_H__ + +/** + * @file + * RTE Event Timer Adapter API (PMD Side) + * + * @note + * This file provides implementation helpers for internal use by PMDs. They + * are not intended to be exposed to applications and are not subject to ABI + * versioning. 
+ * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "rte_event_timer_adapter.h" + +/* + * Definitions of functions exported by an event timer adapter implementation + * through *rte_event_timer_adapter_ops* structure supplied in the + * *rte_event_timer_adapter* structure associated with an event timer adapter. + */ + +typedef int (*rte_event_timer_adapter_init_t)( + struct rte_event_timer_adapter *adapter); +/**< @internal Event timer adapter implementation setup */ +typedef int (*rte_event_timer_adapter_uninit_t)( + struct rte_event_timer_adapter *adapter); +/**< @internal Event timer adapter implementation teardown */ +typedef int (*rte_event_timer_adapter_start_t)( + const struct rte_event_timer_adapter *adapter); +/**< @internal Start running event timer adapter */ +typedef int (*rte_event_timer_adapter_stop_t)( + const struct rte_event_timer_adapter *adapter); +/**< @internal Stop running event timer adapter */ +typedef void (*rte_event_timer_adapter_get_info_t)( + const struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_info *adapter_info); +/**< @internal Get contextual information for event timer adapter */ +typedef int (*rte_event_timer_adapter_stats_get_t)( + const struct rte_event_timer_adapter *adapter, + struct rte_event_timer_adapter_stats *stats); +/**< @internal Get statistics for event timer adapter */ +typedef int (*rte_event_timer_adapter_stats_reset_t)( + const struct rte_event_timer_adapter *adapter); +/**< @internal Reset statistics for event timer adapter */ + +/** + * @internal Structure containing the functions exported by an event timer + * adapter implementation. + */ +struct rte_event_timer_adapter_ops { + rte_event_timer_adapter_init_t init; /**< Set up adapter */ + rte_event_timer_adapter_uninit_t uninit;/**< Tear down adapter */ + rte_event_timer_adapter_start_t start; /**< Start adapter */ + rte_event_timer_adapter_stop_t stop; /**< Stop adapter */ + rte_event_timer_adapter_get_info_t get_info; + /**< Get info from driver */ + rte_event_timer_adapter_stats_get_t stats_get; + /**< Get adapter statistics */ + rte_event_timer_adapter_stats_reset_t stats_reset; + /**< Reset adapter statistics */ + rte_event_timer_arm_burst_t arm_burst; + /**< Arm one or more event timers */ + rte_event_timer_arm_tmo_tick_burst_t arm_tmo_tick_burst; + /**< Arm event timers with same expiration time */ + rte_event_timer_cancel_burst_t cancel_burst; + /**< Cancel one or more event timers */ +}; + +/** + * @internal Adapter data; structure to be placed in shared memory to be + * accessible by various processes in a multi-process configuration. + */ +struct rte_event_timer_adapter_data { + uint8_t id; + /**< Event timer adapter ID */ + uint8_t event_dev_id; + /**< Event device ID */ + uint32_t socket_id; + /**< Socket ID where memory is allocated */ + uint8_t event_port_id; + /**< Optional: event port ID used when the inbuilt port is absent */ + const struct rte_memzone *mz; + /**< Event timer adapter memzone pointer */ + struct rte_event_timer_adapter_conf conf; + /**< Configuration used to configure the adapter. */ + uint32_t caps; + /**< Adapter capabilities */ + void *adapter_priv; + /**< Timer adapter private data*/ + uint8_t service_inited; + /**< Service initialization state */ + uint32_t service_id; + /**< Service ID*/ + + RTE_STD_C11 + uint8_t started : 1; + /**< Flag to indicate adapter started. 
*/ +} __rte_cache_aligned; + +#ifdef __cplusplus +} +#endif + +#endif /* __RTE_EVENT_TIMER_ADAPTER_PMD_H__ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.c b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.c new file mode 100644 index 00000000..801810ed --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.c @@ -0,0 +1,1357 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_eventdev.h" +#include "rte_eventdev_pmd.h" + +struct rte_eventdev rte_event_devices[RTE_EVENT_MAX_DEVS]; + +struct rte_eventdev *rte_eventdevs = &rte_event_devices[0]; + +static struct rte_eventdev_global eventdev_globals = { + .nb_devs = 0 +}; + +struct rte_eventdev_global *rte_eventdev_globals = &eventdev_globals; + +/* Event dev north bound API implementation */ + +uint8_t +rte_event_dev_count(void) +{ + return rte_eventdev_globals->nb_devs; +} + +int +rte_event_dev_get_dev_id(const char *name) +{ + int i; + uint8_t cmp; + + if (!name) + return -EINVAL; + + for (i = 0; i < rte_eventdev_globals->nb_devs; i++) { + cmp = (strncmp(rte_event_devices[i].data->name, name, + RTE_EVENTDEV_NAME_MAX_LEN) == 0) || + (rte_event_devices[i].dev ? (strncmp( + rte_event_devices[i].dev->driver->name, name, + RTE_EVENTDEV_NAME_MAX_LEN) == 0) : 0); + if (cmp && (rte_event_devices[i].attached == + RTE_EVENTDEV_ATTACHED)) + return i; + } + return -ENODEV; +} + +int +rte_event_dev_socket_id(uint8_t dev_id) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + return dev->data->socket_id; +} + +int +rte_event_dev_info_get(uint8_t dev_id, struct rte_event_dev_info *dev_info) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + if (dev_info == NULL) + return -EINVAL; + + memset(dev_info, 0, sizeof(struct rte_event_dev_info)); + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP); + (*dev->dev_ops->dev_infos_get)(dev, dev_info); + + dev_info->dequeue_timeout_ns = dev->data->dev_conf.dequeue_timeout_ns; + + dev_info->dev = dev->dev; + return 0; +} + +int +rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id, + uint32_t *caps) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + RTE_ETH_VALID_PORTID_OR_ERR_RET(eth_port_id, -EINVAL); + + dev = &rte_eventdevs[dev_id]; + + if (caps == NULL) + return -EINVAL; + *caps = 0; + + return dev->dev_ops->eth_rx_adapter_caps_get ? + (*dev->dev_ops->eth_rx_adapter_caps_get)(dev, + &rte_eth_devices[eth_port_id], + caps) + : 0; +} + +int __rte_experimental +rte_event_timer_adapter_caps_get(uint8_t dev_id, uint32_t *caps) +{ + struct rte_eventdev *dev; + const struct rte_event_timer_adapter_ops *ops; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + + dev = &rte_eventdevs[dev_id]; + + if (caps == NULL) + return -EINVAL; + *caps = 0; + + return dev->dev_ops->timer_adapter_caps_get ? 
+ (*dev->dev_ops->timer_adapter_caps_get)(dev, + 0, + caps, + &ops) + : 0; +} + +int __rte_experimental +rte_event_crypto_adapter_caps_get(uint8_t dev_id, uint8_t cdev_id, + uint32_t *caps) +{ + struct rte_eventdev *dev; + struct rte_cryptodev *cdev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + if (!rte_cryptodev_pmd_is_valid_dev(cdev_id)) + return -EINVAL; + + dev = &rte_eventdevs[dev_id]; + cdev = rte_cryptodev_pmd_get_dev(cdev_id); + + if (caps == NULL) + return -EINVAL; + *caps = 0; + + return dev->dev_ops->crypto_adapter_caps_get ? + (*dev->dev_ops->crypto_adapter_caps_get) + (dev, cdev, caps) : -ENOTSUP; +} + +static inline int +rte_event_dev_queue_config(struct rte_eventdev *dev, uint8_t nb_queues) +{ + uint8_t old_nb_queues = dev->data->nb_queues; + struct rte_event_queue_conf *queues_cfg; + unsigned int i; + + RTE_EDEV_LOG_DEBUG("Setup %d queues on device %u", nb_queues, + dev->data->dev_id); + + /* First time configuration */ + if (dev->data->queues_cfg == NULL && nb_queues != 0) { + /* Allocate memory to store queue configuration */ + dev->data->queues_cfg = rte_zmalloc_socket( + "eventdev->data->queues_cfg", + sizeof(dev->data->queues_cfg[0]) * nb_queues, + RTE_CACHE_LINE_SIZE, dev->data->socket_id); + if (dev->data->queues_cfg == NULL) { + dev->data->nb_queues = 0; + RTE_EDEV_LOG_ERR("failed to get mem for queue cfg," + "nb_queues %u", nb_queues); + return -(ENOMEM); + } + /* Re-configure */ + } else if (dev->data->queues_cfg != NULL && nb_queues != 0) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -ENOTSUP); + + for (i = nb_queues; i < old_nb_queues; i++) + (*dev->dev_ops->queue_release)(dev, i); + + /* Re allocate memory to store queue configuration */ + queues_cfg = dev->data->queues_cfg; + queues_cfg = rte_realloc(queues_cfg, + sizeof(queues_cfg[0]) * nb_queues, + RTE_CACHE_LINE_SIZE); + if (queues_cfg == NULL) { + RTE_EDEV_LOG_ERR("failed to realloc queue cfg memory," + " nb_queues %u", nb_queues); + return -(ENOMEM); + } + dev->data->queues_cfg = queues_cfg; + + if (nb_queues > old_nb_queues) { + uint8_t new_qs = nb_queues - old_nb_queues; + + memset(queues_cfg + old_nb_queues, 0, + sizeof(queues_cfg[0]) * new_qs); + } + } else if (dev->data->queues_cfg != NULL && nb_queues == 0) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -ENOTSUP); + + for (i = nb_queues; i < old_nb_queues; i++) + (*dev->dev_ops->queue_release)(dev, i); + } + + dev->data->nb_queues = nb_queues; + return 0; +} + +#define EVENT_QUEUE_SERVICE_PRIORITY_INVALID (0xdead) + +static inline int +rte_event_dev_port_config(struct rte_eventdev *dev, uint8_t nb_ports) +{ + uint8_t old_nb_ports = dev->data->nb_ports; + void **ports; + uint16_t *links_map; + struct rte_event_port_conf *ports_cfg; + unsigned int i; + + RTE_EDEV_LOG_DEBUG("Setup %d ports on device %u", nb_ports, + dev->data->dev_id); + + /* First time configuration */ + if (dev->data->ports == NULL && nb_ports != 0) { + dev->data->ports = rte_zmalloc_socket("eventdev->data->ports", + sizeof(dev->data->ports[0]) * nb_ports, + RTE_CACHE_LINE_SIZE, dev->data->socket_id); + if (dev->data->ports == NULL) { + dev->data->nb_ports = 0; + RTE_EDEV_LOG_ERR("failed to get mem for port meta data," + "nb_ports %u", nb_ports); + return -(ENOMEM); + } + + /* Allocate memory to store port configurations */ + dev->data->ports_cfg = + rte_zmalloc_socket("eventdev->ports_cfg", + sizeof(dev->data->ports_cfg[0]) * nb_ports, + RTE_CACHE_LINE_SIZE, dev->data->socket_id); + if (dev->data->ports_cfg == NULL) { + dev->data->nb_ports = 0; 
+ RTE_EDEV_LOG_ERR("failed to get mem for port cfg," + "nb_ports %u", nb_ports); + return -(ENOMEM); + } + + /* Allocate memory to store queue to port link connection */ + dev->data->links_map = + rte_zmalloc_socket("eventdev->links_map", + sizeof(dev->data->links_map[0]) * nb_ports * + RTE_EVENT_MAX_QUEUES_PER_DEV, + RTE_CACHE_LINE_SIZE, dev->data->socket_id); + if (dev->data->links_map == NULL) { + dev->data->nb_ports = 0; + RTE_EDEV_LOG_ERR("failed to get mem for port_map area," + "nb_ports %u", nb_ports); + return -(ENOMEM); + } + for (i = 0; i < nb_ports * RTE_EVENT_MAX_QUEUES_PER_DEV; i++) + dev->data->links_map[i] = + EVENT_QUEUE_SERVICE_PRIORITY_INVALID; + } else if (dev->data->ports != NULL && nb_ports != 0) {/* re-config */ + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->port_release, -ENOTSUP); + + ports = dev->data->ports; + ports_cfg = dev->data->ports_cfg; + links_map = dev->data->links_map; + + for (i = nb_ports; i < old_nb_ports; i++) + (*dev->dev_ops->port_release)(ports[i]); + + /* Realloc memory for ports */ + ports = rte_realloc(ports, sizeof(ports[0]) * nb_ports, + RTE_CACHE_LINE_SIZE); + if (ports == NULL) { + RTE_EDEV_LOG_ERR("failed to realloc port meta data," + " nb_ports %u", nb_ports); + return -(ENOMEM); + } + + /* Realloc memory for ports_cfg */ + ports_cfg = rte_realloc(ports_cfg, + sizeof(ports_cfg[0]) * nb_ports, + RTE_CACHE_LINE_SIZE); + if (ports_cfg == NULL) { + RTE_EDEV_LOG_ERR("failed to realloc port cfg mem," + " nb_ports %u", nb_ports); + return -(ENOMEM); + } + + /* Realloc memory to store queue to port link connection */ + links_map = rte_realloc(links_map, + sizeof(dev->data->links_map[0]) * nb_ports * + RTE_EVENT_MAX_QUEUES_PER_DEV, + RTE_CACHE_LINE_SIZE); + if (links_map == NULL) { + dev->data->nb_ports = 0; + RTE_EDEV_LOG_ERR("failed to realloc mem for port_map," + "nb_ports %u", nb_ports); + return -(ENOMEM); + } + + if (nb_ports > old_nb_ports) { + uint8_t new_ps = nb_ports - old_nb_ports; + unsigned int old_links_map_end = + old_nb_ports * RTE_EVENT_MAX_QUEUES_PER_DEV; + unsigned int links_map_end = + nb_ports * RTE_EVENT_MAX_QUEUES_PER_DEV; + + memset(ports + old_nb_ports, 0, + sizeof(ports[0]) * new_ps); + memset(ports_cfg + old_nb_ports, 0, + sizeof(ports_cfg[0]) * new_ps); + for (i = old_links_map_end; i < links_map_end; i++) + links_map[i] = + EVENT_QUEUE_SERVICE_PRIORITY_INVALID; + } + + dev->data->ports = ports; + dev->data->ports_cfg = ports_cfg; + dev->data->links_map = links_map; + } else if (dev->data->ports != NULL && nb_ports == 0) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->port_release, -ENOTSUP); + + ports = dev->data->ports; + for (i = nb_ports; i < old_nb_ports; i++) + (*dev->dev_ops->port_release)(ports[i]); + } + + dev->data->nb_ports = nb_ports; + return 0; +} + +int +rte_event_dev_configure(uint8_t dev_id, + const struct rte_event_dev_config *dev_conf) +{ + struct rte_eventdev *dev; + struct rte_event_dev_info info; + int diag; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP); + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP); + + if (dev->data->dev_started) { + RTE_EDEV_LOG_ERR( + "device %d must be stopped to allow configuration", dev_id); + return -EBUSY; + } + + if (dev_conf == NULL) + return -EINVAL; + + (*dev->dev_ops->dev_infos_get)(dev, &info); + + /* Check dequeue_timeout_ns value is in limit */ + if (!(dev_conf->event_dev_cfg & RTE_EVENT_DEV_CFG_PER_DEQUEUE_TIMEOUT)) { + if 
(dev_conf->dequeue_timeout_ns && + (dev_conf->dequeue_timeout_ns < info.min_dequeue_timeout_ns + || dev_conf->dequeue_timeout_ns > + info.max_dequeue_timeout_ns)) { + RTE_EDEV_LOG_ERR("dev%d invalid dequeue_timeout_ns=%d" + " min_dequeue_timeout_ns=%d max_dequeue_timeout_ns=%d", + dev_id, dev_conf->dequeue_timeout_ns, + info.min_dequeue_timeout_ns, + info.max_dequeue_timeout_ns); + return -EINVAL; + } + } + + /* Check nb_events_limit is in limit */ + if (dev_conf->nb_events_limit > info.max_num_events) { + RTE_EDEV_LOG_ERR("dev%d nb_events_limit=%d > max_num_events=%d", + dev_id, dev_conf->nb_events_limit, info.max_num_events); + return -EINVAL; + } + + /* Check nb_event_queues is in limit */ + if (!dev_conf->nb_event_queues) { + RTE_EDEV_LOG_ERR("dev%d nb_event_queues cannot be zero", + dev_id); + return -EINVAL; + } + if (dev_conf->nb_event_queues > info.max_event_queues) { + RTE_EDEV_LOG_ERR("%d nb_event_queues=%d > max_event_queues=%d", + dev_id, dev_conf->nb_event_queues, info.max_event_queues); + return -EINVAL; + } + + /* Check nb_event_ports is in limit */ + if (!dev_conf->nb_event_ports) { + RTE_EDEV_LOG_ERR("dev%d nb_event_ports cannot be zero", dev_id); + return -EINVAL; + } + if (dev_conf->nb_event_ports > info.max_event_ports) { + RTE_EDEV_LOG_ERR("id%d nb_event_ports=%d > max_event_ports= %d", + dev_id, dev_conf->nb_event_ports, info.max_event_ports); + return -EINVAL; + } + + /* Check nb_event_queue_flows is in limit */ + if (!dev_conf->nb_event_queue_flows) { + RTE_EDEV_LOG_ERR("dev%d nb_flows cannot be zero", dev_id); + return -EINVAL; + } + if (dev_conf->nb_event_queue_flows > info.max_event_queue_flows) { + RTE_EDEV_LOG_ERR("dev%d nb_flows=%x > max_flows=%x", + dev_id, dev_conf->nb_event_queue_flows, + info.max_event_queue_flows); + return -EINVAL; + } + + /* Check nb_event_port_dequeue_depth is in limit */ + if (!dev_conf->nb_event_port_dequeue_depth) { + RTE_EDEV_LOG_ERR("dev%d nb_dequeue_depth cannot be zero", + dev_id); + return -EINVAL; + } + if ((info.event_dev_cap & RTE_EVENT_DEV_CAP_BURST_MODE) && + (dev_conf->nb_event_port_dequeue_depth > + info.max_event_port_dequeue_depth)) { + RTE_EDEV_LOG_ERR("dev%d nb_dq_depth=%d > max_dq_depth=%d", + dev_id, dev_conf->nb_event_port_dequeue_depth, + info.max_event_port_dequeue_depth); + return -EINVAL; + } + + /* Check nb_event_port_enqueue_depth is in limit */ + if (!dev_conf->nb_event_port_enqueue_depth) { + RTE_EDEV_LOG_ERR("dev%d nb_enqueue_depth cannot be zero", + dev_id); + return -EINVAL; + } + if ((info.event_dev_cap & RTE_EVENT_DEV_CAP_BURST_MODE) && + (dev_conf->nb_event_port_enqueue_depth > + info.max_event_port_enqueue_depth)) { + RTE_EDEV_LOG_ERR("dev%d nb_enq_depth=%d > max_enq_depth=%d", + dev_id, dev_conf->nb_event_port_enqueue_depth, + info.max_event_port_enqueue_depth); + return -EINVAL; + } + + /* Copy the dev_conf parameter into the dev structure */ + memcpy(&dev->data->dev_conf, dev_conf, sizeof(dev->data->dev_conf)); + + /* Setup new number of queues and reconfigure device. */ + diag = rte_event_dev_queue_config(dev, dev_conf->nb_event_queues); + if (diag != 0) { + RTE_EDEV_LOG_ERR("dev%d rte_event_dev_queue_config = %d", + dev_id, diag); + return diag; + } + + /* Setup new number of ports and reconfigure device. 
*/ + diag = rte_event_dev_port_config(dev, dev_conf->nb_event_ports); + if (diag != 0) { + rte_event_dev_queue_config(dev, 0); + RTE_EDEV_LOG_ERR("dev%d rte_event_dev_port_config = %d", + dev_id, diag); + return diag; + } + + /* Configure the device */ + diag = (*dev->dev_ops->dev_configure)(dev); + if (diag != 0) { + RTE_EDEV_LOG_ERR("dev%d dev_configure = %d", dev_id, diag); + rte_event_dev_queue_config(dev, 0); + rte_event_dev_port_config(dev, 0); + } + + dev->data->event_dev_cap = info.event_dev_cap; + return diag; +} + +static inline int +is_valid_queue(struct rte_eventdev *dev, uint8_t queue_id) +{ + if (queue_id < dev->data->nb_queues && queue_id < + RTE_EVENT_MAX_QUEUES_PER_DEV) + return 1; + else + return 0; +} + +int +rte_event_queue_default_conf_get(uint8_t dev_id, uint8_t queue_id, + struct rte_event_queue_conf *queue_conf) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + if (queue_conf == NULL) + return -EINVAL; + + if (!is_valid_queue(dev, queue_id)) { + RTE_EDEV_LOG_ERR("Invalid queue_id=%" PRIu8, queue_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_def_conf, -ENOTSUP); + memset(queue_conf, 0, sizeof(struct rte_event_queue_conf)); + (*dev->dev_ops->queue_def_conf)(dev, queue_id, queue_conf); + return 0; +} + +static inline int +is_valid_atomic_queue_conf(const struct rte_event_queue_conf *queue_conf) +{ + if (queue_conf && + !(queue_conf->event_queue_cfg & + RTE_EVENT_QUEUE_CFG_SINGLE_LINK) && + ((queue_conf->event_queue_cfg & + RTE_EVENT_QUEUE_CFG_ALL_TYPES) || + (queue_conf->schedule_type + == RTE_SCHED_TYPE_ATOMIC) + )) + return 1; + else + return 0; +} + +static inline int +is_valid_ordered_queue_conf(const struct rte_event_queue_conf *queue_conf) +{ + if (queue_conf && + !(queue_conf->event_queue_cfg & + RTE_EVENT_QUEUE_CFG_SINGLE_LINK) && + ((queue_conf->event_queue_cfg & + RTE_EVENT_QUEUE_CFG_ALL_TYPES) || + (queue_conf->schedule_type + == RTE_SCHED_TYPE_ORDERED) + )) + return 1; + else + return 0; +} + + +int +rte_event_queue_setup(uint8_t dev_id, uint8_t queue_id, + const struct rte_event_queue_conf *queue_conf) +{ + struct rte_eventdev *dev; + struct rte_event_queue_conf def_conf; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + if (!is_valid_queue(dev, queue_id)) { + RTE_EDEV_LOG_ERR("Invalid queue_id=%" PRIu8, queue_id); + return -EINVAL; + } + + /* Check nb_atomic_flows limit */ + if (is_valid_atomic_queue_conf(queue_conf)) { + if (queue_conf->nb_atomic_flows == 0 || + queue_conf->nb_atomic_flows > + dev->data->dev_conf.nb_event_queue_flows) { + RTE_EDEV_LOG_ERR( + "dev%d queue%d Invalid nb_atomic_flows=%d max_flows=%d", + dev_id, queue_id, queue_conf->nb_atomic_flows, + dev->data->dev_conf.nb_event_queue_flows); + return -EINVAL; + } + } + + /* Check nb_atomic_order_sequences limit */ + if (is_valid_ordered_queue_conf(queue_conf)) { + if (queue_conf->nb_atomic_order_sequences == 0 || + queue_conf->nb_atomic_order_sequences > + dev->data->dev_conf.nb_event_queue_flows) { + RTE_EDEV_LOG_ERR( + "dev%d queue%d Invalid nb_atomic_order_seq=%d max_flows=%d", + dev_id, queue_id, queue_conf->nb_atomic_order_sequences, + dev->data->dev_conf.nb_event_queue_flows); + return -EINVAL; + } + } + + if (dev->data->dev_started) { + RTE_EDEV_LOG_ERR( + "device %d must be stopped to allow queue setup", dev_id); + return -EBUSY; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_setup, -ENOTSUP); + + if (queue_conf == NULL) { + 
RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_def_conf, + -ENOTSUP); + (*dev->dev_ops->queue_def_conf)(dev, queue_id, &def_conf); + queue_conf = &def_conf; + } + + dev->data->queues_cfg[queue_id] = *queue_conf; + return (*dev->dev_ops->queue_setup)(dev, queue_id, queue_conf); +} + +static inline int +is_valid_port(struct rte_eventdev *dev, uint8_t port_id) +{ + if (port_id < dev->data->nb_ports) + return 1; + else + return 0; +} + +int +rte_event_port_default_conf_get(uint8_t dev_id, uint8_t port_id, + struct rte_event_port_conf *port_conf) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + if (port_conf == NULL) + return -EINVAL; + + if (!is_valid_port(dev, port_id)) { + RTE_EDEV_LOG_ERR("Invalid port_id=%" PRIu8, port_id); + return -EINVAL; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->port_def_conf, -ENOTSUP); + memset(port_conf, 0, sizeof(struct rte_event_port_conf)); + (*dev->dev_ops->port_def_conf)(dev, port_id, port_conf); + return 0; +} + +int +rte_event_port_setup(uint8_t dev_id, uint8_t port_id, + const struct rte_event_port_conf *port_conf) +{ + struct rte_eventdev *dev; + struct rte_event_port_conf def_conf; + int diag; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + if (!is_valid_port(dev, port_id)) { + RTE_EDEV_LOG_ERR("Invalid port_id=%" PRIu8, port_id); + return -EINVAL; + } + + /* Check new_event_threshold limit */ + if ((port_conf && !port_conf->new_event_threshold) || + (port_conf && port_conf->new_event_threshold > + dev->data->dev_conf.nb_events_limit)) { + RTE_EDEV_LOG_ERR( + "dev%d port%d Invalid event_threshold=%d nb_events_limit=%d", + dev_id, port_id, port_conf->new_event_threshold, + dev->data->dev_conf.nb_events_limit); + return -EINVAL; + } + + /* Check dequeue_depth limit */ + if ((port_conf && !port_conf->dequeue_depth) || + (port_conf && port_conf->dequeue_depth > + dev->data->dev_conf.nb_event_port_dequeue_depth)) { + RTE_EDEV_LOG_ERR( + "dev%d port%d Invalid dequeue depth=%d max_dequeue_depth=%d", + dev_id, port_id, port_conf->dequeue_depth, + dev->data->dev_conf.nb_event_port_dequeue_depth); + return -EINVAL; + } + + /* Check enqueue_depth limit */ + if ((port_conf && !port_conf->enqueue_depth) || + (port_conf && port_conf->enqueue_depth > + dev->data->dev_conf.nb_event_port_enqueue_depth)) { + RTE_EDEV_LOG_ERR( + "dev%d port%d Invalid enqueue depth=%d max_enqueue_depth=%d", + dev_id, port_id, port_conf->enqueue_depth, + dev->data->dev_conf.nb_event_port_enqueue_depth); + return -EINVAL; + } + + if (port_conf && port_conf->disable_implicit_release && + !(dev->data->event_dev_cap & + RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE)) { + RTE_EDEV_LOG_ERR( + "dev%d port%d Implicit release disable not supported", + dev_id, port_id); + return -EINVAL; + } + + if (dev->data->dev_started) { + RTE_EDEV_LOG_ERR( + "device %d must be stopped to allow port setup", dev_id); + return -EBUSY; + } + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->port_setup, -ENOTSUP); + + if (port_conf == NULL) { + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->port_def_conf, + -ENOTSUP); + (*dev->dev_ops->port_def_conf)(dev, port_id, &def_conf); + port_conf = &def_conf; + } + + dev->data->ports_cfg[port_id] = *port_conf; + + diag = (*dev->dev_ops->port_setup)(dev, port_id, port_conf); + + /* Unlink all the queues from this port(default state after setup) */ + if (!diag) + diag = rte_event_port_unlink(dev_id, port_id, NULL, 0); + + if (diag < 0) + return diag; + + return 0; +} + +int 
+rte_event_dev_attr_get(uint8_t dev_id, uint32_t attr_id, + uint32_t *attr_value) +{ + struct rte_eventdev *dev; + + if (!attr_value) + return -EINVAL; + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + switch (attr_id) { + case RTE_EVENT_DEV_ATTR_PORT_COUNT: + *attr_value = dev->data->nb_ports; + break; + case RTE_EVENT_DEV_ATTR_QUEUE_COUNT: + *attr_value = dev->data->nb_queues; + break; + case RTE_EVENT_DEV_ATTR_STARTED: + *attr_value = dev->data->dev_started; + break; + default: + return -EINVAL; + } + + return 0; +} + +int +rte_event_port_attr_get(uint8_t dev_id, uint8_t port_id, uint32_t attr_id, + uint32_t *attr_value) +{ + struct rte_eventdev *dev; + + if (!attr_value) + return -EINVAL; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + if (!is_valid_port(dev, port_id)) { + RTE_EDEV_LOG_ERR("Invalid port_id=%" PRIu8, port_id); + return -EINVAL; + } + + switch (attr_id) { + case RTE_EVENT_PORT_ATTR_ENQ_DEPTH: + *attr_value = dev->data->ports_cfg[port_id].enqueue_depth; + break; + case RTE_EVENT_PORT_ATTR_DEQ_DEPTH: + *attr_value = dev->data->ports_cfg[port_id].dequeue_depth; + break; + case RTE_EVENT_PORT_ATTR_NEW_EVENT_THRESHOLD: + *attr_value = dev->data->ports_cfg[port_id].new_event_threshold; + break; + default: + return -EINVAL; + }; + return 0; +} + +int +rte_event_queue_attr_get(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id, + uint32_t *attr_value) +{ + struct rte_event_queue_conf *conf; + struct rte_eventdev *dev; + + if (!attr_value) + return -EINVAL; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + if (!is_valid_queue(dev, queue_id)) { + RTE_EDEV_LOG_ERR("Invalid queue_id=%" PRIu8, queue_id); + return -EINVAL; + } + + conf = &dev->data->queues_cfg[queue_id]; + + switch (attr_id) { + case RTE_EVENT_QUEUE_ATTR_PRIORITY: + *attr_value = RTE_EVENT_DEV_PRIORITY_NORMAL; + if (dev->data->event_dev_cap & RTE_EVENT_DEV_CAP_QUEUE_QOS) + *attr_value = conf->priority; + break; + case RTE_EVENT_QUEUE_ATTR_NB_ATOMIC_FLOWS: + *attr_value = conf->nb_atomic_flows; + break; + case RTE_EVENT_QUEUE_ATTR_NB_ATOMIC_ORDER_SEQUENCES: + *attr_value = conf->nb_atomic_order_sequences; + break; + case RTE_EVENT_QUEUE_ATTR_EVENT_QUEUE_CFG: + *attr_value = conf->event_queue_cfg; + break; + case RTE_EVENT_QUEUE_ATTR_SCHEDULE_TYPE: + if (conf->event_queue_cfg & RTE_EVENT_QUEUE_CFG_ALL_TYPES) + return -EOVERFLOW; + + *attr_value = conf->schedule_type; + break; + default: + return -EINVAL; + }; + return 0; +} + +int +rte_event_port_link(uint8_t dev_id, uint8_t port_id, + const uint8_t queues[], const uint8_t priorities[], + uint16_t nb_links) +{ + struct rte_eventdev *dev; + uint8_t queues_list[RTE_EVENT_MAX_QUEUES_PER_DEV]; + uint8_t priorities_list[RTE_EVENT_MAX_QUEUES_PER_DEV]; + uint16_t *links_map; + int i, diag; + + RTE_EVENTDEV_VALID_DEVID_OR_ERRNO_RET(dev_id, -EINVAL, 0); + dev = &rte_eventdevs[dev_id]; + + if (*dev->dev_ops->port_link == NULL) { + RTE_PMD_DEBUG_TRACE("Function not supported\n"); + rte_errno = -ENOTSUP; + return 0; + } + + if (!is_valid_port(dev, port_id)) { + RTE_EDEV_LOG_ERR("Invalid port_id=%" PRIu8, port_id); + rte_errno = -EINVAL; + return 0; + } + + if (queues == NULL) { + for (i = 0; i < dev->data->nb_queues; i++) + queues_list[i] = i; + + queues = queues_list; + nb_links = dev->data->nb_queues; + } + + if (priorities == NULL) { + for (i = 0; i < nb_links; i++) + priorities_list[i] = RTE_EVENT_DEV_PRIORITY_NORMAL; + + priorities = priorities_list; 
+ } + + for (i = 0; i < nb_links; i++) + if (queues[i] >= dev->data->nb_queues) { + rte_errno = -EINVAL; + return 0; + } + + diag = (*dev->dev_ops->port_link)(dev, dev->data->ports[port_id], + queues, priorities, nb_links); + if (diag < 0) + return diag; + + links_map = dev->data->links_map; + /* Point links_map to this port specific area */ + links_map += (port_id * RTE_EVENT_MAX_QUEUES_PER_DEV); + for (i = 0; i < diag; i++) + links_map[queues[i]] = (uint8_t)priorities[i]; + + return diag; +} + +int +rte_event_port_unlink(uint8_t dev_id, uint8_t port_id, + uint8_t queues[], uint16_t nb_unlinks) +{ + struct rte_eventdev *dev; + uint8_t all_queues[RTE_EVENT_MAX_QUEUES_PER_DEV]; + int i, diag, j; + uint16_t *links_map; + + RTE_EVENTDEV_VALID_DEVID_OR_ERRNO_RET(dev_id, -EINVAL, 0); + dev = &rte_eventdevs[dev_id]; + + if (*dev->dev_ops->port_unlink == NULL) { + RTE_PMD_DEBUG_TRACE("Function not supported\n"); + rte_errno = -ENOTSUP; + return 0; + } + + if (!is_valid_port(dev, port_id)) { + RTE_EDEV_LOG_ERR("Invalid port_id=%" PRIu8, port_id); + rte_errno = -EINVAL; + return 0; + } + + links_map = dev->data->links_map; + /* Point links_map to this port specific area */ + links_map += (port_id * RTE_EVENT_MAX_QUEUES_PER_DEV); + + if (queues == NULL) { + j = 0; + for (i = 0; i < dev->data->nb_queues; i++) { + if (links_map[i] != + EVENT_QUEUE_SERVICE_PRIORITY_INVALID) { + all_queues[j] = i; + j++; + } + } + queues = all_queues; + } else { + for (j = 0; j < nb_unlinks; j++) { + if (links_map[queues[j]] == + EVENT_QUEUE_SERVICE_PRIORITY_INVALID) + break; + } + } + + nb_unlinks = j; + for (i = 0; i < nb_unlinks; i++) + if (queues[i] >= dev->data->nb_queues) { + rte_errno = -EINVAL; + return 0; + } + + diag = (*dev->dev_ops->port_unlink)(dev, dev->data->ports[port_id], + queues, nb_unlinks); + + if (diag < 0) + return diag; + + for (i = 0; i < diag; i++) + links_map[queues[i]] = EVENT_QUEUE_SERVICE_PRIORITY_INVALID; + + return diag; +} + +int +rte_event_port_links_get(uint8_t dev_id, uint8_t port_id, + uint8_t queues[], uint8_t priorities[]) +{ + struct rte_eventdev *dev; + uint16_t *links_map; + int i, count = 0; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + if (!is_valid_port(dev, port_id)) { + RTE_EDEV_LOG_ERR("Invalid port_id=%" PRIu8, port_id); + return -EINVAL; + } + + links_map = dev->data->links_map; + /* Point links_map to this port specific area */ + links_map += (port_id * RTE_EVENT_MAX_QUEUES_PER_DEV); + for (i = 0; i < dev->data->nb_queues; i++) { + if (links_map[i] != EVENT_QUEUE_SERVICE_PRIORITY_INVALID) { + queues[count] = i; + priorities[count] = (uint8_t)links_map[i]; + ++count; + } + } + return count; +} + +int +rte_event_dequeue_timeout_ticks(uint8_t dev_id, uint64_t ns, + uint64_t *timeout_ticks) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timeout_ticks, -ENOTSUP); + + if (timeout_ticks == NULL) + return -EINVAL; + + return (*dev->dev_ops->timeout_ticks)(dev, ns, timeout_ticks); +} + +int +rte_event_dev_service_id_get(uint8_t dev_id, uint32_t *service_id) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + if (service_id == NULL) + return -EINVAL; + + if (dev->data->service_inited) + *service_id = dev->data->service_id; + + return dev->data->service_inited ? 
0 : -ESRCH; +} + +int +rte_event_dev_dump(uint8_t dev_id, FILE *f) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dump, -ENOTSUP); + + (*dev->dev_ops->dump)(dev, f); + return 0; + +} + +static int +xstats_get_count(uint8_t dev_id, enum rte_event_dev_xstats_mode mode, + uint8_t queue_port_id) +{ + struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + if (dev->dev_ops->xstats_get_names != NULL) + return (*dev->dev_ops->xstats_get_names)(dev, mode, + queue_port_id, + NULL, NULL, 0); + return 0; +} + +int +rte_event_dev_xstats_names_get(uint8_t dev_id, + enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id, + struct rte_event_dev_xstats_name *xstats_names, + unsigned int *ids, unsigned int size) +{ + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -ENODEV); + const int cnt_expected_entries = xstats_get_count(dev_id, mode, + queue_port_id); + if (xstats_names == NULL || cnt_expected_entries < 0 || + (int)size < cnt_expected_entries) + return cnt_expected_entries; + + /* dev_id checked above */ + const struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + + if (dev->dev_ops->xstats_get_names != NULL) + return (*dev->dev_ops->xstats_get_names)(dev, mode, + queue_port_id, xstats_names, ids, size); + + return -ENOTSUP; +} + +/* retrieve eventdev extended statistics */ +int +rte_event_dev_xstats_get(uint8_t dev_id, enum rte_event_dev_xstats_mode mode, + uint8_t queue_port_id, const unsigned int ids[], + uint64_t values[], unsigned int n) +{ + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -ENODEV); + const struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + + /* implemented by the driver */ + if (dev->dev_ops->xstats_get != NULL) + return (*dev->dev_ops->xstats_get)(dev, mode, queue_port_id, + ids, values, n); + return -ENOTSUP; +} + +uint64_t +rte_event_dev_xstats_by_name_get(uint8_t dev_id, const char *name, + unsigned int *id) +{ + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, 0); + const struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + unsigned int temp = -1; + + if (id != NULL) + *id = (unsigned int)-1; + else + id = &temp; /* ensure driver never gets a NULL value */ + + /* implemented by driver */ + if (dev->dev_ops->xstats_get_by_name != NULL) + return (*dev->dev_ops->xstats_get_by_name)(dev, name, id); + return -ENOTSUP; +} + +int rte_event_dev_xstats_reset(uint8_t dev_id, + enum rte_event_dev_xstats_mode mode, int16_t queue_port_id, + const uint32_t ids[], uint32_t nb_ids) +{ + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + + if (dev->dev_ops->xstats_reset != NULL) + return (*dev->dev_ops->xstats_reset)(dev, mode, queue_port_id, + ids, nb_ids); + return -ENOTSUP; +} + +int rte_event_dev_selftest(uint8_t dev_id) +{ + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + + if (dev->dev_ops->dev_selftest != NULL) + return (*dev->dev_ops->dev_selftest)(); + return -ENOTSUP; +} + +int +rte_event_dev_start(uint8_t dev_id) +{ + struct rte_eventdev *dev; + int diag; + + RTE_EDEV_LOG_DEBUG("Start dev_id=%" PRIu8, dev_id); + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_start, -ENOTSUP); + + if (dev->data->dev_started != 0) { + RTE_EDEV_LOG_ERR("Device with dev_id=%" PRIu8 "already started", + dev_id); + return 0; + } + + diag = (*dev->dev_ops->dev_start)(dev); + if (diag == 0) + 
dev->data->dev_started = 1; + else + return diag; + + return 0; +} + +int +rte_event_dev_stop_flush_callback_register(uint8_t dev_id, + eventdev_stop_flush_t callback, void *userdata) +{ + struct rte_eventdev *dev; + + RTE_EDEV_LOG_DEBUG("Stop flush register dev_id=%" PRIu8, dev_id); + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + + dev->dev_ops->dev_stop_flush = callback; + dev->data->dev_stop_flush_arg = userdata; + + return 0; +} + +void +rte_event_dev_stop(uint8_t dev_id) +{ + struct rte_eventdev *dev; + + RTE_EDEV_LOG_DEBUG("Stop dev_id=%" PRIu8, dev_id); + + RTE_EVENTDEV_VALID_DEVID_OR_RET(dev_id); + dev = &rte_eventdevs[dev_id]; + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_stop); + + if (dev->data->dev_started == 0) { + RTE_EDEV_LOG_ERR("Device with dev_id=%" PRIu8 "already stopped", + dev_id); + return; + } + + dev->data->dev_started = 0; + (*dev->dev_ops->dev_stop)(dev); +} + +int +rte_event_dev_close(uint8_t dev_id) +{ + struct rte_eventdev *dev; + + RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_eventdevs[dev_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP); + + /* Device must be stopped before it can be closed */ + if (dev->data->dev_started == 1) { + RTE_EDEV_LOG_ERR("Device %u must be stopped before closing", + dev_id); + return -EBUSY; + } + + return (*dev->dev_ops->dev_close)(dev); +} + +static inline int +rte_eventdev_data_alloc(uint8_t dev_id, struct rte_eventdev_data **data, + int socket_id) +{ + char mz_name[RTE_EVENTDEV_NAME_MAX_LEN]; + const struct rte_memzone *mz; + int n; + + /* Generate memzone name */ + n = snprintf(mz_name, sizeof(mz_name), "rte_eventdev_data_%u", dev_id); + if (n >= (int)sizeof(mz_name)) + return -EINVAL; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + mz = rte_memzone_reserve(mz_name, + sizeof(struct rte_eventdev_data), + socket_id, 0); + } else + mz = rte_memzone_lookup(mz_name); + + if (mz == NULL) + return -ENOMEM; + + *data = mz->addr; + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + memset(*data, 0, sizeof(struct rte_eventdev_data)); + + return 0; +} + +static inline uint8_t +rte_eventdev_find_free_device_index(void) +{ + uint8_t dev_id; + + for (dev_id = 0; dev_id < RTE_EVENT_MAX_DEVS; dev_id++) { + if (rte_eventdevs[dev_id].attached == + RTE_EVENTDEV_DETACHED) + return dev_id; + } + return RTE_EVENT_MAX_DEVS; +} + +struct rte_eventdev * +rte_event_pmd_allocate(const char *name, int socket_id) +{ + struct rte_eventdev *eventdev; + uint8_t dev_id; + + if (rte_event_pmd_get_named_dev(name) != NULL) { + RTE_EDEV_LOG_ERR("Event device with name %s already " + "allocated!", name); + return NULL; + } + + dev_id = rte_eventdev_find_free_device_index(); + if (dev_id == RTE_EVENT_MAX_DEVS) { + RTE_EDEV_LOG_ERR("Reached maximum number of event devices"); + return NULL; + } + + eventdev = &rte_eventdevs[dev_id]; + + if (eventdev->data == NULL) { + struct rte_eventdev_data *eventdev_data = NULL; + + int retval = rte_eventdev_data_alloc(dev_id, &eventdev_data, + socket_id); + + if (retval < 0 || eventdev_data == NULL) + return NULL; + + eventdev->data = eventdev_data; + + snprintf(eventdev->data->name, RTE_EVENTDEV_NAME_MAX_LEN, + "%s", name); + + eventdev->data->dev_id = dev_id; + eventdev->data->socket_id = socket_id; + eventdev->data->dev_started = 0; + + eventdev->attached = RTE_EVENTDEV_ATTACHED; + + eventdev_globals.nb_devs++; + } + + return eventdev; +} + +int +rte_event_pmd_release(struct rte_eventdev *eventdev) +{ + int ret; + char 
mz_name[RTE_EVENTDEV_NAME_MAX_LEN]; + const struct rte_memzone *mz; + + if (eventdev == NULL) + return -EINVAL; + + eventdev->attached = RTE_EVENTDEV_DETACHED; + eventdev_globals.nb_devs--; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_free(eventdev->data->dev_private); + + /* Generate memzone name */ + ret = snprintf(mz_name, sizeof(mz_name), "rte_eventdev_data_%u", + eventdev->data->dev_id); + if (ret >= (int)sizeof(mz_name)) + return -EINVAL; + + mz = rte_memzone_lookup(mz_name); + if (mz == NULL) + return -ENOMEM; + + ret = rte_memzone_free(mz); + if (ret) + return ret; + } + + eventdev->data = NULL; + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.h b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.h new file mode 100644 index 00000000..b6fd6ee7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev.h @@ -0,0 +1,1920 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc. + * Copyright(c) 2016-2018 Intel Corporation. + * Copyright 2016 NXP + * All rights reserved. + */ + +#ifndef _RTE_EVENTDEV_H_ +#define _RTE_EVENTDEV_H_ + +/** + * @file + * + * RTE Event Device API + * + * In a polling model, lcores poll ethdev ports and associated rx queues + * directly to look for packet. In an event driven model, by contrast, lcores + * call the scheduler that selects packets for them based on programmer + * specified criteria. Eventdev library adds support for event driven + * programming model, which offer applications automatic multicore scaling, + * dynamic load balancing, pipelining, packet ingress order maintenance and + * synchronization services to simplify application packet processing. + * + * The Event Device API is composed of two parts: + * + * - The application-oriented Event API that includes functions to setup + * an event device (configure it, setup its queues, ports and start it), to + * establish the link between queues to port and to receive events, and so on. + * + * - The driver-oriented Event API that exports a function allowing + * an event poll Mode Driver (PMD) to simultaneously register itself as + * an event device driver. 
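+ *
+ * As a quick orientation before the component diagram and the detailed
+ * call-order description further below, a minimal and purely illustrative
+ * application-side bring-up of a device *dev_id* (one queue, one port,
+ * default queue/port configuration, error handling omitted) looks roughly
+ * as follows; the structures and functions used are the ones declared later
+ * in this file, while the concrete values are placeholders:
+ *
+ * \code{.c}
+ *	struct rte_event_dev_info info;
+ *	struct rte_event_dev_config config = {0};
+ *
+ *	rte_event_dev_info_get(dev_id, &info);
+ *	config.nb_event_queues = 1;
+ *	config.nb_event_ports = 1;
+ *	config.nb_events_limit = info.max_num_events;
+ *	config.nb_event_queue_flows = info.max_event_queue_flows;
+ *	config.nb_event_port_dequeue_depth = info.max_event_port_dequeue_depth;
+ *	config.nb_event_port_enqueue_depth = info.max_event_port_enqueue_depth;
+ *	config.dequeue_timeout_ns = info.min_dequeue_timeout_ns;
+ *	rte_event_dev_configure(dev_id, &config);
+ *
+ *	rte_event_queue_setup(dev_id, 0, NULL);
+ *	rte_event_port_setup(dev_id, 0, NULL);
+ *	rte_event_port_link(dev_id, 0, NULL, NULL, 0);
+ *	rte_event_dev_start(dev_id);
+ * \endcode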
+ * + * Event device components: + * + * +-----------------+ + * | +-------------+ | + * +-------+ | | flow 0 | | + * |Packet | | +-------------+ | + * |event | | +-------------+ | + * | | | | flow 1 | |port_link(port0, queue0) + * +-------+ | +-------------+ | | +--------+ + * +-------+ | +-------------+ o-----v-----o |dequeue +------+ + * |Crypto | | | flow n | | | event +------->|Core 0| + * |work | | +-------------+ o----+ | port 0 | | | + * |done ev| | event queue 0 | | +--------+ +------+ + * +-------+ +-----------------+ | + * +-------+ | + * |Timer | +-----------------+ | +--------+ + * |expiry | | +-------------+ | +------o |dequeue +------+ + * |event | | | flow 0 | o-----------o event +------->|Core 1| + * +-------+ | +-------------+ | +----o port 1 | | | + * Event enqueue | +-------------+ | | +--------+ +------+ + * o-------------> | | flow 1 | | | + * enqueue( | +-------------+ | | + * queue_id, | | | +--------+ +------+ + * flow_id, | +-------------+ | | | |dequeue |Core 2| + * sched_type, | | flow n | o-----------o event +------->| | + * event_type, | +-------------+ | | | port 2 | +------+ + * subev_type, | event queue 1 | | +--------+ + * event) +-----------------+ | +--------+ + * | | |dequeue +------+ + * +-------+ +-----------------+ | | event +------->|Core n| + * |Core | | +-------------+ o-----------o port n | | | + * |(SW) | | | flow 0 | | | +--------+ +--+---+ + * |event | | +-------------+ | | | + * +-------+ | +-------------+ | | | + * ^ | | flow 1 | | | | + * | | +-------------+ o------+ | + * | | +-------------+ | | + * | | | flow n | | | + * | | +-------------+ | | + * | | event queue n | | + * | +-----------------+ | + * | | + * +-----------------------------------------------------------+ + * + * Event device: A hardware or software-based event scheduler. + * + * Event: A unit of scheduling that encapsulates a packet or other datatype + * like SW generated event from the CPU, Crypto work completion notification, + * Timer expiry event notification etc as well as metadata. + * The metadata includes flow ID, scheduling type, event priority, event_type, + * sub_event_type etc. + * + * Event queue: A queue containing events that are scheduled by the event dev. + * An event queue contains events of different flows associated with scheduling + * types, such as atomic, ordered, or parallel. + * + * Event port: An application's interface into the event dev for enqueue and + * dequeue operations. Each event port can be linked with one or more + * event queues for dequeue operations. + * + * By default, all the functions of the Event Device API exported by a PMD + * are lock-free functions which assume to not be invoked in parallel on + * different logical cores to work on the same target object. For instance, + * the dequeue function of a PMD cannot be invoked in parallel on two logical + * cores to operates on same event port. Of course, this function + * can be invoked in parallel by different logical cores on different ports. + * It is the responsibility of the upper level application to enforce this rule. + * + * In all functions of the Event API, the Event device is + * designated by an integer >= 0 named the device identifier *dev_id* + * + * At the Event driver level, Event devices are represented by a generic + * data structure of type *rte_event_dev*. + * + * Event devices are dynamically registered during the PCI/SoC device probing + * phase performed at EAL initialization time. 
+ * When an Event device is being probed, a *rte_event_dev* structure and + * a new device identifier are allocated for that device. Then, the + * event_dev_init() function supplied by the Event driver matching the probed + * device is invoked to properly initialize the device. + * + * The role of the device init function consists of resetting the hardware or + * software event driver implementations. + * + * If the device init operation is successful, the correspondence between + * the device identifier assigned to the new device and its associated + * *rte_event_dev* structure is effectively registered. + * Otherwise, both the *rte_event_dev* structure and the device identifier are + * freed. + * + * The functions exported by the application Event API to setup a device + * designated by its device identifier must be invoked in the following order: + * - rte_event_dev_configure() + * - rte_event_queue_setup() + * - rte_event_port_setup() + * - rte_event_port_link() + * - rte_event_dev_start() + * + * Then, the application can invoke, in any order, the functions + * exported by the Event API to schedule events, dequeue events, enqueue events, + * change event queue(s) to event port [un]link establishment and so on. + * + * Application may use rte_event_[queue/port]_default_conf_get() to get the + * default configuration to set up an event queue or event port by + * overriding few default values. + * + * If the application wants to change the configuration (i.e. call + * rte_event_dev_configure(), rte_event_queue_setup(), or + * rte_event_port_setup()), it must call rte_event_dev_stop() first to stop the + * device and then do the reconfiguration before calling rte_event_dev_start() + * again. The schedule, enqueue and dequeue functions should not be invoked + * when the device is stopped. + * + * Finally, an application can close an Event device by invoking the + * rte_event_dev_close() function. + * + * Each function of the application Event API invokes a specific function + * of the PMD that controls the target device designated by its device + * identifier. + * + * For this purpose, all device-specific functions of an Event driver are + * supplied through a set of pointers contained in a generic structure of type + * *event_dev_ops*. + * The address of the *event_dev_ops* structure is stored in the *rte_event_dev* + * structure by the device init function of the Event driver, which is + * invoked during the PCI/SoC device probing phase, as explained earlier. + * + * In other words, each function of the Event API simply retrieves the + * *rte_event_dev* structure associated with the device identifier and + * performs an indirect invocation of the corresponding driver function + * supplied in the *event_dev_ops* structure of the *rte_event_dev* structure. + * + * For performance reasons, the address of the fast-path functions of the + * Event driver is not contained in the *event_dev_ops* structure. + * Instead, they are directly stored at the beginning of the *rte_event_dev* + * structure to avoid an extra indirect memory access during their invocation. + * + * RTE event device drivers do not use interrupts for enqueue or dequeue + * operation. Instead, Event drivers export Poll-Mode enqueue and dequeue + * functions to applications. + * + * The events are injected to event device through *enqueue* operation by + * event producers in the system. 
The typical event producers are ethdev + * subsystem for generating packet events, CPU(SW) for generating events based + * on different stages of application processing, cryptodev for generating + * crypto work completion notification etc + * + * The *dequeue* operation gets one or more events from the event ports. + * The application process the events and send to downstream event queue through + * rte_event_enqueue_burst() if it is an intermediate stage of event processing, + * on the final stage, the application may send to different subsystem like + * ethdev to send the packet/event on the wire using ethdev + * rte_eth_tx_burst() API. + * + * The point at which events are scheduled to ports depends on the device. + * For hardware devices, scheduling occurs asynchronously without any software + * intervention. Software schedulers can either be distributed + * (each worker thread schedules events to its own port) or centralized + * (a dedicated thread schedules to all ports). Distributed software schedulers + * perform the scheduling in rte_event_dequeue_burst(), whereas centralized + * scheduler logic need a dedicated service core for scheduling. + * The RTE_EVENT_DEV_CAP_DISTRIBUTED_SCHED capability flag is not set + * indicates the device is centralized and thus needs a dedicated scheduling + * thread that repeatedly calls software specific scheduling function. + * + * An event driven worker thread has following typical workflow on fastpath: + * \code{.c} + * while (1) { + * rte_event_dequeue_burst(...); + * (event processing) + * rte_event_enqueue_burst(...); + * } + * \endcode + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +struct rte_mbuf; /* we just use mbuf pointers; no need to include rte_mbuf.h */ +struct rte_event; + +/* Event device capability bitmap flags */ +#define RTE_EVENT_DEV_CAP_QUEUE_QOS (1ULL << 0) +/**< Event scheduling prioritization is based on the priority associated with + * each event queue. + * + * @see rte_event_queue_setup() + */ +#define RTE_EVENT_DEV_CAP_EVENT_QOS (1ULL << 1) +/**< Event scheduling prioritization is based on the priority associated with + * each event. Priority of each event is supplied in *rte_event* structure + * on each enqueue operation. + * + * @see rte_event_enqueue_burst() + */ +#define RTE_EVENT_DEV_CAP_DISTRIBUTED_SCHED (1ULL << 2) +/**< Event device operates in distributed scheduling mode. + * In distributed scheduling mode, event scheduling happens in HW or + * rte_event_dequeue_burst() or the combination of these two. + * If the flag is not set then eventdev is centralized and thus needs a + * dedicated service core that acts as a scheduling thread . + * + * @see rte_event_dequeue_burst() + */ +#define RTE_EVENT_DEV_CAP_QUEUE_ALL_TYPES (1ULL << 3) +/**< Event device is capable of enqueuing events of any type to any queue. + * If this capability is not set, the queue only supports events of the + * *RTE_SCHED_TYPE_* type that it was created with. + * + * @see RTE_SCHED_TYPE_* values + */ +#define RTE_EVENT_DEV_CAP_BURST_MODE (1ULL << 4) +/**< Event device is capable of operating in burst mode for enqueue(forward, + * release) and dequeue operation. If this capability is not set, application + * still uses the rte_event_dequeue_burst() and rte_event_enqueue_burst() but + * PMD accepts only one event at a time. 
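+ *
+ * A short, illustrative way for an application to size its enqueue/dequeue
+ * burst buffers from this flag (dev_id is assumed to identify an already
+ * initialised device) is sketched here:
+ *
+ * \code{.c}
+ *	struct rte_event_dev_info info;
+ *	uint32_t burst = 1;
+ *
+ *	rte_event_dev_info_get(dev_id, &info);
+ *	if (info.event_dev_cap & RTE_EVENT_DEV_CAP_BURST_MODE)
+ *		burst = info.max_event_port_dequeue_depth;
+ * \endcode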
+ *
+ * @see rte_event_dequeue_burst() rte_event_enqueue_burst()
+ */
+#define RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE (1ULL << 5)
+/**< Event device ports support disabling the implicit release feature, in
+ * which the port will release all unreleased events in its dequeue operation.
+ * If this capability is set and the port is configured with implicit release
+ * disabled, the application is responsible for explicitly releasing events
+ * using either the RTE_EVENT_OP_FORWARD or the RTE_EVENT_OP_RELEASE event
+ * enqueue operations.
+ *
+ * @see rte_event_dequeue_burst() rte_event_enqueue_burst()
+ */
+
+#define RTE_EVENT_DEV_CAP_NONSEQ_MODE (1ULL << 6)
+/**< Event device is capable of operating in non-sequential mode. The path
+ * of the event need not be sequential; the application can change the path
+ * of an event at runtime. If the flag is not set, each event will follow a
+ * path from queue 0 to queue 1 to queue 2, etc. If the flag is set, events
+ * may be sent to queues in any order. If the flag is not set, the eventdev
+ * will return an error when the application enqueues an event for a qid
+ * which is not the next in the sequence.
+ */
+
+#define RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK (1ULL << 7)
+/**< Event device is capable of configuring the queue/port link at runtime.
+ * If the flag is not set, the eventdev queue/port link can only be
+ * configured during initialization.
+ */
+
+#define RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT (1ULL << 8)
+/**< Event device is capable of setting up links between multiple queues and
+ * a single port. If the flag is not set, the eventdev can only map a
+ * single queue to each port, or map a single queue to many ports.
+ */
+
+/* Event device priority levels */
+#define RTE_EVENT_DEV_PRIORITY_HIGHEST 0
+/**< Highest priority expressed across the eventdev subsystem
+ * @see rte_event_queue_setup(), rte_event_enqueue_burst()
+ * @see rte_event_port_link()
+ */
+#define RTE_EVENT_DEV_PRIORITY_NORMAL 128
+/**< Normal priority expressed across the eventdev subsystem
+ * @see rte_event_queue_setup(), rte_event_enqueue_burst()
+ * @see rte_event_port_link()
+ */
+#define RTE_EVENT_DEV_PRIORITY_LOWEST 255
+/**< Lowest priority expressed across the eventdev subsystem
+ * @see rte_event_queue_setup(), rte_event_enqueue_burst()
+ * @see rte_event_port_link()
+ */
+
+/**
+ * Get the total number of event devices that have been successfully
+ * initialised.
+ *
+ * @return
+ * The total number of usable event devices.
+ */
+uint8_t
+rte_event_dev_count(void);
+
+/**
+ * Get the device identifier for the named event device.
+ *
+ * @param name
+ * Event device name to select the event device identifier.
+ *
+ * @return
+ * Returns the event device identifier on success.
+ * - <0: Failure to find the named event device.
+ */
+int
+rte_event_dev_get_dev_id(const char *name);
+
+/**
+ * Return the NUMA socket to which a device is connected.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @return
+ * The NUMA socket id to which the device is connected, or
+ * a default of zero if the socket could not be determined.
+ * - (-EINVAL): dev_id value is out of range.
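+ *
+ * A common use is NUMA-local allocation of per-device application state, as
+ * in this sketch (struct app_ctx is a hypothetical application type and
+ * rte_zmalloc_socket() comes from rte_malloc.h):
+ * \code{.c}
+ *     int socket = rte_event_dev_socket_id(dev_id);
+ *     struct app_ctx *ctx = rte_zmalloc_socket("app_ctx", sizeof(*ctx),
+ *                                              RTE_CACHE_LINE_SIZE, socket);
+ * \endcode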
+ */ +int +rte_event_dev_socket_id(uint8_t dev_id); + +/** + * Event device information + */ +struct rte_event_dev_info { + const char *driver_name; /**< Event driver name */ + struct rte_device *dev; /**< Device information */ + uint32_t min_dequeue_timeout_ns; + /**< Minimum supported global dequeue timeout(ns) by this device */ + uint32_t max_dequeue_timeout_ns; + /**< Maximum supported global dequeue timeout(ns) by this device */ + uint32_t dequeue_timeout_ns; + /**< Configured global dequeue timeout(ns) for this device */ + uint8_t max_event_queues; + /**< Maximum event_queues supported by this device */ + uint32_t max_event_queue_flows; + /**< Maximum supported flows in an event queue by this device*/ + uint8_t max_event_queue_priority_levels; + /**< Maximum number of event queue priority levels by this device. + * Valid when the device has RTE_EVENT_DEV_CAP_QUEUE_QOS capability + */ + uint8_t max_event_priority_levels; + /**< Maximum number of event priority levels by this device. + * Valid when the device has RTE_EVENT_DEV_CAP_EVENT_QOS capability + */ + uint8_t max_event_ports; + /**< Maximum number of event ports supported by this device */ + uint8_t max_event_port_dequeue_depth; + /**< Maximum number of events can be dequeued at a time from an + * event port by this device. + * A device that does not support bulk dequeue will set this as 1. + */ + uint32_t max_event_port_enqueue_depth; + /**< Maximum number of events can be enqueued at a time from an + * event port by this device. + * A device that does not support bulk enqueue will set this as 1. + */ + int32_t max_num_events; + /**< A *closed system* event dev has a limit on the number of events it + * can manage at a time. An *open system* event dev does not have a + * limit and will specify this as -1. + */ + uint32_t event_dev_cap; + /**< Event device capabilities(RTE_EVENT_DEV_CAP_)*/ +}; + +/** + * Retrieve the contextual information of an event device. + * + * @param dev_id + * The identifier of the device. + * + * @param[out] dev_info + * A pointer to a structure of type *rte_event_dev_info* to be filled with the + * contextual information of the device. + * + * @return + * - 0: Success, driver updates the contextual information of the event device + * - <0: Error code returned by the driver info get function. + * + */ +int +rte_event_dev_info_get(uint8_t dev_id, struct rte_event_dev_info *dev_info); + +/** + * The count of ports. + */ +#define RTE_EVENT_DEV_ATTR_PORT_COUNT 0 +/** + * The count of queues. + */ +#define RTE_EVENT_DEV_ATTR_QUEUE_COUNT 1 +/** + * The status of the device, zero for stopped, non-zero for started. + */ +#define RTE_EVENT_DEV_ATTR_STARTED 2 + +/** + * Get an attribute from a device. + * + * @param dev_id Eventdev id + * @param attr_id The attribute ID to retrieve + * @param[out] attr_value A pointer that will be filled in with the attribute + * value if successful. + * + * @return + * - 0: Successfully retrieved attribute value + * - -EINVAL: Invalid device or *attr_id* provided, or *attr_value* is NULL + */ +int +rte_event_dev_attr_get(uint8_t dev_id, uint32_t attr_id, + uint32_t *attr_value); + + +/* Event device configuration bitmap flags */ +#define RTE_EVENT_DEV_CFG_PER_DEQUEUE_TIMEOUT (1ULL << 0) +/**< Override the global *dequeue_timeout_ns* and use per dequeue timeout in ns. 
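+ * When this flag is set in rte_event_dev_config::event_dev_cfg, a per-call
+ * timeout can be derived and used as sketched below (dev_id, port_id and the
+ * evs[] event array are placeholders; error handling is omitted):
+ * \code{.c}
+ *     uint64_t ticks;
+ *     uint16_t n;
+ *
+ *     (void)rte_event_dequeue_timeout_ticks(dev_id, 100000, &ticks);
+ *     n = rte_event_dequeue_burst(dev_id, port_id, evs, RTE_DIM(evs), ticks);
+ * \endcode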
+ * @see rte_event_dequeue_timeout_ticks(), rte_event_dequeue_burst() + */ + +/** Event device configuration structure */ +struct rte_event_dev_config { + uint32_t dequeue_timeout_ns; + /**< rte_event_dequeue_burst() timeout on this device. + * This value should be in the range of *min_dequeue_timeout_ns* and + * *max_dequeue_timeout_ns* which previously provided in + * rte_event_dev_info_get() + * The value 0 is allowed, in which case, default dequeue timeout used. + * @see RTE_EVENT_DEV_CFG_PER_DEQUEUE_TIMEOUT + */ + int32_t nb_events_limit; + /**< In a *closed system* this field is the limit on maximum number of + * events that can be inflight in the eventdev at a given time. The + * limit is required to ensure that the finite space in a closed system + * is not overwhelmed. The value cannot exceed the *max_num_events* + * as provided by rte_event_dev_info_get(). + * This value should be set to -1 for *open system*. + */ + uint8_t nb_event_queues; + /**< Number of event queues to configure on this device. + * This value cannot exceed the *max_event_queues* which previously + * provided in rte_event_dev_info_get() + */ + uint8_t nb_event_ports; + /**< Number of event ports to configure on this device. + * This value cannot exceed the *max_event_ports* which previously + * provided in rte_event_dev_info_get() + */ + uint32_t nb_event_queue_flows; + /**< Number of flows for any event queue on this device. + * This value cannot exceed the *max_event_queue_flows* which previously + * provided in rte_event_dev_info_get() + */ + uint32_t nb_event_port_dequeue_depth; + /**< Maximum number of events can be dequeued at a time from an + * event port by this device. + * This value cannot exceed the *max_event_port_dequeue_depth* + * which previously provided in rte_event_dev_info_get(). + * Ignored when device is not RTE_EVENT_DEV_CAP_BURST_MODE capable. + * @see rte_event_port_setup() + */ + uint32_t nb_event_port_enqueue_depth; + /**< Maximum number of events can be enqueued at a time from an + * event port by this device. + * This value cannot exceed the *max_event_port_enqueue_depth* + * which previously provided in rte_event_dev_info_get(). + * Ignored when device is not RTE_EVENT_DEV_CAP_BURST_MODE capable. + * @see rte_event_port_setup() + */ + uint32_t event_dev_cfg; + /**< Event device config flags(RTE_EVENT_DEV_CFG_)*/ +}; + +/** + * Configure an event device. + * + * This function must be invoked first before any other function in the + * API. This function can also be re-invoked when a device is in the + * stopped state. + * + * The caller may use rte_event_dev_info_get() to get the capability of each + * resources available for this event device. + * + * @param dev_id + * The identifier of the device to configure. + * @param dev_conf + * The event device configuration structure. + * + * @return + * - 0: Success, device configured. + * - <0: Error code returned by the driver configuration function. + */ +int +rte_event_dev_configure(uint8_t dev_id, + const struct rte_event_dev_config *dev_conf); + + +/* Event queue specific APIs */ + +/* Event queue configuration bitmap flags */ +#define RTE_EVENT_QUEUE_CFG_ALL_TYPES (1ULL << 0) +/**< Allow ATOMIC,ORDERED,PARALLEL schedule type enqueue + * + * @see RTE_SCHED_TYPE_ORDERED, RTE_SCHED_TYPE_ATOMIC, RTE_SCHED_TYPE_PARALLEL + * @see rte_event_enqueue_burst() + */ +#define RTE_EVENT_QUEUE_CFG_SINGLE_LINK (1ULL << 1) +/**< This event queue links only to a single event port. 
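+ * For illustration, a queue dedicated to a single consumer port could be set
+ * up as below (dev_id, tx_queue_id and tx_port_id are placeholder variables;
+ * error handling is omitted):
+ * \code{.c}
+ *     struct rte_event_queue_conf qconf;
+ *
+ *     (void)rte_event_queue_default_conf_get(dev_id, tx_queue_id, &qconf);
+ *     qconf.event_queue_cfg |= RTE_EVENT_QUEUE_CFG_SINGLE_LINK;
+ *     (void)rte_event_queue_setup(dev_id, tx_queue_id, &qconf);
+ *     (void)rte_event_port_link(dev_id, tx_port_id, &tx_queue_id, NULL, 1);
+ * \endcode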
+ * + * @see rte_event_port_setup(), rte_event_port_link() + */ + +/** Event queue configuration structure */ +struct rte_event_queue_conf { + uint32_t nb_atomic_flows; + /**< The maximum number of active flows this queue can track at any + * given time. If the queue is configured for atomic scheduling (by + * applying the RTE_EVENT_QUEUE_CFG_ALL_TYPES flag to event_queue_cfg + * or RTE_SCHED_TYPE_ATOMIC flag to schedule_type), then the + * value must be in the range of [1, nb_event_queue_flows], which was + * previously provided in rte_event_dev_configure(). + */ + uint32_t nb_atomic_order_sequences; + /**< The maximum number of outstanding events waiting to be + * reordered by this queue. In other words, the number of entries in + * this queue’s reorder buffer.When the number of events in the + * reorder buffer reaches to *nb_atomic_order_sequences* then the + * scheduler cannot schedule the events from this queue and invalid + * event will be returned from dequeue until one or more entries are + * freed up/released. + * If the queue is configured for ordered scheduling (by applying the + * RTE_EVENT_QUEUE_CFG_ALL_TYPES flag to event_queue_cfg or + * RTE_SCHED_TYPE_ORDERED flag to schedule_type), then the value must + * be in the range of [1, nb_event_queue_flows], which was + * previously supplied to rte_event_dev_configure(). + */ + uint32_t event_queue_cfg; + /**< Queue cfg flags(EVENT_QUEUE_CFG_) */ + uint8_t schedule_type; + /**< Queue schedule type(RTE_SCHED_TYPE_*). + * Valid when RTE_EVENT_QUEUE_CFG_ALL_TYPES bit is not set in + * event_queue_cfg. + */ + uint8_t priority; + /**< Priority for this event queue relative to other event queues. + * The requested priority should in the range of + * [RTE_EVENT_DEV_PRIORITY_HIGHEST, RTE_EVENT_DEV_PRIORITY_LOWEST]. + * The implementation shall normalize the requested priority to + * event device supported priority value. + * Valid when the device has RTE_EVENT_DEV_CAP_QUEUE_QOS capability + */ +}; + +/** + * Retrieve the default configuration information of an event queue designated + * by its *queue_id* from the event driver for an event device. + * + * This function intended to be used in conjunction with rte_event_queue_setup() + * where caller needs to set up the queue by overriding few default values. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the event queue to get the configuration information. + * The value must be in the range [0, nb_event_queues - 1] + * previously supplied to rte_event_dev_configure(). + * @param[out] queue_conf + * The pointer to the default event queue configuration data. + * @return + * - 0: Success, driver updates the default event queue configuration data. + * - <0: Error code returned by the driver info get function. + * + * @see rte_event_queue_setup() + * + */ +int +rte_event_queue_default_conf_get(uint8_t dev_id, uint8_t queue_id, + struct rte_event_queue_conf *queue_conf); + +/** + * Allocate and set up an event queue for an event device. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the event queue to setup. The value must be in the range + * [0, nb_event_queues - 1] previously supplied to rte_event_dev_configure(). + * @param queue_conf + * The pointer to the configuration data to be used for the event queue. + * NULL value is allowed, in which case default configuration used. + * + * @see rte_event_queue_default_conf_get() + * + * @return + * - 0: Success, event queue correctly set up. 
+ * - <0: event queue configuration failed + */ +int +rte_event_queue_setup(uint8_t dev_id, uint8_t queue_id, + const struct rte_event_queue_conf *queue_conf); + +/** + * The priority of the queue. + */ +#define RTE_EVENT_QUEUE_ATTR_PRIORITY 0 +/** + * The number of atomic flows configured for the queue. + */ +#define RTE_EVENT_QUEUE_ATTR_NB_ATOMIC_FLOWS 1 +/** + * The number of atomic order sequences configured for the queue. + */ +#define RTE_EVENT_QUEUE_ATTR_NB_ATOMIC_ORDER_SEQUENCES 2 +/** + * The cfg flags for the queue. + */ +#define RTE_EVENT_QUEUE_ATTR_EVENT_QUEUE_CFG 3 +/** + * The schedule type of the queue. + */ +#define RTE_EVENT_QUEUE_ATTR_SCHEDULE_TYPE 4 + +/** + * Get an attribute from a queue. + * + * @param dev_id + * Eventdev id + * @param queue_id + * Eventdev queue id + * @param attr_id + * The attribute ID to retrieve + * @param[out] attr_value + * A pointer that will be filled in with the attribute value if successful + * + * @return + * - 0: Successfully returned value + * - -EINVAL: invalid device, queue or attr_id provided, or attr_value was + * NULL + * - -EOVERFLOW: returned when attr_id is set to + * RTE_EVENT_QUEUE_ATTR_SCHEDULE_TYPE and event_queue_cfg is set to + * RTE_EVENT_QUEUE_CFG_ALL_TYPES + */ +int +rte_event_queue_attr_get(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id, + uint32_t *attr_value); + +/* Event port specific APIs */ + +/** Event port configuration structure */ +struct rte_event_port_conf { + int32_t new_event_threshold; + /**< A backpressure threshold for new event enqueues on this port. + * Use for *closed system* event dev where event capacity is limited, + * and cannot exceed the capacity of the event dev. + * Configuring ports with different thresholds can make higher priority + * traffic less likely to be backpressured. + * For example, a port used to inject NIC Rx packets into the event dev + * can have a lower threshold so as not to overwhelm the device, + * while ports used for worker pools can have a higher threshold. + * This value cannot exceed the *nb_events_limit* + * which was previously supplied to rte_event_dev_configure(). + * This should be set to '-1' for *open system*. + */ + uint16_t dequeue_depth; + /**< Configure number of bulk dequeues for this event port. + * This value cannot exceed the *nb_event_port_dequeue_depth* + * which previously supplied to rte_event_dev_configure(). + * Ignored when device is not RTE_EVENT_DEV_CAP_BURST_MODE capable. + */ + uint16_t enqueue_depth; + /**< Configure number of bulk enqueues for this event port. + * This value cannot exceed the *nb_event_port_enqueue_depth* + * which previously supplied to rte_event_dev_configure(). + * Ignored when device is not RTE_EVENT_DEV_CAP_BURST_MODE capable. + */ + uint8_t disable_implicit_release; + /**< Configure the port not to release outstanding events in + * rte_event_dev_dequeue_burst(). If true, all events received through + * the port must be explicitly released with RTE_EVENT_OP_RELEASE or + * RTE_EVENT_OP_FORWARD. Must be false when the device is not + * RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE capable. + */ +}; + +/** + * Retrieve the default configuration information of an event port designated + * by its *port_id* from the event driver for an event device. + * + * This function intended to be used in conjunction with rte_event_port_setup() + * where caller needs to set up the port by overriding few default values. + * + * @param dev_id + * The identifier of the device. 
+ * @param port_id + * The index of the event port to get the configuration information. + * The value must be in the range [0, nb_event_ports - 1] + * previously supplied to rte_event_dev_configure(). + * @param[out] port_conf + * The pointer to the default event port configuration data + * @return + * - 0: Success, driver updates the default event port configuration data. + * - <0: Error code returned by the driver info get function. + * + * @see rte_event_port_setup() + * + */ +int +rte_event_port_default_conf_get(uint8_t dev_id, uint8_t port_id, + struct rte_event_port_conf *port_conf); + +/** + * Allocate and set up an event port for an event device. + * + * @param dev_id + * The identifier of the device. + * @param port_id + * The index of the event port to setup. The value must be in the range + * [0, nb_event_ports - 1] previously supplied to rte_event_dev_configure(). + * @param port_conf + * The pointer to the configuration data to be used for the queue. + * NULL value is allowed, in which case default configuration used. + * + * @see rte_event_port_default_conf_get() + * + * @return + * - 0: Success, event port correctly set up. + * - <0: Port configuration failed + * - (-EDQUOT) Quota exceeded(Application tried to link the queue configured + * with RTE_EVENT_QUEUE_CFG_SINGLE_LINK to more than one event ports) + */ +int +rte_event_port_setup(uint8_t dev_id, uint8_t port_id, + const struct rte_event_port_conf *port_conf); + +/** + * The queue depth of the port on the enqueue side + */ +#define RTE_EVENT_PORT_ATTR_ENQ_DEPTH 0 +/** + * The queue depth of the port on the dequeue side + */ +#define RTE_EVENT_PORT_ATTR_DEQ_DEPTH 1 +/** + * The new event threshold of the port + */ +#define RTE_EVENT_PORT_ATTR_NEW_EVENT_THRESHOLD 2 + +/** + * Get an attribute from a port. + * + * @param dev_id + * Eventdev id + * @param port_id + * Eventdev port id + * @param attr_id + * The attribute ID to retrieve + * @param[out] attr_value + * A pointer that will be filled in with the attribute value if successful + * + * @return + * - 0: Successfully returned value + * - (-EINVAL) Invalid device, port or attr_id, or attr_value was NULL + */ +int +rte_event_port_attr_get(uint8_t dev_id, uint8_t port_id, uint32_t attr_id, + uint32_t *attr_value); + +/** + * Start an event device. + * + * The device start step is the last one and consists of setting the event + * queues to start accepting the events and schedules to event ports. + * + * On success, all basic functions exported by the API (event enqueue, + * event dequeue and so on) can be invoked. + * + * @param dev_id + * Event device identifier + * @return + * - 0: Success, device started. + * - -ESTALE : Not all ports of the device are configured + * - -ENOLINK: Not all queues are linked, which could lead to deadlock. + */ +int +rte_event_dev_start(uint8_t dev_id); + +/** + * Stop an event device. + * + * This function causes all queued events to be drained, including those + * residing in event ports. While draining events out of the device, this + * function calls the user-provided flush callback (if one was registered) once + * per event. + * + * The device can be restarted with a call to rte_event_dev_start(). Threads + * that continue to enqueue/dequeue while the device is stopped, or being + * stopped, will result in undefined behavior. This includes event adapters, + * which must be stopped prior to stopping the eventdev. + * + * @param dev_id + * Event device identifier. 
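+ *
+ * A runtime reconfiguration therefore follows a stop/reconfigure/start
+ * pattern, sketched here with a hypothetical, already populated
+ * new_port_conf:
+ * \code{.c}
+ *     rte_event_dev_stop(dev_id);
+ *     (void)rte_event_port_setup(dev_id, port_id, &new_port_conf);
+ *     (void)rte_event_dev_start(dev_id);
+ * \endcode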
+ * + * @see rte_event_dev_stop_flush_callback_register() + */ +void +rte_event_dev_stop(uint8_t dev_id); + +typedef void (*eventdev_stop_flush_t)(uint8_t dev_id, struct rte_event event, + void *arg); +/**< Callback function called during rte_event_dev_stop(), invoked once per + * flushed event. + */ + +/** + * Registers a callback function to be invoked during rte_event_dev_stop() for + * each flushed event. This function can be used to properly dispose of queued + * events, for example events containing memory pointers. + * + * The callback function is only registered for the calling process. The + * callback function must be registered in every process that can call + * rte_event_dev_stop(). + * + * To unregister a callback, call this function with a NULL callback pointer. + * + * @param dev_id + * The identifier of the device. + * @param callback + * Callback function invoked once per flushed event. + * @param userdata + * Argument supplied to callback. + * + * @return + * - 0 on success. + * - -EINVAL if *dev_id* is invalid + * + * @see rte_event_dev_stop() + */ +int +rte_event_dev_stop_flush_callback_register(uint8_t dev_id, + eventdev_stop_flush_t callback, void *userdata); + +/** + * Close an event device. The device cannot be restarted! + * + * @param dev_id + * Event device identifier + * + * @return + * - 0 on successfully closing device + * - <0 on failure to close device + * - (-EAGAIN) if device is busy + */ +int +rte_event_dev_close(uint8_t dev_id); + +/* Scheduler type definitions */ +#define RTE_SCHED_TYPE_ORDERED 0 +/**< Ordered scheduling + * + * Events from an ordered flow of an event queue can be scheduled to multiple + * ports for concurrent processing while maintaining the original event order. + * This scheme enables the user to achieve high single flow throughput by + * avoiding SW synchronization for ordering between ports which bound to cores. + * + * The source flow ordering from an event queue is maintained when events are + * enqueued to their destination queue within the same ordered flow context. + * An event port holds the context until application call + * rte_event_dequeue_burst() from the same port, which implicitly releases + * the context. + * User may allow the scheduler to release the context earlier than that + * by invoking rte_event_enqueue_burst() with RTE_EVENT_OP_RELEASE operation. + * + * Events from the source queue appear in their original order when dequeued + * from a destination queue. + * Event ordering is based on the received event(s), but also other + * (newly allocated or stored) events are ordered when enqueued within the same + * ordered context. Events not enqueued (e.g. released or stored) within the + * context are considered missing from reordering and are skipped at this time + * (but can be ordered again within another context). + * + * @see rte_event_queue_setup(), rte_event_dequeue_burst(), RTE_EVENT_OP_RELEASE + */ + +#define RTE_SCHED_TYPE_ATOMIC 1 +/**< Atomic scheduling + * + * Events from an atomic flow of an event queue can be scheduled only to a + * single port at a time. The port is guaranteed to have exclusive (atomic) + * access to the associated flow context, which enables the user to avoid SW + * synchronization. Atomic flows also help to maintain event ordering + * since only one port at a time can process events from a flow of an + * event queue. 
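+ *
+ * A worker handling an atomic flow can therefore update per-flow state
+ * without locks, as in this sketch (evs[], next_queue and update_flow_state()
+ * are hypothetical application constructs; error handling is omitted):
+ * \code{.c}
+ *     uint16_t i, n;
+ *
+ *     n = rte_event_dequeue_burst(dev_id, port_id, evs, RTE_DIM(evs), 0);
+ *     for (i = 0; i < n; i++) {
+ *             update_flow_state(evs[i].flow_id);
+ *             evs[i].op = RTE_EVENT_OP_FORWARD;
+ *             evs[i].queue_id = next_queue;
+ *     }
+ *     (void)rte_event_enqueue_burst(dev_id, port_id, evs, n);
+ * \endcode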
+ * + * The atomic queue synchronization context is dedicated to the port until + * application call rte_event_dequeue_burst() from the same port, + * which implicitly releases the context. User may allow the scheduler to + * release the context earlier than that by invoking rte_event_enqueue_burst() + * with RTE_EVENT_OP_RELEASE operation. + * + * @see rte_event_queue_setup(), rte_event_dequeue_burst(), RTE_EVENT_OP_RELEASE + */ + +#define RTE_SCHED_TYPE_PARALLEL 2 +/**< Parallel scheduling + * + * The scheduler performs priority scheduling, load balancing, etc. functions + * but does not provide additional event synchronization or ordering. + * It is free to schedule events from a single parallel flow of an event queue + * to multiple events ports for concurrent processing. + * The application is responsible for flow context synchronization and + * event ordering (SW synchronization). + * + * @see rte_event_queue_setup(), rte_event_dequeue_burst() + */ + +/* Event types to classify the event source */ +#define RTE_EVENT_TYPE_ETHDEV 0x0 +/**< The event generated from ethdev subsystem */ +#define RTE_EVENT_TYPE_CRYPTODEV 0x1 +/**< The event generated from crypodev subsystem */ +#define RTE_EVENT_TYPE_TIMER 0x2 +/**< The event generated from event timer adapter */ +#define RTE_EVENT_TYPE_CPU 0x3 +/**< The event generated from cpu for pipelining. + * Application may use *sub_event_type* to further classify the event + */ +#define RTE_EVENT_TYPE_ETH_RX_ADAPTER 0x4 +/**< The event generated from event eth Rx adapter */ +#define RTE_EVENT_TYPE_MAX 0x10 +/**< Maximum number of event types */ + +/* Event enqueue operations */ +#define RTE_EVENT_OP_NEW 0 +/**< The event producers use this operation to inject a new event to the + * event device. + */ +#define RTE_EVENT_OP_FORWARD 1 +/**< The CPU use this operation to forward the event to different event queue or + * change to new application specific flow or schedule type to enable + * pipelining. + * + * This operation must only be enqueued to the same port that the + * event to be forwarded was dequeued from. + */ +#define RTE_EVENT_OP_RELEASE 2 +/**< Release the flow context associated with the schedule type. + * + * If current flow's scheduler type method is *RTE_SCHED_TYPE_ATOMIC* + * then this function hints the scheduler that the user has completed critical + * section processing in the current atomic context. + * The scheduler is now allowed to schedule events from the same flow from + * an event queue to another port. However, the context may be still held + * until the next rte_event_dequeue_burst() call, this call allows but does not + * force the scheduler to release the context early. + * + * Early atomic context release may increase parallelism and thus system + * performance, but the user needs to design carefully the split into critical + * vs non-critical sections. + * + * If current flow's scheduler type method is *RTE_SCHED_TYPE_ORDERED* + * then this function hints the scheduler that the user has done all that need + * to maintain event order in the current ordered context. + * The scheduler is allowed to release the ordered context of this port and + * avoid reordering any following enqueues. + * + * Early ordered context release may increase parallelism and thus system + * performance. + * + * If current flow's scheduler type method is *RTE_SCHED_TYPE_PARALLEL* + * or no scheduling context is held then this function may be an NOOP, + * depending on the implementation. 
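+ *
+ * An explicit early release can be requested as in the sketch below, where
+ * dev_id and port_id refer to the port the event was dequeued from:
+ * \code{.c}
+ *     struct rte_event rel;
+ *
+ *     rel.event = 0;
+ *     rel.op = RTE_EVENT_OP_RELEASE;
+ *     (void)rte_event_enqueue_burst(dev_id, port_id, &rel, 1);
+ * \endcode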
+ * + * This operation must only be enqueued to the same port that the + * event to be released was dequeued from. + * + */ + +/** + * The generic *rte_event* structure to hold the event attributes + * for dequeue and enqueue operation + */ +RTE_STD_C11 +struct rte_event { + /** WORD0 */ + union { + uint64_t event; + /** Event attributes for dequeue or enqueue operation */ + struct { + uint32_t flow_id:20; + /**< Targeted flow identifier for the enqueue and + * dequeue operation. + * The value must be in the range of + * [0, nb_event_queue_flows - 1] which + * previously supplied to rte_event_dev_configure(). + */ + uint32_t sub_event_type:8; + /**< Sub-event types based on the event source. + * @see RTE_EVENT_TYPE_CPU + */ + uint32_t event_type:4; + /**< Event type to classify the event source. + * @see RTE_EVENT_TYPE_ETHDEV, (RTE_EVENT_TYPE_*) + */ + uint8_t op:2; + /**< The type of event enqueue operation - new/forward/ + * etc.This field is not preserved across an instance + * and is undefined on dequeue. + * @see RTE_EVENT_OP_NEW, (RTE_EVENT_OP_*) + */ + uint8_t rsvd:4; + /**< Reserved for future use */ + uint8_t sched_type:2; + /**< Scheduler synchronization type (RTE_SCHED_TYPE_*) + * associated with flow id on a given event queue + * for the enqueue and dequeue operation. + */ + uint8_t queue_id; + /**< Targeted event queue identifier for the enqueue or + * dequeue operation. + * The value must be in the range of + * [0, nb_event_queues - 1] which previously supplied to + * rte_event_dev_configure(). + */ + uint8_t priority; + /**< Event priority relative to other events in the + * event queue. The requested priority should in the + * range of [RTE_EVENT_DEV_PRIORITY_HIGHEST, + * RTE_EVENT_DEV_PRIORITY_LOWEST]. + * The implementation shall normalize the requested + * priority to supported priority value. + * Valid when the device has + * RTE_EVENT_DEV_CAP_EVENT_QOS capability. + */ + uint8_t impl_opaque; + /**< Implementation specific opaque value. + * An implementation may use this field to hold + * implementation specific value to share between + * dequeue and enqueue operation. + * The application should not modify this field. + */ + }; + }; + /** WORD1 */ + union { + uint64_t u64; + /**< Opaque 64-bit value */ + void *event_ptr; + /**< Opaque event pointer */ + struct rte_mbuf *mbuf; + /**< mbuf pointer if dequeued event is associated with mbuf */ + }; +}; + +/* Ethdev Rx adapter capability bitmap flags */ +#define RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT 0x1 +/**< This flag is sent when the packet transfer mechanism is in HW. + * Ethdev can send packets to the event device using internal event port. + */ +#define RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ 0x2 +/**< Adapter supports multiple event queues per ethdev. Every ethdev + * Rx queue can be connected to a unique event queue. + */ +#define RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID 0x4 +/**< The application can override the adapter generated flow ID in the + * event. This flow ID can be specified when adding an ethdev Rx queue + * to the adapter using the ev member of struct rte_event_eth_rx_adapter + * @see struct rte_event_eth_rx_adapter_queue_conf::ev + * @see struct rte_event_eth_rx_adapter_queue_conf::rx_queue_flags + */ + +/** + * Retrieve the event device's ethdev Rx adapter capabilities for the + * specified ethernet port + * + * @param dev_id + * The identifier of the device. + * + * @param eth_port_id + * The identifier of the ethernet device. 
+ * + * @param[out] caps + * A pointer to memory filled with Rx event adapter capabilities. + * + * @return + * - 0: Success, driver provides Rx event adapter capabilities for the + * ethernet device. + * - <0: Error code returned by the driver function. + * + */ +int +rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id, + uint32_t *caps); + +#define RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT (1ULL << 0) +/**< This flag is set when the timer mechanism is in HW. */ + +/** + * Retrieve the event device's timer adapter capabilities. + * + * @param dev_id + * The identifier of the device. + * + * @param[out] caps + * A pointer to memory to be filled with event timer adapter capabilities. + * + * @return + * - 0: Success, driver provided event timer adapter capabilities. + * - <0: Error code returned by the driver function. + */ +int __rte_experimental +rte_event_timer_adapter_caps_get(uint8_t dev_id, uint32_t *caps); + +/* Crypto adapter capability bitmap flag */ +#define RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_NEW 0x1 +/**< Flag indicates HW is capable of generating events in + * RTE_EVENT_OP_NEW enqueue operation. Cryptodev will send + * packets to the event device as new events using an internal + * event port. + */ + +#define RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_OP_FWD 0x2 +/**< Flag indicates HW is capable of generating events in + * RTE_EVENT_OP_FORWARD enqueue operation. Cryptodev will send + * packets to the event device as forwarded event using an + * internal event port. + */ + +#define RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND 0x4 +/**< Flag indicates HW is capable of mapping crypto queue pair to + * event queue. + */ + +#define RTE_EVENT_CRYPTO_ADAPTER_CAP_SESSION_PRIVATE_DATA 0x8 +/**< Flag indicates HW/SW suports a mechanism to store and retrieve + * the private data information along with the crypto session. + */ + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Retrieve the event device's crypto adapter capabilities for the + * specified cryptodev device + * + * @param dev_id + * The identifier of the device. + * + * @param cdev_id + * The identifier of the cryptodev device. + * + * @param[out] caps + * A pointer to memory filled with event adapter capabilities. + * It is expected to be pre-allocated & initialized by caller. + * + * @return + * - 0: Success, driver provides event adapter capabilities for the + * cryptodev device. + * - <0: Error code returned by the driver function. + * + */ +int __rte_experimental +rte_event_crypto_adapter_caps_get(uint8_t dev_id, uint8_t cdev_id, + uint32_t *caps); + +struct rte_eventdev_ops; +struct rte_eventdev; + +typedef uint16_t (*event_enqueue_t)(void *port, const struct rte_event *ev); +/**< @internal Enqueue event on port of a device */ + +typedef uint16_t (*event_enqueue_burst_t)(void *port, + const struct rte_event ev[], uint16_t nb_events); +/**< @internal Enqueue burst of events on port of a device */ + +typedef uint16_t (*event_dequeue_t)(void *port, struct rte_event *ev, + uint64_t timeout_ticks); +/**< @internal Dequeue event from port of a device */ + +typedef uint16_t (*event_dequeue_burst_t)(void *port, struct rte_event ev[], + uint16_t nb_events, uint64_t timeout_ticks); +/**< @internal Dequeue burst of events from port of a device */ + +#define RTE_EVENTDEV_NAME_MAX_LEN (64) +/**< @internal Max length of name of event PMD */ + +/** + * @internal + * The data part, with no function pointers, associated with each device. 
+ * + * This structure is safe to place in shared memory to be common among + * different processes in a multi-process configuration. + */ +struct rte_eventdev_data { + int socket_id; + /**< Socket ID where memory is allocated */ + uint8_t dev_id; + /**< Device ID for this instance */ + uint8_t nb_queues; + /**< Number of event queues. */ + uint8_t nb_ports; + /**< Number of event ports. */ + void **ports; + /**< Array of pointers to ports. */ + struct rte_event_port_conf *ports_cfg; + /**< Array of port configuration structures. */ + struct rte_event_queue_conf *queues_cfg; + /**< Array of queue configuration structures. */ + uint16_t *links_map; + /**< Memory to store queues to port connections. */ + void *dev_private; + /**< PMD-specific private data */ + uint32_t event_dev_cap; + /**< Event device capabilities(RTE_EVENT_DEV_CAP_)*/ + struct rte_event_dev_config dev_conf; + /**< Configuration applied to device. */ + uint8_t service_inited; + /* Service initialization state */ + uint32_t service_id; + /* Service ID*/ + void *dev_stop_flush_arg; + /**< User-provided argument for event flush function */ + + RTE_STD_C11 + uint8_t dev_started : 1; + /**< Device state: STARTED(1)/STOPPED(0) */ + + char name[RTE_EVENTDEV_NAME_MAX_LEN]; + /**< Unique identifier name */ +} __rte_cache_aligned; + +/** @internal The data structure associated with each event device. */ +struct rte_eventdev { + event_enqueue_t enqueue; + /**< Pointer to PMD enqueue function. */ + event_enqueue_burst_t enqueue_burst; + /**< Pointer to PMD enqueue burst function. */ + event_enqueue_burst_t enqueue_new_burst; + /**< Pointer to PMD enqueue burst function(op new variant) */ + event_enqueue_burst_t enqueue_forward_burst; + /**< Pointer to PMD enqueue burst function(op forward variant) */ + event_dequeue_t dequeue; + /**< Pointer to PMD dequeue function. */ + event_dequeue_burst_t dequeue_burst; + /**< Pointer to PMD dequeue burst function. */ + + struct rte_eventdev_data *data; + /**< Pointer to device data */ + struct rte_eventdev_ops *dev_ops; + /**< Functions exported by PMD */ + struct rte_device *dev; + /**< Device info. supplied by probing */ + + RTE_STD_C11 + uint8_t attached : 1; + /**< Flag indicating the device is attached */ +} __rte_cache_aligned; + +extern struct rte_eventdev *rte_eventdevs; +/** @internal The pool of rte_eventdev structures. */ + +static __rte_always_inline uint16_t +__rte_event_enqueue_burst(uint8_t dev_id, uint8_t port_id, + const struct rte_event ev[], uint16_t nb_events, + const event_enqueue_burst_t fn) +{ + const struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG + if (dev_id >= RTE_EVENT_MAX_DEVS || !rte_eventdevs[dev_id].attached) { + rte_errno = -EINVAL; + return 0; + } + + if (port_id >= dev->data->nb_ports) { + rte_errno = -EINVAL; + return 0; + } +#endif + /* + * Allow zero cost non burst mode routine invocation if application + * requests nb_events as const one + */ + if (nb_events == 1) + return (*dev->enqueue)(dev->data->ports[port_id], ev); + else + return fn(dev->data->ports[port_id], ev, nb_events); +} + +/** + * Enqueue a burst of events objects or an event object supplied in *rte_event* + * structure on an event device designated by its *dev_id* through the event + * port specified by *port_id*. Each event object specifies the event queue on + * which it will be enqueued. + * + * The *nb_events* parameter is the number of event objects to enqueue which are + * supplied in the *ev* array of *rte_event* structure. 
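+ *
+ * Because fewer than *nb_events* may be accepted, a caller typically retries
+ * the remainder, e.g. as in this sketch (evs[] and nb are placeholders; the
+ * negative rte_errno convention of this header is assumed):
+ * \code{.c}
+ *     uint16_t sent = 0;
+ *
+ *     while (sent < nb) {
+ *             uint16_t n = rte_event_enqueue_burst(dev_id, port_id,
+ *                             evs + sent, nb - sent);
+ *             if (n == 0 && rte_errno != -ENOSPC)
+ *                     break;
+ *             sent += n;
+ *     }
+ * \endcode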
+ * + * Event operations RTE_EVENT_OP_FORWARD and RTE_EVENT_OP_RELEASE must only be + * enqueued to the same port that their associated events were dequeued from. + * + * The rte_event_enqueue_burst() function returns the number of + * events objects it actually enqueued. A return value equal to *nb_events* + * means that all event objects have been enqueued. + * + * @param dev_id + * The identifier of the device. + * @param port_id + * The identifier of the event port. + * @param ev + * Points to an array of *nb_events* objects of type *rte_event* structure + * which contain the event object enqueue operations to be processed. + * @param nb_events + * The number of event objects to enqueue, typically number of + * rte_event_port_enqueue_depth() available for this port. + * + * @return + * The number of event objects actually enqueued on the event device. The + * return value can be less than the value of the *nb_events* parameter when + * the event devices queue is full or if invalid parameters are specified in a + * *rte_event*. If the return value is less than *nb_events*, the remaining + * events at the end of ev[] are not consumed and the caller has to take care + * of them, and rte_errno is set accordingly. Possible errno values include: + * - -EINVAL The port ID is invalid, device ID is invalid, an event's queue + * ID is invalid, or an event's sched type doesn't match the + * capabilities of the destination queue. + * - -ENOSPC The event port was backpressured and unable to enqueue + * one or more events. This error code is only applicable to + * closed systems. + * @see rte_event_port_enqueue_depth() + */ +static inline uint16_t +rte_event_enqueue_burst(uint8_t dev_id, uint8_t port_id, + const struct rte_event ev[], uint16_t nb_events) +{ + const struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + + return __rte_event_enqueue_burst(dev_id, port_id, ev, nb_events, + dev->enqueue_burst); +} + +/** + * Enqueue a burst of events objects of operation type *RTE_EVENT_OP_NEW* on + * an event device designated by its *dev_id* through the event port specified + * by *port_id*. + * + * Provides the same functionality as rte_event_enqueue_burst(), expect that + * application can use this API when the all objects in the burst contains + * the enqueue operation of the type *RTE_EVENT_OP_NEW*. This specialized + * function can provide the additional hint to the PMD and optimize if possible. + * + * The rte_event_enqueue_new_burst() result is undefined if the enqueue burst + * has event object of operation type != RTE_EVENT_OP_NEW. + * + * @param dev_id + * The identifier of the device. + * @param port_id + * The identifier of the event port. + * @param ev + * Points to an array of *nb_events* objects of type *rte_event* structure + * which contain the event object enqueue operations to be processed. + * @param nb_events + * The number of event objects to enqueue, typically number of + * rte_event_port_enqueue_depth() available for this port. + * + * @return + * The number of event objects actually enqueued on the event device. The + * return value can be less than the value of the *nb_events* parameter when + * the event devices queue is full or if invalid parameters are specified in a + * *rte_event*. If the return value is less than *nb_events*, the remaining + * events at the end of ev[] are not consumed and the caller has to take care + * of them, and rte_errno is set accordingly. 
Possible errno values include: + * - -EINVAL The port ID is invalid, device ID is invalid, an event's queue + * ID is invalid, or an event's sched type doesn't match the + * capabilities of the destination queue. + * - -ENOSPC The event port was backpressured and unable to enqueue + * one or more events. This error code is only applicable to + * closed systems. + * @see rte_event_port_enqueue_depth() rte_event_enqueue_burst() + */ +static inline uint16_t +rte_event_enqueue_new_burst(uint8_t dev_id, uint8_t port_id, + const struct rte_event ev[], uint16_t nb_events) +{ + const struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + + return __rte_event_enqueue_burst(dev_id, port_id, ev, nb_events, + dev->enqueue_new_burst); +} + +/** + * Enqueue a burst of events objects of operation type *RTE_EVENT_OP_FORWARD* + * on an event device designated by its *dev_id* through the event port + * specified by *port_id*. + * + * Provides the same functionality as rte_event_enqueue_burst(), expect that + * application can use this API when the all objects in the burst contains + * the enqueue operation of the type *RTE_EVENT_OP_FORWARD*. This specialized + * function can provide the additional hint to the PMD and optimize if possible. + * + * The rte_event_enqueue_new_burst() result is undefined if the enqueue burst + * has event object of operation type != RTE_EVENT_OP_FORWARD. + * + * @param dev_id + * The identifier of the device. + * @param port_id + * The identifier of the event port. + * @param ev + * Points to an array of *nb_events* objects of type *rte_event* structure + * which contain the event object enqueue operations to be processed. + * @param nb_events + * The number of event objects to enqueue, typically number of + * rte_event_port_enqueue_depth() available for this port. + * + * @return + * The number of event objects actually enqueued on the event device. The + * return value can be less than the value of the *nb_events* parameter when + * the event devices queue is full or if invalid parameters are specified in a + * *rte_event*. If the return value is less than *nb_events*, the remaining + * events at the end of ev[] are not consumed and the caller has to take care + * of them, and rte_errno is set accordingly. Possible errno values include: + * - -EINVAL The port ID is invalid, device ID is invalid, an event's queue + * ID is invalid, or an event's sched type doesn't match the + * capabilities of the destination queue. + * - -ENOSPC The event port was backpressured and unable to enqueue + * one or more events. This error code is only applicable to + * closed systems. + * @see rte_event_port_enqueue_depth() rte_event_enqueue_burst() + */ +static inline uint16_t +rte_event_enqueue_forward_burst(uint8_t dev_id, uint8_t port_id, + const struct rte_event ev[], uint16_t nb_events) +{ + const struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + + return __rte_event_enqueue_burst(dev_id, port_id, ev, nb_events, + dev->enqueue_forward_burst); +} + +/** + * Converts nanoseconds to *timeout_ticks* value for rte_event_dequeue_burst() + * + * If the device is configured with RTE_EVENT_DEV_CFG_PER_DEQUEUE_TIMEOUT flag + * then application can use this function to convert timeout value in + * nanoseconds to implementations specific timeout value supplied in + * rte_event_dequeue_burst() + * + * @param dev_id + * The identifier of the device. 
+ * @param ns + * Wait time in nanosecond + * @param[out] timeout_ticks + * Value for the *timeout_ticks* parameter in rte_event_dequeue_burst() + * + * @return + * - 0 on success. + * - -ENOTSUP if the device doesn't support timeouts + * - -EINVAL if *dev_id* is invalid or *timeout_ticks* is NULL + * - other values < 0 on failure. + * + * @see rte_event_dequeue_burst(), RTE_EVENT_DEV_CFG_PER_DEQUEUE_TIMEOUT + * @see rte_event_dev_configure() + * + */ +int +rte_event_dequeue_timeout_ticks(uint8_t dev_id, uint64_t ns, + uint64_t *timeout_ticks); + +/** + * Dequeue a burst of events objects or an event object from the event port + * designated by its *event_port_id*, on an event device designated + * by its *dev_id*. + * + * rte_event_dequeue_burst() does not dictate the specifics of scheduling + * algorithm as each eventdev driver may have different criteria to schedule + * an event. However, in general, from an application perspective scheduler may + * use the following scheme to dispatch an event to the port. + * + * 1) Selection of event queue based on + * a) The list of event queues are linked to the event port. + * b) If the device has RTE_EVENT_DEV_CAP_QUEUE_QOS capability then event + * queue selection from list is based on event queue priority relative to + * other event queue supplied as *priority* in rte_event_queue_setup() + * c) If the device has RTE_EVENT_DEV_CAP_EVENT_QOS capability then event + * queue selection from the list is based on event priority supplied as + * *priority* in rte_event_enqueue_burst() + * 2) Selection of event + * a) The number of flows available in selected event queue. + * b) Schedule type method associated with the event + * + * The *nb_events* parameter is the maximum number of event objects to dequeue + * which are returned in the *ev* array of *rte_event* structure. + * + * The rte_event_dequeue_burst() function returns the number of events objects + * it actually dequeued. A return value equal to *nb_events* means that all + * event objects have been dequeued. + * + * The number of events dequeued is the number of scheduler contexts held by + * this port. These contexts are automatically released in the next + * rte_event_dequeue_burst() invocation if the port supports implicit + * releases, or invoking rte_event_enqueue_burst() with RTE_EVENT_OP_RELEASE + * operation can be used to release the contexts early. + * + * Event operations RTE_EVENT_OP_FORWARD and RTE_EVENT_OP_RELEASE must only be + * enqueued to the same port that their associated events were dequeued from. + * + * @param dev_id + * The identifier of the device. + * @param port_id + * The identifier of the event port. + * @param[out] ev + * Points to an array of *nb_events* objects of type *rte_event* structure + * for output to be populated with the dequeued event objects. + * @param nb_events + * The maximum number of event objects to dequeue, typically number of + * rte_event_port_dequeue_depth() available for this port. + * + * @param timeout_ticks + * - 0 no-wait, returns immediately if there is no event. + * - >0 wait for the event, if the device is configured with + * RTE_EVENT_DEV_CFG_PER_DEQUEUE_TIMEOUT then this function will wait until + * at least one event is available or *timeout_ticks* time. 
+ * if the device is not configured with RTE_EVENT_DEV_CFG_PER_DEQUEUE_TIMEOUT + * then this function will wait until the event available or + * *dequeue_timeout_ns* ns which was previously supplied to + * rte_event_dev_configure() + * + * @return + * The number of event objects actually dequeued from the port. The return + * value can be less than the value of the *nb_events* parameter when the + * event port's queue is not full. + * + * @see rte_event_port_dequeue_depth() + */ +static inline uint16_t +rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], + uint16_t nb_events, uint64_t timeout_ticks) +{ + struct rte_eventdev *dev = &rte_eventdevs[dev_id]; + +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG + if (dev_id >= RTE_EVENT_MAX_DEVS || !rte_eventdevs[dev_id].attached) { + rte_errno = -EINVAL; + return 0; + } + + if (port_id >= dev->data->nb_ports) { + rte_errno = -EINVAL; + return 0; + } +#endif + + /* + * Allow zero cost non burst mode routine invocation if application + * requests nb_events as const one + */ + if (nb_events == 1) + return (*dev->dequeue)( + dev->data->ports[port_id], ev, timeout_ticks); + else + return (*dev->dequeue_burst)( + dev->data->ports[port_id], ev, nb_events, + timeout_ticks); +} + +/** + * Link multiple source event queues supplied in *queues* to the destination + * event port designated by its *port_id* with associated service priority + * supplied in *priorities* on the event device designated by its *dev_id*. + * + * The link establishment shall enable the event port *port_id* from + * receiving events from the specified event queue(s) supplied in *queues* + * + * An event queue may link to one or more event ports. + * The number of links can be established from an event queue to event port is + * implementation defined. + * + * Event queue(s) to event port link establishment can be changed at runtime + * without re-configuring the device to support scaling and to reduce the + * latency of critical work by establishing the link with more event ports + * at runtime. + * + * @param dev_id + * The identifier of the device. + * + * @param port_id + * Event port identifier to select the destination port to link. + * + * @param queues + * Points to an array of *nb_links* event queues to be linked + * to the event port. + * NULL value is allowed, in which case this function links all the configured + * event queues *nb_event_queues* which previously supplied to + * rte_event_dev_configure() to the event port *port_id* + * + * @param priorities + * Points to an array of *nb_links* service priorities associated with each + * event queue link to event port. + * The priority defines the event port's servicing priority for + * event queue, which may be ignored by an implementation. + * The requested priority should in the range of + * [RTE_EVENT_DEV_PRIORITY_HIGHEST, RTE_EVENT_DEV_PRIORITY_LOWEST]. + * The implementation shall normalize the requested priority to + * implementation supported priority value. + * NULL value is allowed, in which case this function links the event queues + * with RTE_EVENT_DEV_PRIORITY_NORMAL servicing priority + * + * @param nb_links + * The number of links to establish. This parameter is ignored if queues is + * NULL. + * + * @return + * The number of links actually established. 
The return value can be less than + * the value of the *nb_links* parameter when the implementation has the + * limitation on specific queue to port link establishment or if invalid + * parameters are specified in *queues* + * If the return value is less than *nb_links*, the remaining links at the end + * of link[] are not established, and the caller has to take care of them. + * If return value is less than *nb_links* then implementation shall update the + * rte_errno accordingly, Possible rte_errno values are + * (-EDQUOT) Quota exceeded(Application tried to link the queue configured with + * RTE_EVENT_QUEUE_CFG_SINGLE_LINK to more than one event ports) + * (-EINVAL) Invalid parameter + * + */ +int +rte_event_port_link(uint8_t dev_id, uint8_t port_id, + const uint8_t queues[], const uint8_t priorities[], + uint16_t nb_links); + +/** + * Unlink multiple source event queues supplied in *queues* from the destination + * event port designated by its *port_id* on the event device designated + * by its *dev_id*. + * + * The unlink establishment shall disable the event port *port_id* from + * receiving events from the specified event queue *queue_id* + * + * Event queue(s) to event port unlink establishment can be changed at runtime + * without re-configuring the device. + * + * @param dev_id + * The identifier of the device. + * + * @param port_id + * Event port identifier to select the destination port to unlink. + * + * @param queues + * Points to an array of *nb_unlinks* event queues to be unlinked + * from the event port. + * NULL value is allowed, in which case this function unlinks all the + * event queue(s) from the event port *port_id*. + * + * @param nb_unlinks + * The number of unlinks to establish. This parameter is ignored if queues is + * NULL. + * + * @return + * The number of unlinks actually established. The return value can be less + * than the value of the *nb_unlinks* parameter when the implementation has the + * limitation on specific queue to port unlink establishment or + * if invalid parameters are specified. + * If the return value is less than *nb_unlinks*, the remaining queues at the + * end of queues[] are not established, and the caller has to take care of them. + * If return value is less than *nb_unlinks* then implementation shall update + * the rte_errno accordingly, Possible rte_errno values are + * (-EINVAL) Invalid parameter + * + */ +int +rte_event_port_unlink(uint8_t dev_id, uint8_t port_id, + uint8_t queues[], uint16_t nb_unlinks); + +/** + * Retrieve the list of source event queues and its associated service priority + * linked to the destination event port designated by its *port_id* + * on the event device designated by its *dev_id*. + * + * @param dev_id + * The identifier of the device. + * + * @param port_id + * Event port identifier. + * + * @param[out] queues + * Points to an array of *queues* for output. + * The caller has to allocate *RTE_EVENT_MAX_QUEUES_PER_DEV* bytes to + * store the event queue(s) linked with event port *port_id* + * + * @param[out] priorities + * Points to an array of *priorities* for output. + * The caller has to allocate *RTE_EVENT_MAX_QUEUES_PER_DEV* bytes to + * store the service priority associated with each event queue linked + * + * @return + * The number of links established on the event port designated by its + * *port_id*. + * - <0 on failure. + * + */ +int +rte_event_port_links_get(uint8_t dev_id, uint8_t port_id, + uint8_t queues[], uint8_t priorities[]); + +/** + * Retrieve the service ID of the event dev. 
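+ * For a centralized (service based) eventdev, the returned id can be mapped
+ * to a service lcore, e.g. as sketched below (service_lcore is a placeholder
+ * and is assumed to have already been added and started as a service core):
+ * \code{.c}
+ *     uint32_t service_id;
+ *
+ *     if (rte_event_dev_service_id_get(dev_id, &service_id) == 0) {
+ *             (void)rte_service_map_lcore_set(service_id, service_lcore, 1);
+ *             (void)rte_service_runstate_set(service_id, 1);
+ *     }
+ * \endcode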
If the adapter doesn't use + * a rte_service function, this function returns -ESRCH. + * + * @param dev_id + * The identifier of the device. + * + * @param [out] service_id + * A pointer to a uint32_t, to be filled in with the service id. + * + * @return + * - 0: Success + * - <0: Error code on failure, if the event dev doesn't use a rte_service + * function, this function returns -ESRCH. + */ +int +rte_event_dev_service_id_get(uint8_t dev_id, uint32_t *service_id); + +/** + * Dump internal information about *dev_id* to the FILE* provided in *f*. + * + * @param dev_id + * The identifier of the device. + * + * @param f + * A pointer to a file for output + * + * @return + * - 0: on success + * - <0: on failure. + */ +int +rte_event_dev_dump(uint8_t dev_id, FILE *f); + +/** Maximum name length for extended statistics counters */ +#define RTE_EVENT_DEV_XSTATS_NAME_SIZE 64 + +/** + * Selects the component of the eventdev to retrieve statistics from. + */ +enum rte_event_dev_xstats_mode { + RTE_EVENT_DEV_XSTATS_DEVICE, + RTE_EVENT_DEV_XSTATS_PORT, + RTE_EVENT_DEV_XSTATS_QUEUE, +}; + +/** + * A name-key lookup element for extended statistics. + * + * This structure is used to map between names and ID numbers + * for extended ethdev statistics. + */ +struct rte_event_dev_xstats_name { + char name[RTE_EVENT_DEV_XSTATS_NAME_SIZE]; +}; + +/** + * Retrieve names of extended statistics of an event device. + * + * @param dev_id + * The identifier of the event device. + * @param mode + * The mode of statistics to retrieve. Choices include the device statistics, + * port statistics or queue statistics. + * @param queue_port_id + * Used to specify the port or queue number in queue or port mode, and is + * ignored in device mode. + * @param[out] xstats_names + * Block of memory to insert names into. Must be at least size in capacity. + * If set to NULL, function returns required capacity. + * @param[out] ids + * Block of memory to insert ids into. Must be at least size in capacity. + * If set to NULL, function returns required capacity. The id values returned + * can be passed to *rte_event_dev_xstats_get* to select statistics. + * @param size + * Capacity of xstats_names (number of names). + * @return + * - positive value lower or equal to size: success. The return value + * is the number of entries filled in the stats table. + * - positive value higher than size: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. + * - negative value on error: + * -ENODEV for invalid *dev_id* + * -EINVAL for invalid mode, queue port or id parameters + * -ENOTSUP if the device doesn't support this function. + */ +int +rte_event_dev_xstats_names_get(uint8_t dev_id, + enum rte_event_dev_xstats_mode mode, + uint8_t queue_port_id, + struct rte_event_dev_xstats_name *xstats_names, + unsigned int *ids, + unsigned int size); + +/** + * Retrieve extended statistics of an event device. + * + * @param dev_id + * The identifier of the device. + * @param mode + * The mode of statistics to retrieve. Choices include the device statistics, + * port statistics or queue statistics. + * @param queue_port_id + * Used to specify the port or queue number in queue or port mode, and is + * ignored in device mode. + * @param ids + * The id numbers of the stats to get. 
The ids can be got from the stat + * position in the stat list from rte_event_dev_get_xstats_names(), or + * by using rte_eventdev_get_xstats_by_name() + * @param[out] values + * The values for each stats request by ID. + * @param n + * The number of stats requested + * @return + * - positive value: number of stat entries filled into the values array + * - negative value on error: + * -ENODEV for invalid *dev_id* + * -EINVAL for invalid mode, queue port or id parameters + * -ENOTSUP if the device doesn't support this function. + */ +int +rte_event_dev_xstats_get(uint8_t dev_id, + enum rte_event_dev_xstats_mode mode, + uint8_t queue_port_id, + const unsigned int ids[], + uint64_t values[], unsigned int n); + +/** + * Retrieve the value of a single stat by requesting it by name. + * + * @param dev_id + * The identifier of the device + * @param name + * The stat name to retrieve + * @param[out] id + * If non-NULL, the numerical id of the stat will be returned, so that further + * requests for the stat can be got using rte_eventdev_xstats_get, which will + * be faster as it doesn't need to scan a list of names for the stat. + * If the stat cannot be found, the id returned will be (unsigned)-1. + * @return + * - positive value or zero: the stat value + * - negative value: -EINVAL if stat not found, -ENOTSUP if not supported. + */ +uint64_t +rte_event_dev_xstats_by_name_get(uint8_t dev_id, const char *name, + unsigned int *id); + +/** + * Reset the values of the xstats of the selected component in the device. + * + * @param dev_id + * The identifier of the device + * @param mode + * The mode of the statistics to reset. Choose from device, queue or port. + * @param queue_port_id + * The queue or port to reset. 0 and positive values select ports and queues, + * while -1 indicates all ports or queues. + * @param ids + * Selects specific statistics to be reset. When NULL, all statistics selected + * by *mode* will be reset. If non-NULL, must point to array of at least + * *nb_ids* size. + * @param nb_ids + * The number of ids available from the *ids* array. Ignored when ids is NULL. + * @return + * - zero: successfully reset the statistics to zero + * - negative value: -EINVAL invalid parameters, -ENOTSUP if not supported. + */ +int +rte_event_dev_xstats_reset(uint8_t dev_id, + enum rte_event_dev_xstats_mode mode, + int16_t queue_port_id, + const uint32_t ids[], + uint32_t nb_ids); + +/** + * Trigger the eventdev self test. + * + * @param dev_id + * The identifier of the device + * @return + * - 0: Selftest successful + * - -ENOTSUP if the device doesn't support selftest + * - other values < 0 on failure. + */ +int rte_event_dev_selftest(uint8_t dev_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EVENTDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd.h b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd.h new file mode 100644 index 00000000..3fbb4d2b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd.h @@ -0,0 +1,901 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Cavium, Inc + */ + +#ifndef _RTE_EVENTDEV_PMD_H_ +#define _RTE_EVENTDEV_PMD_H_ + +/** @file + * RTE Event PMD APIs + * + * @note + * These API are from event PMD only and user applications should not call + * them directly. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include +#include +#include + +#include "rte_eventdev.h" +#include "rte_event_timer_adapter_pmd.h" + +/* Logging Macros */ +#define RTE_EDEV_LOG_ERR(...) 
\ + RTE_LOG(ERR, EVENTDEV, \ + RTE_FMT("%s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + __func__, __LINE__, RTE_FMT_TAIL(__VA_ARGS__,))) + +#ifdef RTE_LIBRTE_EVENTDEV_DEBUG +#define RTE_EDEV_LOG_DEBUG(...) \ + RTE_LOG(DEBUG, EVENTDEV, \ + RTE_FMT("%s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + __func__, __LINE__, RTE_FMT_TAIL(__VA_ARGS__,))) +#else +#define RTE_EDEV_LOG_DEBUG(...) (void)0 +#endif + +/* Macros to check for valid device */ +#define RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \ + if (!rte_event_pmd_is_valid_dev((dev_id))) { \ + RTE_EDEV_LOG_ERR("Invalid dev_id=%d\n", dev_id); \ + return retval; \ + } \ +} while (0) + +#define RTE_EVENTDEV_VALID_DEVID_OR_ERRNO_RET(dev_id, errno, retval) do { \ + if (!rte_event_pmd_is_valid_dev((dev_id))) { \ + RTE_EDEV_LOG_ERR("Invalid dev_id=%d\n", dev_id); \ + rte_errno = errno; \ + return retval; \ + } \ +} while (0) + +#define RTE_EVENTDEV_VALID_DEVID_OR_RET(dev_id) do { \ + if (!rte_event_pmd_is_valid_dev((dev_id))) { \ + RTE_EDEV_LOG_ERR("Invalid dev_id=%d\n", dev_id); \ + return; \ + } \ +} while (0) + +#define RTE_EVENT_ETH_RX_ADAPTER_SW_CAP \ + ((RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID) | \ + (RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ)) + +#define RTE_EVENT_CRYPTO_ADAPTER_SW_CAP \ + RTE_EVENT_CRYPTO_ADAPTER_CAP_SESSION_PRIVATE_DATA + +/**< Ethernet Rx adapter cap to return If the packet transfers from + * the ethdev to eventdev use a SW service function + */ + +#define RTE_EVENTDEV_DETACHED (0) +#define RTE_EVENTDEV_ATTACHED (1) + +struct rte_eth_dev; + +/** Global structure used for maintaining state of allocated event devices */ +struct rte_eventdev_global { + uint8_t nb_devs; /**< Number of devices found */ +}; + +extern struct rte_eventdev_global *rte_eventdev_globals; +/** Pointer to global event devices data structure. */ +extern struct rte_eventdev *rte_eventdevs; +/** The pool of rte_eventdev structures. */ + +/** + * Get the rte_eventdev structure device pointer for the named device. + * + * @param name + * device name to select the device structure. + * + * @return + * - The rte_eventdev structure pointer for the given device ID. + */ +static inline struct rte_eventdev * +rte_event_pmd_get_named_dev(const char *name) +{ + struct rte_eventdev *dev; + unsigned int i; + + if (name == NULL) + return NULL; + + for (i = 0; i < RTE_EVENT_MAX_DEVS; i++) { + dev = &rte_eventdevs[i]; + if ((dev->attached == RTE_EVENTDEV_ATTACHED) && + (strcmp(dev->data->name, name) == 0)) + return dev; + } + + return NULL; +} + +/** + * Validate if the event device index is valid attached event device. + * + * @param dev_id + * Event device index. + * + * @return + * - If the device index is valid (1) or not (0). + */ +static inline unsigned +rte_event_pmd_is_valid_dev(uint8_t dev_id) +{ + struct rte_eventdev *dev; + + if (dev_id >= RTE_EVENT_MAX_DEVS) + return 0; + + dev = &rte_eventdevs[dev_id]; + if (dev->attached != RTE_EVENTDEV_ATTACHED) + return 0; + else + return 1; +} + +/** + * Definitions of all functions exported by a driver through the + * the generic structure of type *event_dev_ops* supplied in the + * *rte_eventdev* structure associated with a device. + */ + +/** + * Get device information of a device. + * + * @param dev + * Event device pointer + * @param dev_info + * Event device information structure + * + * @return + * Returns 0 on success + */ +typedef void (*eventdev_info_get_t)(struct rte_eventdev *dev, + struct rte_event_dev_info *dev_info); + +/** + * Configure a device. 
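A sketch of how the validity macros above are typically used at the top of a library entry point; the wrapper function shown here is hypothetical, only the checking pattern matters:

int
rte_event_dev_example_dump(uint8_t dev_id, FILE *f)   /* hypothetical wrapper */
{
        struct rte_eventdev *dev;

        /* return -EINVAL for an out-of-range or detached device ID */
        RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
        dev = &rte_eventdevs[dev_id];

        RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dump, -ENOTSUP);
        (*dev->dev_ops->dump)(dev, f);
        return 0;
}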
+ * + * @param dev + * Event device pointer + * + * @return + * Returns 0 on success + */ +typedef int (*eventdev_configure_t)(const struct rte_eventdev *dev); + +/** + * Start a configured device. + * + * @param dev + * Event device pointer + * + * @return + * Returns 0 on success + */ +typedef int (*eventdev_start_t)(struct rte_eventdev *dev); + +/** + * Stop a configured device. + * + * @param dev + * Event device pointer + */ +typedef void (*eventdev_stop_t)(struct rte_eventdev *dev); + +/** + * Close a configured device. + * + * @param dev + * Event device pointer + * + * @return + * - 0 on success + * - (-EAGAIN) if can't close as device is busy + */ +typedef int (*eventdev_close_t)(struct rte_eventdev *dev); + +/** + * Retrieve the default event queue configuration. + * + * @param dev + * Event device pointer + * @param queue_id + * Event queue index + * @param[out] queue_conf + * Event queue configuration structure + * + */ +typedef void (*eventdev_queue_default_conf_get_t)(struct rte_eventdev *dev, + uint8_t queue_id, struct rte_event_queue_conf *queue_conf); + +/** + * Setup an event queue. + * + * @param dev + * Event device pointer + * @param queue_id + * Event queue index + * @param queue_conf + * Event queue configuration structure + * + * @return + * Returns 0 on success. + */ +typedef int (*eventdev_queue_setup_t)(struct rte_eventdev *dev, + uint8_t queue_id, + const struct rte_event_queue_conf *queue_conf); + +/** + * Release resources allocated by given event queue. + * + * @param dev + * Event device pointer + * @param queue_id + * Event queue index + * + */ +typedef void (*eventdev_queue_release_t)(struct rte_eventdev *dev, + uint8_t queue_id); + +/** + * Retrieve the default event port configuration. + * + * @param dev + * Event device pointer + * @param port_id + * Event port index + * @param[out] port_conf + * Event port configuration structure + * + */ +typedef void (*eventdev_port_default_conf_get_t)(struct rte_eventdev *dev, + uint8_t port_id, struct rte_event_port_conf *port_conf); + +/** + * Setup an event port. + * + * @param dev + * Event device pointer + * @param port_id + * Event port index + * @param port_conf + * Event port configuration structure + * + * @return + * Returns 0 on success. + */ +typedef int (*eventdev_port_setup_t)(struct rte_eventdev *dev, + uint8_t port_id, + const struct rte_event_port_conf *port_conf); + +/** + * Release memory resources allocated by given event port. + * + * @param port + * Event port pointer + * + */ +typedef void (*eventdev_port_release_t)(void *port); + +/** + * Link multiple source event queues to destination event port. + * + * @param dev + * Event device pointer + * @param port + * Event port pointer + * @param link + * Points to an array of *nb_links* event queues to be linked + * to the event port. + * @param priorities + * Points to an array of *nb_links* service priorities associated with each + * event queue link to event port. + * @param nb_links + * The number of links to establish + * + * @return + * Returns 0 on success. + * + */ +typedef int (*eventdev_port_link_t)(struct rte_eventdev *dev, void *port, + const uint8_t queues[], const uint8_t priorities[], + uint16_t nb_links); + +/** + * Unlink multiple source event queues from destination event port. + * + * @param dev + * Event device pointer + * @param port + * Event port pointer + * @param queues + * An array of *nb_unlinks* event queues to be unlinked from the event port. 
+ * @param nb_unlinks + * The number of unlinks to establish + * + * @return + * Returns 0 on success. + * + */ +typedef int (*eventdev_port_unlink_t)(struct rte_eventdev *dev, void *port, + uint8_t queues[], uint16_t nb_unlinks); + +/** + * Converts nanoseconds to *timeout_ticks* value for rte_event_dequeue() + * + * @param dev + * Event device pointer + * @param ns + * Wait time in nanosecond + * @param[out] timeout_ticks + * Value for the *timeout_ticks* parameter in rte_event_dequeue() function + * + * @return + * Returns 0 on success. + * + */ +typedef int (*eventdev_dequeue_timeout_ticks_t)(struct rte_eventdev *dev, + uint64_t ns, uint64_t *timeout_ticks); + +/** + * Dump internal information + * + * @param dev + * Event device pointer + * @param f + * A pointer to a file for output + * + */ +typedef void (*eventdev_dump_t)(struct rte_eventdev *dev, FILE *f); + +/** + * Retrieve a set of statistics from device + * + * @param dev + * Event device pointer + * @param ids + * The stat ids to retrieve + * @param values + * The returned stat values + * @param n + * The number of id values and entries in the values array + * @return + * The number of stat values successfully filled into the values array + */ +typedef int (*eventdev_xstats_get_t)(const struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id, + const unsigned int ids[], uint64_t values[], unsigned int n); + +/** + * Resets the statistic values in xstats for the device, based on mode. + */ +typedef int (*eventdev_xstats_reset_t)(struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, + int16_t queue_port_id, + const uint32_t ids[], + uint32_t nb_ids); + +/** + * Get names of extended stats of an event device + * + * @param dev + * Event device pointer + * @param xstats_names + * Array of name values to be filled in + * @param size + * Number of values in the xstats_names array + * @return + * When size >= the number of stats, return the number of stat values filled + * into the array. + * When size < the number of available stats, return the number of stats + * values, and do not fill in any data into xstats_names. + */ +typedef int (*eventdev_xstats_get_names_t)(const struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id, + struct rte_event_dev_xstats_name *xstats_names, + unsigned int *ids, unsigned int size); + +/** + * Get value of one stats and optionally return its id + * + * @param dev + * Event device pointer + * @param name + * The name of the stat to retrieve + * @param id + * Pointer to an unsigned int where we store the stat-id for future reference. + * This pointer may be null if the id is not required. + * @return + * The value of the stat, or (uint64_t)-1 if the stat is not found. + * If the stat is not found, the id value will be returned as (unsigned)-1, + * if id pointer is non-NULL + */ +typedef uint64_t (*eventdev_xstats_get_by_name)(const struct rte_eventdev *dev, + const char *name, unsigned int *id); + + +/** + * Retrieve the event device's ethdev Rx adapter capabilities for the + * specified ethernet port + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @param[out] caps + * A pointer to memory filled with Rx event adapter capabilities. + * + * @return + * - 0: Success, driver provides Rx event adapter capabilities for the + * ethernet device. + * - <0: Error code returned by the driver function. 
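A sketch of how a purely SW-assisted driver might implement the Rx adapter capability callback documented above; the function name is illustrative:

static int
sketch_eth_rx_adapter_caps_get(const struct rte_eventdev *dev,
                               const struct rte_eth_dev *eth_dev,
                               uint32_t *caps)
{
        RTE_SET_USED(dev);
        RTE_SET_USED(eth_dev);
        /* no internal HW port: packet transfer relies on the common SW service */
        *caps = RTE_EVENT_ETH_RX_ADAPTER_SW_CAP;
        return 0;
}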
+ * + */ +typedef int (*eventdev_eth_rx_adapter_caps_get_t) + (const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev, + uint32_t *caps); + +struct rte_event_eth_rx_adapter_queue_conf *queue_conf; + +/** + * Retrieve the event device's timer adapter capabilities, as well as the ops + * structure that an event timer adapter should call through to enter the + * driver + * + * @param dev + * Event device pointer + * + * @param flags + * Flags that can be used to determine how to select an event timer + * adapter ops structure + * + * @param[out] caps + * A pointer to memory filled with Rx event adapter capabilities. + * + * @param[out] ops + * A pointer to the ops pointer to set with the address of the desired ops + * structure + * + * @return + * - 0: Success, driver provides Rx event adapter capabilities for the + * ethernet device. + * - <0: Error code returned by the driver function. + * + */ +typedef int (*eventdev_timer_adapter_caps_get_t)( + const struct rte_eventdev *dev, + uint64_t flags, + uint32_t *caps, + const struct rte_event_timer_adapter_ops **ops); + +/** + * Add ethernet Rx queues to event device. This callback is invoked if + * the caps returned from rte_eventdev_eth_rx_adapter_caps_get(, eth_port_id) + * has RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT set. + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @param rx_queue_id + * Ethernet device receive queue index + * + * @param queue_conf + * Additional configuration structure + + * @return + * - 0: Success, ethernet receive queue added successfully. + * - <0: Error code returned by the driver function. + * + */ +typedef int (*eventdev_eth_rx_adapter_queue_add_t)( + const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev, + int32_t rx_queue_id, + const struct rte_event_eth_rx_adapter_queue_conf *queue_conf); + +/** + * Delete ethernet Rx queues from event device. This callback is invoked if + * the caps returned from eventdev_eth_rx_adapter_caps_get(, eth_port_id) + * has RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT set. + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @param rx_queue_id + * Ethernet device receive queue index + * + * @return + * - 0: Success, ethernet receive queue deleted successfully. + * - <0: Error code returned by the driver function. + * + */ +typedef int (*eventdev_eth_rx_adapter_queue_del_t) + (const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev, + int32_t rx_queue_id); + +/** + * Start ethernet Rx adapter. This callback is invoked if + * the caps returned from eventdev_eth_rx_adapter_caps_get(.., eth_port_id) + * has RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT set and Rx queues + * from eth_port_id have been added to the event device. + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @return + * - 0: Success, ethernet Rx adapter started successfully. + * - <0: Error code returned by the driver function. + */ +typedef int (*eventdev_eth_rx_adapter_start_t) + (const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev); + +/** + * Stop ethernet Rx adapter. This callback is invoked if + * the caps returned from eventdev_eth_rx_adapter_caps_get(..,eth_port_id) + * has RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT set and Rx queues + * from eth_port_id have been added to the event device. 
+ * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @return + * - 0: Success, ethernet Rx adapter stopped successfully. + * - <0: Error code returned by the driver function. + */ +typedef int (*eventdev_eth_rx_adapter_stop_t) + (const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev); + +struct rte_event_eth_rx_adapter_stats *stats; + +/** + * Retrieve ethernet Rx adapter statistics. + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @param[out] stats + * Pointer to stats structure + * + * @return + * Return 0 on success. + */ + +typedef int (*eventdev_eth_rx_adapter_stats_get) + (const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev, + struct rte_event_eth_rx_adapter_stats *stats); +/** + * Reset ethernet Rx adapter statistics. + * + * @param dev + * Event device pointer + * + * @param eth_dev + * Ethernet device pointer + * + * @return + * Return 0 on success. + */ +typedef int (*eventdev_eth_rx_adapter_stats_reset) + (const struct rte_eventdev *dev, + const struct rte_eth_dev *eth_dev); +/** + * Start eventdev selftest. + * + * @return + * Return 0 on success. + */ +typedef int (*eventdev_selftest)(void); + + +struct rte_cryptodev; + +/** + * This API may change without prior notice + * + * Retrieve the event device's crypto adapter capabilities for the + * specified cryptodev + * + * @param dev + * Event device pointer + * + * @param cdev + * cryptodev pointer + * + * @param[out] caps + * A pointer to memory filled with event adapter capabilities. + * It is expected to be pre-allocated & initialized by caller. + * + * @return + * - 0: Success, driver provides event adapter capabilities for the + * cryptodev. + * - <0: Error code returned by the driver function. + * + */ +typedef int (*eventdev_crypto_adapter_caps_get_t) + (const struct rte_eventdev *dev, + const struct rte_cryptodev *cdev, + uint32_t *caps); + +/** + * This API may change without prior notice + * + * Add crypto queue pair to event device. This callback is invoked if + * the caps returned from rte_event_crypto_adapter_caps_get(, cdev_id) + * has RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_* set. + * + * @param dev + * Event device pointer + * + * @param cdev + * cryptodev pointer + * + * @param queue_pair_id + * cryptodev queue pair identifier. + * + * @param event + * Event information required for binding cryptodev queue pair to event queue. + * This structure will have a valid value for only those HW PMDs supporting + * @see RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_QP_EV_BIND capability. + * + * @return + * - 0: Success, cryptodev queue pair added successfully. + * - <0: Error code returned by the driver function. + * + */ +typedef int (*eventdev_crypto_adapter_queue_pair_add_t) + (const struct rte_eventdev *dev, + const struct rte_cryptodev *cdev, + int32_t queue_pair_id, + const struct rte_event *event); + + +/** + * This API may change without prior notice + * + * Delete crypto queue pair to event device. This callback is invoked if + * the caps returned from rte_event_crypto_adapter_caps_get(, cdev_id) + * has RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_* set. + * + * @param dev + * Event device pointer + * + * @param cdev + * cryptodev pointer + * + * @param queue_pair_id + * cryptodev queue pair identifier. + * + * @return + * - 0: Success, cryptodev queue pair deleted successfully. + * - <0: Error code returned by the driver function. 
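Similarly, a possible implementation of the crypto adapter capability callback for a driver without an internal port; names are illustrative:

static int
sketch_crypto_adapter_caps_get(const struct rte_eventdev *dev,
                               const struct rte_cryptodev *cdev,
                               uint32_t *caps)
{
        RTE_SET_USED(dev);
        RTE_SET_USED(cdev);
        /* SW-managed transfer; only session private data is advertised */
        *caps = RTE_EVENT_CRYPTO_ADAPTER_SW_CAP;
        return 0;
}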
+ * + */ +typedef int (*eventdev_crypto_adapter_queue_pair_del_t) + (const struct rte_eventdev *dev, + const struct rte_cryptodev *cdev, + int32_t queue_pair_id); + +/** + * Start crypto adapter. This callback is invoked if + * the caps returned from rte_event_crypto_adapter_caps_get(.., cdev_id) + * has RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_* set and queue pairs + * from cdev_id have been added to the event device. + * + * @param dev + * Event device pointer + * + * @param cdev + * Crypto device pointer + * + * @return + * - 0: Success, crypto adapter started successfully. + * - <0: Error code returned by the driver function. + */ +typedef int (*eventdev_crypto_adapter_start_t) + (const struct rte_eventdev *dev, + const struct rte_cryptodev *cdev); + +/** + * Stop crypto adapter. This callback is invoked if + * the caps returned from rte_event_crypto_adapter_caps_get(.., cdev_id) + * has RTE_EVENT_CRYPTO_ADAPTER_CAP_INTERNAL_PORT_* set and queue pairs + * from cdev_id have been added to the event device. + * + * @param dev + * Event device pointer + * + * @param cdev + * Crypto device pointer + * + * @return + * - 0: Success, crypto adapter stopped successfully. + * - <0: Error code returned by the driver function. + */ +typedef int (*eventdev_crypto_adapter_stop_t) + (const struct rte_eventdev *dev, + const struct rte_cryptodev *cdev); + +struct rte_event_crypto_adapter_stats; + +/** + * Retrieve crypto adapter statistics. + * + * @param dev + * Event device pointer + * + * @param cdev + * Crypto device pointer + * + * @param[out] stats + * Pointer to stats structure + * + * @return + * Return 0 on success. + */ + +typedef int (*eventdev_crypto_adapter_stats_get) + (const struct rte_eventdev *dev, + const struct rte_cryptodev *cdev, + struct rte_event_crypto_adapter_stats *stats); + +/** + * Reset crypto adapter statistics. + * + * @param dev + * Event device pointer + * + * @param cdev + * Crypto device pointer + * + * @return + * Return 0 on success. + */ + +typedef int (*eventdev_crypto_adapter_stats_reset) + (const struct rte_eventdev *dev, + const struct rte_cryptodev *cdev); + +/** Event device operations function pointer table */ +struct rte_eventdev_ops { + eventdev_info_get_t dev_infos_get; /**< Get device info. */ + eventdev_configure_t dev_configure; /**< Configure device. */ + eventdev_start_t dev_start; /**< Start device. */ + eventdev_stop_t dev_stop; /**< Stop device. */ + eventdev_close_t dev_close; /**< Close device. */ + + eventdev_queue_default_conf_get_t queue_def_conf; + /**< Get default queue configuration. */ + eventdev_queue_setup_t queue_setup; + /**< Set up an event queue. */ + eventdev_queue_release_t queue_release; + /**< Release an event queue. */ + + eventdev_port_default_conf_get_t port_def_conf; + /**< Get default port configuration. */ + eventdev_port_setup_t port_setup; + /**< Set up an event port. */ + eventdev_port_release_t port_release; + /**< Release an event port. */ + + eventdev_port_link_t port_link; + /**< Link event queues to an event port. */ + eventdev_port_unlink_t port_unlink; + /**< Unlink event queues from an event port. */ + eventdev_dequeue_timeout_ticks_t timeout_ticks; + /**< Converts ns to *timeout_ticks* value for rte_event_dequeue() */ + eventdev_dump_t dump; + /* Dump internal information */ + + eventdev_xstats_get_t xstats_get; + /**< Get extended device statistics. */ + eventdev_xstats_get_names_t xstats_get_names; + /**< Get names of extended stats. 
*/ + eventdev_xstats_get_by_name xstats_get_by_name; + /**< Get one value by name. */ + eventdev_xstats_reset_t xstats_reset; + /**< Reset the statistics values in xstats. */ + + eventdev_eth_rx_adapter_caps_get_t eth_rx_adapter_caps_get; + /**< Get ethernet Rx adapter capabilities */ + eventdev_eth_rx_adapter_queue_add_t eth_rx_adapter_queue_add; + /**< Add Rx queues to ethernet Rx adapter */ + eventdev_eth_rx_adapter_queue_del_t eth_rx_adapter_queue_del; + /**< Delete Rx queues from ethernet Rx adapter */ + eventdev_eth_rx_adapter_start_t eth_rx_adapter_start; + /**< Start ethernet Rx adapter */ + eventdev_eth_rx_adapter_stop_t eth_rx_adapter_stop; + /**< Stop ethernet Rx adapter */ + eventdev_eth_rx_adapter_stats_get eth_rx_adapter_stats_get; + /**< Get ethernet Rx stats */ + eventdev_eth_rx_adapter_stats_reset eth_rx_adapter_stats_reset; + /**< Reset ethernet Rx stats */ + + eventdev_timer_adapter_caps_get_t timer_adapter_caps_get; + /**< Get timer adapter capabilities */ + + eventdev_crypto_adapter_caps_get_t crypto_adapter_caps_get; + /**< Get crypto adapter capabilities */ + eventdev_crypto_adapter_queue_pair_add_t crypto_adapter_queue_pair_add; + /**< Add queue pair to crypto adapter */ + eventdev_crypto_adapter_queue_pair_del_t crypto_adapter_queue_pair_del; + /**< Delete queue pair from crypto adapter */ + eventdev_crypto_adapter_start_t crypto_adapter_start; + /**< Start crypto adapter */ + eventdev_crypto_adapter_stop_t crypto_adapter_stop; + /**< Stop crypto adapter */ + eventdev_crypto_adapter_stats_get crypto_adapter_stats_get; + /**< Get crypto stats */ + eventdev_crypto_adapter_stats_reset crypto_adapter_stats_reset; + /**< Reset crypto stats */ + + eventdev_selftest dev_selftest; + /**< Start eventdev Selftest */ + + eventdev_stop_flush_t dev_stop_flush; + /**< User-provided event flush function */ +}; + +/** + * Allocates a new eventdev slot for an event device and returns the pointer + * to that slot for the driver to use. + * + * @param name + * Unique identifier name for each device + * @param socket_id + * Socket to allocate resources on. + * @return + * - Slot in the rte_dev_devices array for a new device; + */ +struct rte_eventdev * +rte_event_pmd_allocate(const char *name, int socket_id); + +/** + * Release the specified eventdev device. + * + * @param eventdev + * The *eventdev* pointer is the address of the *rte_eventdev* structure. + * @return + * - 0 on success, negative on error + */ +int +rte_event_pmd_release(struct rte_eventdev *eventdev); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EVENTDEV_PMD_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd_pci.h b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd_pci.h new file mode 100644 index 00000000..8fb61386 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd_pci.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2017 Cavium, Inc + */ + +#ifndef _RTE_EVENTDEV_PMD_PCI_H_ +#define _RTE_EVENTDEV_PMD_PCI_H_ + +/** @file + * RTE Eventdev PCI PMD APIs + * + * @note + * These API are from event PCI PMD only and user applications should not call + * them directly. + */ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include +#include +#include + +#include "rte_eventdev_pmd.h" + +typedef int (*eventdev_pmd_pci_callback_t)(struct rte_eventdev *dev); + +/** + * @internal + * Wrapper for use by pci drivers as a .probe function to attach to a event + * interface. 
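A sketch of how a PCI event PMD might use these wrappers; sketch_priv, sketch_eventdev_init and sketch_eventdev_uninit are hypothetical driver symbols:

static int
sketch_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
{
        /* allocates the eventdev and its private area, then runs the init hook */
        return rte_event_pmd_pci_probe(pci_drv, pci_dev,
                                       sizeof(struct sketch_priv),
                                       sketch_eventdev_init);
}

static int
sketch_pci_remove(struct rte_pci_device *pci_dev)
{
        return rte_event_pmd_pci_remove(pci_dev, sketch_eventdev_uninit);
}

These two functions would then be installed as the .probe and .remove members of the PMD's rte_pci_driver.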
+ */ +static inline int +rte_event_pmd_pci_probe(struct rte_pci_driver *pci_drv, + struct rte_pci_device *pci_dev, + size_t private_data_size, + eventdev_pmd_pci_callback_t devinit) +{ + struct rte_eventdev *eventdev; + + char eventdev_name[RTE_EVENTDEV_NAME_MAX_LEN]; + + int retval; + + if (devinit == NULL) + return -EINVAL; + + rte_pci_device_name(&pci_dev->addr, eventdev_name, + sizeof(eventdev_name)); + + eventdev = rte_event_pmd_allocate(eventdev_name, + pci_dev->device.numa_node); + if (eventdev == NULL) + return -ENOMEM; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + eventdev->data->dev_private = + rte_zmalloc_socket( + "eventdev private structure", + private_data_size, + RTE_CACHE_LINE_SIZE, + rte_socket_id()); + + if (eventdev->data->dev_private == NULL) + rte_panic("Cannot allocate memzone for private " + "device data"); + } + + eventdev->dev = &pci_dev->device; + + /* Invoke PMD device initialization function */ + retval = devinit(eventdev); + if (retval == 0) + return 0; + + RTE_EDEV_LOG_ERR("driver %s: (vendor_id=0x%x device_id=0x%x)" + " failed", pci_drv->driver.name, + (unsigned int) pci_dev->id.vendor_id, + (unsigned int) pci_dev->id.device_id); + + rte_event_pmd_release(eventdev); + + return -ENXIO; +} + + +/** + * @internal + * Wrapper for use by pci drivers as a .remove function to detach a event + * interface. + */ +static inline int +rte_event_pmd_pci_remove(struct rte_pci_device *pci_dev, + eventdev_pmd_pci_callback_t devuninit) +{ + struct rte_eventdev *eventdev; + char eventdev_name[RTE_EVENTDEV_NAME_MAX_LEN]; + int ret = 0; + + if (pci_dev == NULL) + return -EINVAL; + + rte_pci_device_name(&pci_dev->addr, eventdev_name, + sizeof(eventdev_name)); + + eventdev = rte_event_pmd_get_named_dev(eventdev_name); + if (eventdev == NULL) + return -ENODEV; + + ret = rte_event_dev_close(eventdev->data->dev_id); + if (ret < 0) + return ret; + + /* Invoke PMD device un-init function */ + if (devuninit) + ret = devuninit(eventdev); + if (ret) + return ret; + + /* Free event device */ + rte_event_pmd_release(eventdev); + + eventdev->dev = NULL; + + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EVENTDEV_PMD_PCI_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd_vdev.h b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd_vdev.h new file mode 100644 index 00000000..8c64a067 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_pmd_vdev.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2017 Cavium, Inc + */ + +#ifndef _RTE_EVENTDEV_PMD_VDEV_H_ +#define _RTE_EVENTDEV_PMD_VDEV_H_ + +/** @file + * RTE Eventdev VDEV PMD APIs + * + * @note + * These API are from event VDEV PMD only and user applications should not call + * them directly. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include +#include + +#include "rte_eventdev_pmd.h" + +/** + * @internal + * Creates a new virtual event device and returns the pointer to that device. + * + * @param name + * PMD type name + * @param dev_private_size + * Size of event PMDs private data + * @param socket_id + * Socket to allocate resources on. + * + * @return + * - Eventdev pointer if device is successfully created. + * - NULL if device cannot be created. 
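A sketch of a virtual-device PMD probe path built on the helper documented above; sketch_priv and sketch_ops are hypothetical driver symbols:

static int
sketch_vdev_probe(struct rte_vdev_device *vdev)
{
        struct rte_eventdev *dev;

        dev = rte_event_pmd_vdev_init(rte_vdev_device_name(vdev),
                                      sizeof(struct sketch_priv),
                                      rte_socket_id());
        if (dev == NULL)
                return -ENOMEM;

        dev->dev_ops = &sketch_ops;     /* hook up the rte_eventdev_ops table */
        return 0;
}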
+ */ +static inline struct rte_eventdev * +rte_event_pmd_vdev_init(const char *name, size_t dev_private_size, + int socket_id) +{ + + struct rte_eventdev *eventdev; + + /* Allocate device structure */ + eventdev = rte_event_pmd_allocate(name, socket_id); + if (eventdev == NULL) + return NULL; + + /* Allocate private device structure */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + eventdev->data->dev_private = + rte_zmalloc_socket("eventdev device private", + dev_private_size, + RTE_CACHE_LINE_SIZE, + socket_id); + + if (eventdev->data->dev_private == NULL) + rte_panic("Cannot allocate memzone for private device" + " data"); + } + + return eventdev; +} + +/** + * @internal + * Destroy the given virtual event device + * + * @param name + * PMD type name + * @return + * - 0 on success, negative on error + */ +static inline int +rte_event_pmd_vdev_uninit(const char *name) +{ + int ret; + struct rte_eventdev *eventdev; + + if (name == NULL) + return -EINVAL; + + eventdev = rte_event_pmd_get_named_dev(name); + if (eventdev == NULL) + return -ENODEV; + + ret = rte_event_dev_close(eventdev->data->dev_id); + if (ret < 0) + return ret; + + /* Free the event device */ + rte_event_pmd_release(eventdev); + + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EVENTDEV_PMD_VDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_version.map b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_version.map new file mode 100644 index 00000000..12835e9f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eventdev/rte_eventdev_version.map @@ -0,0 +1,113 @@ +DPDK_17.05 { + global: + + rte_eventdevs; + + rte_event_dev_count; + rte_event_dev_get_dev_id; + rte_event_dev_socket_id; + rte_event_dev_info_get; + rte_event_dev_configure; + rte_event_dev_start; + rte_event_dev_stop; + rte_event_dev_close; + rte_event_dev_dump; + rte_event_dev_xstats_by_name_get; + rte_event_dev_xstats_get; + rte_event_dev_xstats_names_get; + rte_event_dev_xstats_reset; + + rte_event_port_default_conf_get; + rte_event_port_setup; + rte_event_port_link; + rte_event_port_unlink; + rte_event_port_links_get; + + rte_event_queue_default_conf_get; + rte_event_queue_setup; + + rte_event_dequeue_timeout_ticks; + + rte_event_pmd_allocate; + rte_event_pmd_release; + rte_event_pmd_vdev_init; + rte_event_pmd_vdev_uninit; + rte_event_pmd_pci_probe; + rte_event_pmd_pci_remove; + + local: *; +}; + +DPDK_17.08 { + global: + + rte_event_ring_create; + rte_event_ring_free; + rte_event_ring_init; + rte_event_ring_lookup; +} DPDK_17.05; + +DPDK_17.11 { + global: + + rte_event_dev_attr_get; + rte_event_dev_service_id_get; + rte_event_port_attr_get; + rte_event_queue_attr_get; + + rte_event_eth_rx_adapter_caps_get; + rte_event_eth_rx_adapter_create; + rte_event_eth_rx_adapter_create_ext; + rte_event_eth_rx_adapter_free; + rte_event_eth_rx_adapter_queue_add; + rte_event_eth_rx_adapter_queue_del; + rte_event_eth_rx_adapter_service_id_get; + rte_event_eth_rx_adapter_start; + rte_event_eth_rx_adapter_stats_get; + rte_event_eth_rx_adapter_stats_reset; + rte_event_eth_rx_adapter_stop; +} DPDK_17.08; + +DPDK_18.02 { + global: + + rte_event_dev_selftest; +} DPDK_17.11; + +DPDK_18.05 { + global: + + rte_event_dev_stop_flush_callback_register; +} DPDK_18.02; + +EXPERIMENTAL { + global: + + rte_event_crypto_adapter_caps_get; + rte_event_crypto_adapter_create; + rte_event_crypto_adapter_create_ext; + rte_event_crypto_adapter_event_port_get; + rte_event_crypto_adapter_free; + rte_event_crypto_adapter_queue_pair_add; + 
rte_event_crypto_adapter_queue_pair_del; + rte_event_crypto_adapter_service_id_get; + rte_event_crypto_adapter_start; + rte_event_crypto_adapter_stats_get; + rte_event_crypto_adapter_stats_reset; + rte_event_crypto_adapter_stop; + rte_event_eth_rx_adapter_cb_register; + rte_event_timer_adapter_caps_get; + rte_event_timer_adapter_create; + rte_event_timer_adapter_create_ext; + rte_event_timer_adapter_free; + rte_event_timer_adapter_get_info; + rte_event_timer_adapter_lookup; + rte_event_timer_adapter_service_id_get; + rte_event_timer_adapter_start; + rte_event_timer_adapter_stats_get; + rte_event_timer_adapter_stats_reset; + rte_event_timer_adapter_stop; + rte_event_timer_arm_burst; + rte_event_timer_arm_tmo_tick_burst; + rte_event_timer_cancel_burst; +}; diff --git a/src/spdk/dpdk/lib/librte_flow_classify/Makefile b/src/spdk/dpdk/lib/librte_flow_classify/Makefile new file mode 100644 index 00000000..fe9fc47a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_flow_classify/Makefile @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_flow_classify.a + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) + +EXPORT_MAP := rte_flow_classify_version.map + +LIBABIVER := 1 + +LDLIBS += -lrte_eal -lrte_ethdev -lrte_net -lrte_table -lrte_acl + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += rte_flow_classify.c +SRCS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += rte_flow_classify_parse.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY)-include := rte_flow_classify.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_flow_classify/meson.build b/src/spdk/dpdk/lib/librte_flow_classify/meson.build new file mode 100644 index 00000000..d7e48747 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_flow_classify/meson.build @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +allow_experimental_apis = true +sources = files('rte_flow_classify.c', 'rte_flow_classify_parse.c') +headers = files('rte_flow_classify.h') +deps += ['net', 'table'] diff --git a/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify.c b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify.c new file mode 100644 index 00000000..4c3469da --- /dev/null +++ b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify.c @@ -0,0 +1,682 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include "rte_flow_classify_parse.h" +#include +#include +#include + +int librte_flow_classify_logtype; + +static uint32_t unique_id = 1; + +enum rte_flow_classify_table_type table_type + = RTE_FLOW_CLASSIFY_TABLE_TYPE_NONE; + +struct rte_flow_classify_table_entry { + /* meta-data for classify rule */ + uint32_t rule_id; + + /* Flow action */ + struct classify_action action; +}; + +struct rte_cls_table { + /* Input parameters */ + struct rte_table_ops ops; + uint32_t entry_size; + enum rte_flow_classify_table_type type; + + /* Handle to the low-level table object */ + void *h_table; +}; + +#define RTE_FLOW_CLASSIFIER_MAX_NAME_SZ 256 + +struct rte_flow_classifier { + /* Input parameters */ + char name[RTE_FLOW_CLASSIFIER_MAX_NAME_SZ]; + int socket_id; + + /* Internal */ + /* ntuple_filter */ + struct rte_eth_ntuple_filter ntuple_filter; + + /* classifier tables */ + struct rte_cls_table tables[RTE_FLOW_CLASSIFY_TABLE_MAX]; + uint32_t table_mask; + 
uint32_t num_tables; + + uint16_t nb_pkts; + struct rte_flow_classify_table_entry + *entries[RTE_PORT_IN_BURST_SIZE_MAX]; +} __rte_cache_aligned; + +enum { + PROTO_FIELD_IPV4, + SRC_FIELD_IPV4, + DST_FIELD_IPV4, + SRCP_FIELD_IPV4, + DSTP_FIELD_IPV4, + NUM_FIELDS_IPV4 +}; + +struct acl_keys { + struct rte_table_acl_rule_add_params key_add; /* add key */ + struct rte_table_acl_rule_delete_params key_del; /* delete key */ +}; + +struct classify_rules { + enum rte_flow_classify_rule_type type; + union { + struct rte_flow_classify_ipv4_5tuple ipv4_5tuple; + } u; +}; + +struct rte_flow_classify_rule { + uint32_t id; /* unique ID of classify rule */ + enum rte_flow_classify_table_type tbl_type; /* rule table */ + struct classify_rules rules; /* union of rules */ + union { + struct acl_keys key; + } u; + int key_found; /* rule key found in table */ + struct rte_flow_classify_table_entry entry; /* rule meta data */ + void *entry_ptr; /* handle to the table entry for rule meta data */ +}; + +int __rte_experimental +rte_flow_classify_validate( + struct rte_flow_classifier *cls, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct rte_flow_item *items; + parse_filter_t parse_filter; + uint32_t item_num = 0; + uint32_t i = 0; + int ret; + + if (error == NULL) + return -EINVAL; + + if (cls == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: rte_flow_classifier parameter is NULL\n", + __func__); + return -EINVAL; + } + + if (!attr) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR, + NULL, "NULL attribute."); + return -EINVAL; + } + + if (!pattern) { + rte_flow_error_set(error, + EINVAL, RTE_FLOW_ERROR_TYPE_ITEM_NUM, + NULL, "NULL pattern."); + return -EINVAL; + } + + if (!actions) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_NUM, + NULL, "NULL action."); + return -EINVAL; + } + + memset(&cls->ntuple_filter, 0, sizeof(cls->ntuple_filter)); + + /* Get the non-void item number of pattern */ + while ((pattern + i)->type != RTE_FLOW_ITEM_TYPE_END) { + if ((pattern + i)->type != RTE_FLOW_ITEM_TYPE_VOID) + item_num++; + i++; + } + item_num++; + + items = malloc(item_num * sizeof(struct rte_flow_item)); + if (!items) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_ITEM_NUM, + NULL, "No memory for pattern items."); + return -ENOMEM; + } + + memset(items, 0, item_num * sizeof(struct rte_flow_item)); + classify_pattern_skip_void_item(items, pattern); + + parse_filter = classify_find_parse_filter_func(items); + if (!parse_filter) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + pattern, "Unsupported pattern"); + free(items); + return -EINVAL; + } + + ret = parse_filter(attr, items, actions, &cls->ntuple_filter, error); + free(items); + return ret; +} + + +#define uint32_t_to_char(ip, a, b, c, d) do {\ + *a = (unsigned char)(ip >> 24 & 0xff);\ + *b = (unsigned char)(ip >> 16 & 0xff);\ + *c = (unsigned char)(ip >> 8 & 0xff);\ + *d = (unsigned char)(ip & 0xff);\ + } while (0) + +static inline void +print_acl_ipv4_key_add(struct rte_table_acl_rule_add_params *key) +{ + unsigned char a, b, c, d; + + printf("%s: 0x%02hhx/0x%hhx ", __func__, + key->field_value[PROTO_FIELD_IPV4].value.u8, + key->field_value[PROTO_FIELD_IPV4].mask_range.u8); + + uint32_t_to_char(key->field_value[SRC_FIELD_IPV4].value.u32, + &a, &b, &c, &d); + printf(" %hhu.%hhu.%hhu.%hhu/0x%x ", a, b, c, d, + key->field_value[SRC_FIELD_IPV4].mask_range.u32); + + 
uint32_t_to_char(key->field_value[DST_FIELD_IPV4].value.u32, + &a, &b, &c, &d); + printf("%hhu.%hhu.%hhu.%hhu/0x%x ", a, b, c, d, + key->field_value[DST_FIELD_IPV4].mask_range.u32); + + printf("%hu : 0x%x %hu : 0x%x", + key->field_value[SRCP_FIELD_IPV4].value.u16, + key->field_value[SRCP_FIELD_IPV4].mask_range.u16, + key->field_value[DSTP_FIELD_IPV4].value.u16, + key->field_value[DSTP_FIELD_IPV4].mask_range.u16); + + printf(" priority: 0x%x\n", key->priority); +} + +static inline void +print_acl_ipv4_key_delete(struct rte_table_acl_rule_delete_params *key) +{ + unsigned char a, b, c, d; + + printf("%s: 0x%02hhx/0x%hhx ", __func__, + key->field_value[PROTO_FIELD_IPV4].value.u8, + key->field_value[PROTO_FIELD_IPV4].mask_range.u8); + + uint32_t_to_char(key->field_value[SRC_FIELD_IPV4].value.u32, + &a, &b, &c, &d); + printf(" %hhu.%hhu.%hhu.%hhu/0x%x ", a, b, c, d, + key->field_value[SRC_FIELD_IPV4].mask_range.u32); + + uint32_t_to_char(key->field_value[DST_FIELD_IPV4].value.u32, + &a, &b, &c, &d); + printf("%hhu.%hhu.%hhu.%hhu/0x%x ", a, b, c, d, + key->field_value[DST_FIELD_IPV4].mask_range.u32); + + printf("%hu : 0x%x %hu : 0x%x\n", + key->field_value[SRCP_FIELD_IPV4].value.u16, + key->field_value[SRCP_FIELD_IPV4].mask_range.u16, + key->field_value[DSTP_FIELD_IPV4].value.u16, + key->field_value[DSTP_FIELD_IPV4].mask_range.u16); +} + +static int +rte_flow_classifier_check_params(struct rte_flow_classifier_params *params) +{ + if (params == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: Incorrect value for parameter params\n", __func__); + return -EINVAL; + } + + /* name */ + if (params->name == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: Incorrect value for parameter name\n", __func__); + return -EINVAL; + } + + /* socket */ + if ((params->socket_id < 0) || + (params->socket_id >= RTE_MAX_NUMA_NODES)) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: Incorrect value for parameter socket_id\n", + __func__); + return -EINVAL; + } + + return 0; +} + +struct rte_flow_classifier * __rte_experimental +rte_flow_classifier_create(struct rte_flow_classifier_params *params) +{ + struct rte_flow_classifier *cls; + int ret; + + /* Check input parameters */ + ret = rte_flow_classifier_check_params(params); + if (ret != 0) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: flow classifier params check failed (%d)\n", + __func__, ret); + return NULL; + } + + /* Allocate memory for the flow classifier */ + cls = rte_zmalloc_socket("FLOW_CLASSIFIER", + sizeof(struct rte_flow_classifier), + RTE_CACHE_LINE_SIZE, params->socket_id); + + if (cls == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: flow classifier memory allocation failed\n", + __func__); + return NULL; + } + + /* Save input parameters */ + snprintf(cls->name, RTE_FLOW_CLASSIFIER_MAX_NAME_SZ, "%s", + params->name); + + cls->socket_id = params->socket_id; + + return cls; +} + +static void +rte_flow_classify_table_free(struct rte_cls_table *table) +{ + if (table->ops.f_free != NULL) + table->ops.f_free(table->h_table); +} + +int __rte_experimental +rte_flow_classifier_free(struct rte_flow_classifier *cls) +{ + uint32_t i; + + /* Check input parameters */ + if (cls == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: rte_flow_classifier parameter is NULL\n", + __func__); + return -EINVAL; + } + + /* Free tables */ + for (i = 0; i < cls->num_tables; i++) { + struct rte_cls_table *table = &cls->tables[i]; + + rte_flow_classify_table_free(table); + } + + /* Free flow classifier memory */ + rte_free(cls); + + return 0; +} + +static int +rte_table_check_params(struct rte_flow_classifier *cls, 
+ struct rte_flow_classify_table_params *params) +{ + if (cls == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: flow classifier parameter is NULL\n", + __func__); + return -EINVAL; + } + if (params == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, "%s: params parameter is NULL\n", + __func__); + return -EINVAL; + } + + /* ops */ + if (params->ops == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, "%s: params->ops is NULL\n", + __func__); + return -EINVAL; + } + + if (params->ops->f_create == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: f_create function pointer is NULL\n", __func__); + return -EINVAL; + } + + if (params->ops->f_lookup == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: f_lookup function pointer is NULL\n", __func__); + return -EINVAL; + } + + /* De we have room for one more table? */ + if (cls->num_tables == RTE_FLOW_CLASSIFY_TABLE_MAX) { + RTE_FLOW_CLASSIFY_LOG(ERR, + "%s: Incorrect value for num_tables parameter\n", + __func__); + return -EINVAL; + } + + return 0; +} + +int __rte_experimental +rte_flow_classify_table_create(struct rte_flow_classifier *cls, + struct rte_flow_classify_table_params *params) +{ + struct rte_cls_table *table; + void *h_table; + uint32_t entry_size; + int ret; + + /* Check input arguments */ + ret = rte_table_check_params(cls, params); + if (ret != 0) + return ret; + + /* calculate table entry size */ + entry_size = sizeof(struct rte_flow_classify_table_entry); + + /* Create the table */ + h_table = params->ops->f_create(params->arg_create, cls->socket_id, + entry_size); + if (h_table == NULL) { + RTE_FLOW_CLASSIFY_LOG(ERR, "%s: Table creation failed\n", + __func__); + return -EINVAL; + } + + /* Commit current table to the classifier */ + table = &cls->tables[cls->num_tables]; + table->type = params->type; + cls->num_tables++; + + /* Save input parameters */ + memcpy(&table->ops, params->ops, sizeof(struct rte_table_ops)); + + /* Initialize table internal data structure */ + table->entry_size = entry_size; + table->h_table = h_table; + + return 0; +} + +static struct rte_flow_classify_rule * +allocate_acl_ipv4_5tuple_rule(struct rte_flow_classifier *cls) +{ + struct rte_flow_classify_rule *rule; + int log_level; + + rule = malloc(sizeof(struct rte_flow_classify_rule)); + if (!rule) + return rule; + + memset(rule, 0, sizeof(struct rte_flow_classify_rule)); + rule->id = unique_id++; + rule->rules.type = RTE_FLOW_CLASSIFY_RULE_TYPE_IPV4_5TUPLE; + + /* key add values */ + rule->u.key.key_add.priority = cls->ntuple_filter.priority; + rule->u.key.key_add.field_value[PROTO_FIELD_IPV4].mask_range.u8 = + cls->ntuple_filter.proto_mask; + rule->u.key.key_add.field_value[PROTO_FIELD_IPV4].value.u8 = + cls->ntuple_filter.proto; + rule->rules.u.ipv4_5tuple.proto = cls->ntuple_filter.proto; + rule->rules.u.ipv4_5tuple.proto_mask = cls->ntuple_filter.proto_mask; + + rule->u.key.key_add.field_value[SRC_FIELD_IPV4].mask_range.u32 = + cls->ntuple_filter.src_ip_mask; + rule->u.key.key_add.field_value[SRC_FIELD_IPV4].value.u32 = + cls->ntuple_filter.src_ip; + rule->rules.u.ipv4_5tuple.src_ip_mask = cls->ntuple_filter.src_ip_mask; + rule->rules.u.ipv4_5tuple.src_ip = cls->ntuple_filter.src_ip; + + rule->u.key.key_add.field_value[DST_FIELD_IPV4].mask_range.u32 = + cls->ntuple_filter.dst_ip_mask; + rule->u.key.key_add.field_value[DST_FIELD_IPV4].value.u32 = + cls->ntuple_filter.dst_ip; + rule->rules.u.ipv4_5tuple.dst_ip_mask = cls->ntuple_filter.dst_ip_mask; + rule->rules.u.ipv4_5tuple.dst_ip = cls->ntuple_filter.dst_ip; + + rule->u.key.key_add.field_value[SRCP_FIELD_IPV4].mask_range.u16 = + 
cls->ntuple_filter.src_port_mask; + rule->u.key.key_add.field_value[SRCP_FIELD_IPV4].value.u16 = + cls->ntuple_filter.src_port; + rule->rules.u.ipv4_5tuple.src_port_mask = + cls->ntuple_filter.src_port_mask; + rule->rules.u.ipv4_5tuple.src_port = cls->ntuple_filter.src_port; + + rule->u.key.key_add.field_value[DSTP_FIELD_IPV4].mask_range.u16 = + cls->ntuple_filter.dst_port_mask; + rule->u.key.key_add.field_value[DSTP_FIELD_IPV4].value.u16 = + cls->ntuple_filter.dst_port; + rule->rules.u.ipv4_5tuple.dst_port_mask = + cls->ntuple_filter.dst_port_mask; + rule->rules.u.ipv4_5tuple.dst_port = cls->ntuple_filter.dst_port; + + log_level = rte_log_get_level(librte_flow_classify_logtype); + + if (log_level == RTE_LOG_DEBUG) + print_acl_ipv4_key_add(&rule->u.key.key_add); + + /* key delete values */ + memcpy(&rule->u.key.key_del.field_value[PROTO_FIELD_IPV4], + &rule->u.key.key_add.field_value[PROTO_FIELD_IPV4], + NUM_FIELDS_IPV4 * sizeof(struct rte_acl_field)); + + if (log_level == RTE_LOG_DEBUG) + print_acl_ipv4_key_delete(&rule->u.key.key_del); + + return rule; +} + +struct rte_flow_classify_rule * __rte_experimental +rte_flow_classify_table_entry_add(struct rte_flow_classifier *cls, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + int *key_found, + struct rte_flow_error *error) +{ + struct rte_flow_classify_rule *rule; + struct rte_flow_classify_table_entry *table_entry; + struct classify_action *action; + uint32_t i; + int ret; + + if (!error) + return NULL; + + if (key_found == NULL) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "NULL key_found."); + return NULL; + } + + /* parse attr, pattern and actions */ + ret = rte_flow_classify_validate(cls, attr, pattern, actions, error); + if (ret < 0) + return NULL; + + switch (table_type) { + case RTE_FLOW_CLASSIFY_TABLE_ACL_IP4_5TUPLE: + rule = allocate_acl_ipv4_5tuple_rule(cls); + if (!rule) + return NULL; + rule->tbl_type = table_type; + cls->table_mask |= table_type; + break; + default: + return NULL; + } + + action = classify_get_flow_action(); + table_entry = &rule->entry; + table_entry->rule_id = rule->id; + table_entry->action.action_mask = action->action_mask; + + /* Copy actions */ + if (action->action_mask & (1LLU << RTE_FLOW_ACTION_TYPE_COUNT)) { + memcpy(&table_entry->action.act.counter, &action->act.counter, + sizeof(table_entry->action.act.counter)); + } + if (action->action_mask & (1LLU << RTE_FLOW_ACTION_TYPE_MARK)) { + memcpy(&table_entry->action.act.mark, &action->act.mark, + sizeof(table_entry->action.act.mark)); + } + + for (i = 0; i < cls->num_tables; i++) { + struct rte_cls_table *table = &cls->tables[i]; + + if (table->type == table_type) { + if (table->ops.f_add != NULL) { + ret = table->ops.f_add( + table->h_table, + &rule->u.key.key_add, + &rule->entry, + &rule->key_found, + &rule->entry_ptr); + if (ret) { + free(rule); + return NULL; + } + + *key_found = rule->key_found; + } + + return rule; + } + } + free(rule); + return NULL; +} + +int __rte_experimental +rte_flow_classify_table_entry_delete(struct rte_flow_classifier *cls, + struct rte_flow_classify_rule *rule) +{ + uint32_t i; + int ret = -EINVAL; + + if (!cls || !rule) + return ret; + enum rte_flow_classify_table_type tbl_type = rule->tbl_type; + + for (i = 0; i < cls->num_tables; i++) { + struct rte_cls_table *table = &cls->tables[i]; + + if (table->type == tbl_type) { + if (table->ops.f_delete != NULL) { + ret = table->ops.f_delete(table->h_table, + 
&rule->u.key.key_del, + &rule->key_found, + &rule->entry); + + return ret; + } + } + } + free(rule); + return ret; +} + +static int +flow_classifier_lookup(struct rte_flow_classifier *cls, + struct rte_cls_table *table, + struct rte_mbuf **pkts, + const uint16_t nb_pkts) +{ + int ret = -EINVAL; + uint64_t pkts_mask; + uint64_t lookup_hit_mask; + + pkts_mask = RTE_LEN2MASK(nb_pkts, uint64_t); + ret = table->ops.f_lookup(table->h_table, + pkts, pkts_mask, &lookup_hit_mask, + (void **)cls->entries); + + if (!ret && lookup_hit_mask) + cls->nb_pkts = nb_pkts; + else + cls->nb_pkts = 0; + + return ret; +} + +static int +action_apply(struct rte_flow_classifier *cls, + struct rte_flow_classify_rule *rule, + struct rte_flow_classify_stats *stats) +{ + struct rte_flow_classify_ipv4_5tuple_stats *ntuple_stats; + struct rte_flow_classify_table_entry *entry = &rule->entry; + uint64_t count = 0; + uint32_t action_mask = entry->action.action_mask; + int i, ret = -EINVAL; + + if (action_mask & (1LLU << RTE_FLOW_ACTION_TYPE_COUNT)) { + for (i = 0; i < cls->nb_pkts; i++) { + if (rule->id == cls->entries[i]->rule_id) + count++; + } + if (count) { + ret = 0; + ntuple_stats = stats->stats; + ntuple_stats->counter1 = count; + ntuple_stats->ipv4_5tuple = rule->rules.u.ipv4_5tuple; + } + } + return ret; +} + +int __rte_experimental +rte_flow_classifier_query(struct rte_flow_classifier *cls, + struct rte_mbuf **pkts, + const uint16_t nb_pkts, + struct rte_flow_classify_rule *rule, + struct rte_flow_classify_stats *stats) +{ + enum rte_flow_classify_table_type tbl_type; + uint32_t i; + int ret = -EINVAL; + + if (!cls || !rule || !stats || !pkts || nb_pkts == 0) + return ret; + + tbl_type = rule->tbl_type; + for (i = 0; i < cls->num_tables; i++) { + struct rte_cls_table *table = &cls->tables[i]; + + if (table->type == tbl_type) { + ret = flow_classifier_lookup(cls, table, + pkts, nb_pkts); + if (!ret) { + ret = action_apply(cls, rule, stats); + return ret; + } + } + } + return ret; +} + +RTE_INIT(librte_flow_classify_init_log) +{ + librte_flow_classify_logtype = + rte_log_register("lib.flow_classify"); + if (librte_flow_classify_logtype >= 0) + rte_log_set_level(librte_flow_classify_logtype, RTE_LOG_INFO); +} diff --git a/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify.h b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify.h new file mode 100644 index 00000000..56e06353 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify.h @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_FLOW_CLASSIFY_H_ +#define _RTE_FLOW_CLASSIFY_H_ + +/** + * @file + * + * RTE Flow Classify Library + * + * @b EXPERIMENTAL: this API may change without prior notice + * + * This library provides flow record information with some measured properties. + * + * Application should define the flow and measurement criteria (action) for it. + * + * The Library doesn't maintain any flow records itself, instead flow + * information is returned to upper layer only for given packets. + * + * It is application's responsibility to call rte_flow_classifier_query() + * for a burst of packets, just after receiving them or before transmitting + * them. + * Application should provide the flow type interested in, measurement to apply + * to that flow in rte_flow_classify_table_entry_add() API, and should provide + * the rte_flow_classifier object and storage to put results in for the + * rte_flow_classifier_query() API. 
+ * + * Usage: + * - application calls rte_flow_classifier_create() to create an + * rte_flow_classifier object. + * - application calls rte_flow_classify_table_create() to create a table + * in the rte_flow_classifier object. + * - application calls rte_flow_classify_table_entry_add() to add a rule to + * the table in the rte_flow_classifier object. + * - application calls rte_flow_classifier_query() in a polling manner, + * preferably after rte_eth_rx_burst(). This will cause the library to + * match packet information to flow information with some measurements. + * - rte_flow_classifier object can be destroyed when it is no longer needed + * with rte_flow_classifier_free() + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int librte_flow_classify_logtype; + +#define RTE_FLOW_CLASSIFY_LOG(level, ...) \ + rte_log(RTE_LOG_ ## level, \ + librte_flow_classify_logtype, \ + RTE_FMT("%s(): " RTE_FMT_HEAD(__VA_ARGS__,), \ + __func__, \ + RTE_FMT_TAIL(__VA_ARGS__,))) + +#ifndef RTE_FLOW_CLASSIFY_TABLE_MAX +#define RTE_FLOW_CLASSIFY_TABLE_MAX 32 +#endif + +/** Opaque data type for flow classifier */ +struct rte_flow_classifier; + +/** Opaque data type for flow classify rule */ +struct rte_flow_classify_rule; + +/** Flow classify rule type */ +enum rte_flow_classify_rule_type { + /** no type */ + RTE_FLOW_CLASSIFY_RULE_TYPE_NONE, + /** IPv4 5tuple type */ + RTE_FLOW_CLASSIFY_RULE_TYPE_IPV4_5TUPLE, +}; + +/** Flow classify table type */ +enum rte_flow_classify_table_type { + /** No type */ + RTE_FLOW_CLASSIFY_TABLE_TYPE_NONE = 1 << 0, + /** ACL IP4 5TUPLE */ + RTE_FLOW_CLASSIFY_TABLE_ACL_IP4_5TUPLE = 1 << 1, + /** ACL VLAN IP4 5TUPLE */ + RTE_FLOW_CLASSIFY_TABLE_ACL_VLAN_IP4_5TUPLE = 1 << 2, + /** ACL QinQ IP4 5TUPLE */ + RTE_FLOW_CLASSIFY_TABLE_ACL_QINQ_IP4_5TUPLE = 1 << 3, + +}; + +/** Parameters for flow classifier creation */ +struct rte_flow_classifier_params { + /** flow classifier name */ + const char *name; + + /** CPU socket ID where memory for the flow classifier and its */ + /** elements (tables) should be allocated */ + int socket_id; +}; + +/** Parameters for table creation */ +struct rte_flow_classify_table_params { + /** Table operations (specific to each table type) */ + struct rte_table_ops *ops; + + /** Opaque param to be passed to the table create operation */ + void *arg_create; + + /** Classifier table type */ + enum rte_flow_classify_table_type type; +}; + +/** IPv4 5-tuple data */ +struct rte_flow_classify_ipv4_5tuple { + uint32_t dst_ip; /**< Destination IP address in big endian. */ + uint32_t dst_ip_mask; /**< Mask of destination IP address. */ + uint32_t src_ip; /**< Source IP address in big endian. */ + uint32_t src_ip_mask; /**< Mask of destination IP address. */ + uint16_t dst_port; /**< Destination port in big endian. */ + uint16_t dst_port_mask; /**< Mask of destination port. */ + uint16_t src_port; /**< Source Port in big endian. */ + uint16_t src_port_mask; /**< Mask of source port. */ + uint8_t proto; /**< L4 protocol. */ + uint8_t proto_mask; /**< Mask of L4 protocol. */ +}; + +/** + * Flow stats + * + * For the count action, stats can be returned by the query API. + * + * Storage for stats is provided by application. 
+ */ +struct rte_flow_classify_stats { + void *stats; +}; + +struct rte_flow_classify_ipv4_5tuple_stats { + /** count of packets that match IPv4 5tuple pattern */ + uint64_t counter1; + /** IPv4 5tuple data */ + struct rte_flow_classify_ipv4_5tuple ipv4_5tuple; +}; + +/** + * Flow classifier create + * + * @param params + * Parameters for flow classifier creation + * @return + * Handle to flow classifier instance on success or NULL otherwise + */ +struct rte_flow_classifier * __rte_experimental +rte_flow_classifier_create(struct rte_flow_classifier_params *params); + +/** + * Flow classifier free + * + * @param cls + * Handle to flow classifier instance + * @return + * 0 on success, error code otherwise + */ +int __rte_experimental +rte_flow_classifier_free(struct rte_flow_classifier *cls); + +/** + * Flow classify table create + * + * @param cls + * Handle to flow classifier instance + * @param params + * Parameters for flow_classify table creation + * @return + * 0 on success, error code otherwise + */ +int __rte_experimental +rte_flow_classify_table_create(struct rte_flow_classifier *cls, + struct rte_flow_classify_table_params *params); + +/** + * Flow classify validate + * + * @param cls + * Handle to flow classifier instance + * @param[in] attr + * Flow rule attributes + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END pattern item). + * @param[out] error + * Perform verbose error reporting if not NULL. Structure + * initialised in case of error only. + * @return + * 0 on success, error code otherwise + */ +int __rte_experimental +rte_flow_classify_validate(struct rte_flow_classifier *cls, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); + +/** + * Add a flow classify rule to the flow_classifer table. + * + * @param[in] cls + * Flow classifier handle + * @param[in] attr + * Flow rule attributes + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END pattern item). + * @param[out] key_found + * returns 1 if rule present already, 0 otherwise. + * @param[out] error + * Perform verbose error reporting if not NULL. Structure + * initialised in case of error only. + * @return + * A valid handle in case of success, NULL otherwise. + */ +struct rte_flow_classify_rule * __rte_experimental +rte_flow_classify_table_entry_add(struct rte_flow_classifier *cls, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + int *key_found, + struct rte_flow_error *error); + +/** + * Delete a flow classify rule from the flow_classifer table. + * + * @param[in] cls + * Flow classifier handle + * @param[in] rule + * Flow classify rule + * @return + * 0 on success, error code otherwise. + */ +int __rte_experimental +rte_flow_classify_table_entry_delete(struct rte_flow_classifier *cls, + struct rte_flow_classify_rule *rule); + +/** + * Query flow classifier for given rule. + * + * @param[in] cls + * Flow classifier handle + * @param[in] pkts + * Pointer to packets to process + * @param[in] nb_pkts + * Number of packets to process + * @param[in] rule + * Flow classify rule + * @param[in] stats + * Flow classify stats + * + * @return + * 0 on success, error code otherwise. 
+ */ +int __rte_experimental +rte_flow_classifier_query(struct rte_flow_classifier *cls, + struct rte_mbuf **pkts, + const uint16_t nb_pkts, + struct rte_flow_classify_rule *rule, + struct rte_flow_classify_stats *stats); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_FLOW_CLASSIFY_H_ */ diff --git a/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_parse.c b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_parse.c new file mode 100644 index 00000000..f65ceaf7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_parse.c @@ -0,0 +1,535 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include "rte_flow_classify_parse.h" +#include + +struct classify_valid_pattern { + enum rte_flow_item_type *items; + parse_filter_t parse_filter; +}; + +static struct classify_action action; + +/* Pattern for IPv4 5-tuple UDP filter */ +static enum rte_flow_item_type pattern_ntuple_1[] = { + RTE_FLOW_ITEM_TYPE_ETH, + RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_UDP, + RTE_FLOW_ITEM_TYPE_END, +}; + +/* Pattern for IPv4 5-tuple TCP filter */ +static enum rte_flow_item_type pattern_ntuple_2[] = { + RTE_FLOW_ITEM_TYPE_ETH, + RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_TCP, + RTE_FLOW_ITEM_TYPE_END, +}; + +/* Pattern for IPv4 5-tuple SCTP filter */ +static enum rte_flow_item_type pattern_ntuple_3[] = { + RTE_FLOW_ITEM_TYPE_ETH, + RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_SCTP, + RTE_FLOW_ITEM_TYPE_END, +}; + +static int +classify_parse_ntuple_filter(const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_eth_ntuple_filter *filter, + struct rte_flow_error *error); + +static struct classify_valid_pattern classify_supported_patterns[] = { + /* ntuple */ + { pattern_ntuple_1, classify_parse_ntuple_filter }, + { pattern_ntuple_2, classify_parse_ntuple_filter }, + { pattern_ntuple_3, classify_parse_ntuple_filter }, +}; + +struct classify_action * +classify_get_flow_action(void) +{ + return &action; +} + +/* Find the first VOID or non-VOID item pointer */ +const struct rte_flow_item * +classify_find_first_item(const struct rte_flow_item *item, bool is_void) +{ + bool is_find; + + while (item->type != RTE_FLOW_ITEM_TYPE_END) { + if (is_void) + is_find = item->type == RTE_FLOW_ITEM_TYPE_VOID; + else + is_find = item->type != RTE_FLOW_ITEM_TYPE_VOID; + if (is_find) + break; + item++; + } + return item; +} + +/* Skip all VOID items of the pattern */ +void +classify_pattern_skip_void_item(struct rte_flow_item *items, + const struct rte_flow_item *pattern) +{ + uint32_t cpy_count = 0; + const struct rte_flow_item *pb = pattern, *pe = pattern; + + for (;;) { + /* Find a non-void item first */ + pb = classify_find_first_item(pb, false); + if (pb->type == RTE_FLOW_ITEM_TYPE_END) { + pe = pb; + break; + } + + /* Find a void item */ + pe = classify_find_first_item(pb + 1, true); + + cpy_count = pe - pb; + rte_memcpy(items, pb, sizeof(struct rte_flow_item) * cpy_count); + + items += cpy_count; + + if (pe->type == RTE_FLOW_ITEM_TYPE_END) { + pb = pe; + break; + } + + pb = pe + 1; + } + /* Copy the END item. 
*/ + rte_memcpy(items, pe, sizeof(struct rte_flow_item)); +} + +/* Check if the pattern matches a supported item type array */ +static bool +classify_match_pattern(enum rte_flow_item_type *item_array, + struct rte_flow_item *pattern) +{ + struct rte_flow_item *item = pattern; + + while ((*item_array == item->type) && + (*item_array != RTE_FLOW_ITEM_TYPE_END)) { + item_array++; + item++; + } + + return (*item_array == RTE_FLOW_ITEM_TYPE_END && + item->type == RTE_FLOW_ITEM_TYPE_END); +} + +/* Find if there's parse filter function matched */ +parse_filter_t +classify_find_parse_filter_func(struct rte_flow_item *pattern) +{ + parse_filter_t parse_filter = NULL; + uint8_t i = 0; + + for (; i < RTE_DIM(classify_supported_patterns); i++) { + if (classify_match_pattern(classify_supported_patterns[i].items, + pattern)) { + parse_filter = + classify_supported_patterns[i].parse_filter; + break; + } + } + + return parse_filter; +} + +#define FLOW_RULE_MIN_PRIORITY 8 +#define FLOW_RULE_MAX_PRIORITY 0 + +#define NEXT_ITEM_OF_PATTERN(item, pattern, index)\ + do {\ + item = pattern + index;\ + while (item->type == RTE_FLOW_ITEM_TYPE_VOID) {\ + index++;\ + item = pattern + index;\ + } \ + } while (0) + +#define NEXT_ITEM_OF_ACTION(act, actions, index)\ + do {\ + act = actions + index;\ + while (act->type == RTE_FLOW_ACTION_TYPE_VOID) {\ + index++;\ + act = actions + index;\ + } \ + } while (0) + +/** + * Please aware there's an assumption for all the parsers. + * rte_flow_item is using big endian, rte_flow_attr and + * rte_flow_action are using CPU order. + * Because the pattern is used to describe the packets, + * normally the packets should use network order. + */ + +/** + * Parse the rule to see if it is a n-tuple rule. + * And get the n-tuple filter info BTW. + * pattern: + * The first not void item can be ETH or IPV4. + * The second not void item must be IPV4 if the first one is ETH. + * The third not void item must be UDP or TCP. + * The next not void item must be END. + * action: + * The first not void action should be QUEUE. + * The next not void action should be END. + * pattern example: + * ITEM Spec Mask + * ETH NULL NULL + * IPV4 src_addr 192.168.1.20 0xFFFFFFFF + * dst_addr 192.167.3.50 0xFFFFFFFF + * next_proto_id 17 0xFF + * UDP/TCP/ src_port 80 0xFFFF + * SCTP dst_port 80 0xFFFF + * END + * other members in mask and spec should set to 0x00. + * item->last should be NULL. 
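For illustration, a pattern/action pair with the layout described above could be built as in this minimal sketch (the addresses, ports and helper name are placeholders; only the fields the filter supports are set, everything else stays zero):

#include <stdint.h>
#include <netinet/in.h>
#include <rte_byteorder.h>
#include <rte_flow.h>
#include <rte_flow_classify.h>

/* Hypothetical helper: add an IPv4/UDP 5-tuple rule with a COUNT action. */
static struct rte_flow_classify_rule *
example_add_udp_5tuple_rule(struct rte_flow_classifier *cls)
{
	static struct rte_flow_item_ipv4 ipv4_spec, ipv4_mask;
	static struct rte_flow_item_udp udp_spec, udp_mask;
	static struct rte_flow_action_count count; /* zeroed COUNT conf */
	struct rte_flow_attr attr = { .ingress = 1, .priority = 1 };
	struct rte_flow_error error;
	int key_found;

	/* IPv4: match source/destination address and L4 protocol only. */
	ipv4_spec.hdr.src_addr = rte_cpu_to_be_32(0xc0a80114); /* 192.168.1.20 */
	ipv4_spec.hdr.dst_addr = rte_cpu_to_be_32(0xc0a70332); /* 192.167.3.50 */
	ipv4_spec.hdr.next_proto_id = IPPROTO_UDP;
	ipv4_mask.hdr.src_addr = UINT32_MAX;
	ipv4_mask.hdr.dst_addr = UINT32_MAX;
	ipv4_mask.hdr.next_proto_id = UINT8_MAX;

	/* UDP: match both ports. */
	udp_spec.hdr.src_port = rte_cpu_to_be_16(80);
	udp_spec.hdr.dst_port = rte_cpu_to_be_16(80);
	udp_mask.hdr.src_port = UINT16_MAX;
	udp_mask.hdr.dst_port = UINT16_MAX;

	/* ETH carries no spec/mask; both arrays are END-terminated. */
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4,
		  .spec = &ipv4_spec, .mask = &ipv4_mask },
		{ .type = RTE_FLOW_ITEM_TYPE_UDP,
		  .spec = &udp_spec, .mask = &udp_mask },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_COUNT, .conf = &count },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	if (rte_flow_classify_validate(cls, &attr, pattern, actions, &error))
		return NULL;
	return rte_flow_classify_table_entry_add(cls, &attr, pattern,
			actions, &key_found, &error);
}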
+ */ +static int +classify_parse_ntuple_filter(const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_eth_ntuple_filter *filter, + struct rte_flow_error *error) +{ + const struct rte_flow_item *item; + const struct rte_flow_action *act; + const struct rte_flow_item_ipv4 *ipv4_spec; + const struct rte_flow_item_ipv4 *ipv4_mask; + const struct rte_flow_item_tcp *tcp_spec; + const struct rte_flow_item_tcp *tcp_mask; + const struct rte_flow_item_udp *udp_spec; + const struct rte_flow_item_udp *udp_mask; + const struct rte_flow_item_sctp *sctp_spec; + const struct rte_flow_item_sctp *sctp_mask; + const struct rte_flow_action_count *count; + const struct rte_flow_action_mark *mark_spec; + uint32_t index; + + /* parse pattern */ + index = 0; + + /* the first not void item can be MAC or IPv4 */ + NEXT_ITEM_OF_PATTERN(item, pattern, index); + + if (item->type != RTE_FLOW_ITEM_TYPE_ETH && + item->type != RTE_FLOW_ITEM_TYPE_IPV4) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Not supported by ntuple filter"); + return -EINVAL; + } + /* Skip Ethernet */ + if (item->type == RTE_FLOW_ITEM_TYPE_ETH) { + /*Not supported last point for range*/ + if (item->last) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + item, + "Not supported last point for range"); + return -EINVAL; + + } + /* if the first item is MAC, the content should be NULL */ + if (item->spec || item->mask) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "Not supported by ntuple filter"); + return -EINVAL; + } + /* check if the next not void item is IPv4 */ + index++; + NEXT_ITEM_OF_PATTERN(item, pattern, index); + if (item->type != RTE_FLOW_ITEM_TYPE_IPV4) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "Not supported by ntuple filter"); + return -EINVAL; + } + } + + /* get the IPv4 info */ + if (!item->spec || !item->mask) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Invalid ntuple mask"); + return -EINVAL; + } + /*Not supported last point for range*/ + if (item->last) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + item, "Not supported last point for range"); + return -EINVAL; + + } + + ipv4_mask = item->mask; + /** + * Only support src & dst addresses, protocol, + * others should be masked. 
+ */ + if (ipv4_mask->hdr.version_ihl || + ipv4_mask->hdr.type_of_service || + ipv4_mask->hdr.total_length || + ipv4_mask->hdr.packet_id || + ipv4_mask->hdr.fragment_offset || + ipv4_mask->hdr.time_to_live || + ipv4_mask->hdr.hdr_checksum) { + rte_flow_error_set(error, + EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, + item, "Not supported by ntuple filter"); + return -EINVAL; + } + + filter->dst_ip_mask = ipv4_mask->hdr.dst_addr; + filter->src_ip_mask = ipv4_mask->hdr.src_addr; + filter->proto_mask = ipv4_mask->hdr.next_proto_id; + + ipv4_spec = item->spec; + filter->dst_ip = ipv4_spec->hdr.dst_addr; + filter->src_ip = ipv4_spec->hdr.src_addr; + filter->proto = ipv4_spec->hdr.next_proto_id; + + /* check if the next not void item is TCP or UDP or SCTP */ + index++; + NEXT_ITEM_OF_PATTERN(item, pattern, index); + if (item->type != RTE_FLOW_ITEM_TYPE_TCP && + item->type != RTE_FLOW_ITEM_TYPE_UDP && + item->type != RTE_FLOW_ITEM_TYPE_SCTP) { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Not supported by ntuple filter"); + return -EINVAL; + } + + /* get the TCP/UDP info */ + if (!item->spec || !item->mask) { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Invalid ntuple mask"); + return -EINVAL; + } + + /*Not supported last point for range*/ + if (item->last) { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + item, "Not supported last point for range"); + return -EINVAL; + + } + + if (item->type == RTE_FLOW_ITEM_TYPE_TCP) { + tcp_mask = item->mask; + + /** + * Only support src & dst ports, tcp flags, + * others should be masked. + */ + if (tcp_mask->hdr.sent_seq || + tcp_mask->hdr.recv_ack || + tcp_mask->hdr.data_off || + tcp_mask->hdr.rx_win || + tcp_mask->hdr.cksum || + tcp_mask->hdr.tcp_urp) { + memset(filter, 0, + sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Not supported by ntuple filter"); + return -EINVAL; + } + + filter->dst_port_mask = tcp_mask->hdr.dst_port; + filter->src_port_mask = tcp_mask->hdr.src_port; + if (tcp_mask->hdr.tcp_flags == 0xFF) { + filter->flags |= RTE_NTUPLE_FLAGS_TCP_FLAG; + } else if (!tcp_mask->hdr.tcp_flags) { + filter->flags &= ~RTE_NTUPLE_FLAGS_TCP_FLAG; + } else { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Not supported by ntuple filter"); + return -EINVAL; + } + + tcp_spec = item->spec; + filter->dst_port = tcp_spec->hdr.dst_port; + filter->src_port = tcp_spec->hdr.src_port; + filter->tcp_flags = tcp_spec->hdr.tcp_flags; + } else if (item->type == RTE_FLOW_ITEM_TYPE_UDP) { + udp_mask = item->mask; + + /** + * Only support src & dst ports, + * others should be masked. + */ + if (udp_mask->hdr.dgram_len || + udp_mask->hdr.dgram_cksum) { + memset(filter, 0, + sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Not supported by ntuple filter"); + return -EINVAL; + } + + filter->dst_port_mask = udp_mask->hdr.dst_port; + filter->src_port_mask = udp_mask->hdr.src_port; + + udp_spec = item->spec; + filter->dst_port = udp_spec->hdr.dst_port; + filter->src_port = udp_spec->hdr.src_port; + } else { + sctp_mask = item->mask; + + /** + * Only support src & dst ports, + * others should be masked. 
+ */ + if (sctp_mask->hdr.tag || + sctp_mask->hdr.cksum) { + memset(filter, 0, + sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Not supported by ntuple filter"); + return -EINVAL; + } + + filter->dst_port_mask = sctp_mask->hdr.dst_port; + filter->src_port_mask = sctp_mask->hdr.src_port; + + sctp_spec = item->spec; + filter->dst_port = sctp_spec->hdr.dst_port; + filter->src_port = sctp_spec->hdr.src_port; + } + + /* check if the next not void item is END */ + index++; + NEXT_ITEM_OF_PATTERN(item, pattern, index); + if (item->type != RTE_FLOW_ITEM_TYPE_END) { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, "Not supported by ntuple filter"); + return -EINVAL; + } + + table_type = RTE_FLOW_CLASSIFY_TABLE_ACL_IP4_5TUPLE; + + /* parse attr */ + /* must be input direction */ + if (!attr->ingress) { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + attr, "Only support ingress."); + return -EINVAL; + } + + /* not supported */ + if (attr->egress) { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + attr, "Not support egress."); + return -EINVAL; + } + + if (attr->priority > 0xFFFF) { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + attr, "Error priority."); + return -EINVAL; + } + filter->priority = (uint16_t)attr->priority; + if (attr->priority > FLOW_RULE_MIN_PRIORITY) + filter->priority = FLOW_RULE_MAX_PRIORITY; + + /* parse action */ + index = 0; + + /** + * n-tuple only supports count and Mark, + * check if the first not void action is COUNT or MARK. 
+ */ + memset(&action, 0, sizeof(action)); + NEXT_ITEM_OF_ACTION(act, actions, index); + switch (act->type) { + case RTE_FLOW_ACTION_TYPE_COUNT: + action.action_mask |= 1LLU << RTE_FLOW_ACTION_TYPE_COUNT; + count = act->conf; + memcpy(&action.act.counter, count, sizeof(action.act.counter)); + break; + case RTE_FLOW_ACTION_TYPE_MARK: + action.action_mask |= 1LLU << RTE_FLOW_ACTION_TYPE_MARK; + mark_spec = act->conf; + memcpy(&action.act.mark, mark_spec, sizeof(action.act.mark)); + break; + default: + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, act, + "Invalid action."); + return -EINVAL; + } + + /* check if the next not void item is MARK or COUNT or END */ + index++; + NEXT_ITEM_OF_ACTION(act, actions, index); + switch (act->type) { + case RTE_FLOW_ACTION_TYPE_COUNT: + action.action_mask |= 1LLU << RTE_FLOW_ACTION_TYPE_COUNT; + count = act->conf; + memcpy(&action.act.counter, count, sizeof(action.act.counter)); + break; + case RTE_FLOW_ACTION_TYPE_MARK: + action.action_mask |= 1LLU << RTE_FLOW_ACTION_TYPE_MARK; + mark_spec = act->conf; + memcpy(&action.act.mark, mark_spec, sizeof(action.act.mark)); + break; + case RTE_FLOW_ACTION_TYPE_END: + return 0; + default: + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, act, + "Invalid action."); + return -EINVAL; + } + + /* check if the next not void item is END */ + index++; + NEXT_ITEM_OF_ACTION(act, actions, index); + if (act->type != RTE_FLOW_ACTION_TYPE_END) { + memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, act, + "Invalid action."); + return -EINVAL; + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_parse.h b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_parse.h new file mode 100644 index 00000000..365a07bd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_parse.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_FLOW_CLASSIFY_PARSE_H_ +#define _RTE_FLOW_CLASSIFY_PARSE_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern enum rte_flow_classify_table_type table_type; + +struct classify_action { + /* Flow action mask */ + uint64_t action_mask; + + struct action { + /** Integer value to return with packets */ + struct rte_flow_action_mark mark; + /** Flow rule counter */ + struct rte_flow_query_count counter; + } act; +}; + +typedef int (*parse_filter_t)(const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_eth_ntuple_filter *filter, + struct rte_flow_error *error); + +/* Skip all VOID items of the pattern */ +void +classify_pattern_skip_void_item(struct rte_flow_item *items, + const struct rte_flow_item *pattern); + +/* Find the first VOID or non-VOID item pointer */ +const struct rte_flow_item * +classify_find_first_item(const struct rte_flow_item *item, bool is_void); + + +/* Find if there's parse filter function matched */ +parse_filter_t +classify_find_parse_filter_func(struct rte_flow_item *pattern); + +/* get action data */ +struct classify_action * +classify_get_flow_action(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_FLOW_CLASSIFY_PARSE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_version.map 
b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_version.map new file mode 100644 index 00000000..49bc25c6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_flow_classify/rte_flow_classify_version.map @@ -0,0 +1,13 @@ +EXPERIMENTAL { + global: + + rte_flow_classifier_create; + rte_flow_classifier_free; + rte_flow_classifier_query; + rte_flow_classify_table_create; + rte_flow_classify_table_entry_add; + rte_flow_classify_table_entry_delete; + rte_flow_classify_validate; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_gro/Makefile b/src/spdk/dpdk/lib/librte_gro/Makefile new file mode 100644 index 00000000..bec248f9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_gro.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal -lrte_mbuf -lrte_ethdev -lrte_net + +EXPORT_MAP := rte_gro_version.map + +LIBABIVER := 1 + +# source files +SRCS-$(CONFIG_RTE_LIBRTE_GRO) += rte_gro.c +SRCS-$(CONFIG_RTE_LIBRTE_GRO) += gro_tcp4.c +SRCS-$(CONFIG_RTE_LIBRTE_GRO) += gro_vxlan_tcp4.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_GRO)-include += rte_gro.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_gro/gro_tcp4.c b/src/spdk/dpdk/lib/librte_gro/gro_tcp4.c new file mode 100644 index 00000000..2c0f35c6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/gro_tcp4.c @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include +#include + +#include "gro_tcp4.h" + +void * +gro_tcp4_tbl_create(uint16_t socket_id, + uint16_t max_flow_num, + uint16_t max_item_per_flow) +{ + struct gro_tcp4_tbl *tbl; + size_t size; + uint32_t entries_num, i; + + entries_num = max_flow_num * max_item_per_flow; + entries_num = RTE_MIN(entries_num, GRO_TCP4_TBL_MAX_ITEM_NUM); + + if (entries_num == 0) + return NULL; + + tbl = rte_zmalloc_socket(__func__, + sizeof(struct gro_tcp4_tbl), + RTE_CACHE_LINE_SIZE, + socket_id); + if (tbl == NULL) + return NULL; + + size = sizeof(struct gro_tcp4_item) * entries_num; + tbl->items = rte_zmalloc_socket(__func__, + size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (tbl->items == NULL) { + rte_free(tbl); + return NULL; + } + tbl->max_item_num = entries_num; + + size = sizeof(struct gro_tcp4_flow) * entries_num; + tbl->flows = rte_zmalloc_socket(__func__, + size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (tbl->flows == NULL) { + rte_free(tbl->items); + rte_free(tbl); + return NULL; + } + /* INVALID_ARRAY_INDEX indicates an empty flow */ + for (i = 0; i < entries_num; i++) + tbl->flows[i].start_index = INVALID_ARRAY_INDEX; + tbl->max_flow_num = entries_num; + + return tbl; +} + +void +gro_tcp4_tbl_destroy(void *tbl) +{ + struct gro_tcp4_tbl *tcp_tbl = tbl; + + if (tcp_tbl) { + rte_free(tcp_tbl->items); + rte_free(tcp_tbl->flows); + } + rte_free(tcp_tbl); +} + +static inline uint32_t +find_an_empty_item(struct gro_tcp4_tbl *tbl) +{ + uint32_t i; + uint32_t max_item_num = tbl->max_item_num; + + for (i = 0; i < max_item_num; i++) + if (tbl->items[i].firstseg == NULL) + return i; + return INVALID_ARRAY_INDEX; +} + +static inline uint32_t +find_an_empty_flow(struct gro_tcp4_tbl *tbl) +{ + uint32_t i; + uint32_t max_flow_num = tbl->max_flow_num; + + for (i = 0; i < max_flow_num; i++) + if (tbl->flows[i].start_index == INVALID_ARRAY_INDEX) + return i; + return INVALID_ARRAY_INDEX; +} + +static 
inline uint32_t +insert_new_item(struct gro_tcp4_tbl *tbl, + struct rte_mbuf *pkt, + uint64_t start_time, + uint32_t prev_idx, + uint32_t sent_seq, + uint16_t ip_id, + uint8_t is_atomic) +{ + uint32_t item_idx; + + item_idx = find_an_empty_item(tbl); + if (item_idx == INVALID_ARRAY_INDEX) + return INVALID_ARRAY_INDEX; + + tbl->items[item_idx].firstseg = pkt; + tbl->items[item_idx].lastseg = rte_pktmbuf_lastseg(pkt); + tbl->items[item_idx].start_time = start_time; + tbl->items[item_idx].next_pkt_idx = INVALID_ARRAY_INDEX; + tbl->items[item_idx].sent_seq = sent_seq; + tbl->items[item_idx].ip_id = ip_id; + tbl->items[item_idx].nb_merged = 1; + tbl->items[item_idx].is_atomic = is_atomic; + tbl->item_num++; + + /* if the previous packet exists, chain them together. */ + if (prev_idx != INVALID_ARRAY_INDEX) { + tbl->items[item_idx].next_pkt_idx = + tbl->items[prev_idx].next_pkt_idx; + tbl->items[prev_idx].next_pkt_idx = item_idx; + } + + return item_idx; +} + +static inline uint32_t +delete_item(struct gro_tcp4_tbl *tbl, uint32_t item_idx, + uint32_t prev_item_idx) +{ + uint32_t next_idx = tbl->items[item_idx].next_pkt_idx; + + /* NULL indicates an empty item */ + tbl->items[item_idx].firstseg = NULL; + tbl->item_num--; + if (prev_item_idx != INVALID_ARRAY_INDEX) + tbl->items[prev_item_idx].next_pkt_idx = next_idx; + + return next_idx; +} + +static inline uint32_t +insert_new_flow(struct gro_tcp4_tbl *tbl, + struct tcp4_flow_key *src, + uint32_t item_idx) +{ + struct tcp4_flow_key *dst; + uint32_t flow_idx; + + flow_idx = find_an_empty_flow(tbl); + if (unlikely(flow_idx == INVALID_ARRAY_INDEX)) + return INVALID_ARRAY_INDEX; + + dst = &(tbl->flows[flow_idx].key); + + ether_addr_copy(&(src->eth_saddr), &(dst->eth_saddr)); + ether_addr_copy(&(src->eth_daddr), &(dst->eth_daddr)); + dst->ip_src_addr = src->ip_src_addr; + dst->ip_dst_addr = src->ip_dst_addr; + dst->recv_ack = src->recv_ack; + dst->src_port = src->src_port; + dst->dst_port = src->dst_port; + + tbl->flows[flow_idx].start_index = item_idx; + tbl->flow_num++; + + return flow_idx; +} + +/* + * update the packet length for the flushed packet. + */ +static inline void +update_header(struct gro_tcp4_item *item) +{ + struct ipv4_hdr *ipv4_hdr; + struct rte_mbuf *pkt = item->firstseg; + + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + pkt->l2_len); + ipv4_hdr->total_length = rte_cpu_to_be_16(pkt->pkt_len - + pkt->l2_len); +} + +int32_t +gro_tcp4_reassemble(struct rte_mbuf *pkt, + struct gro_tcp4_tbl *tbl, + uint64_t start_time) +{ + struct ether_hdr *eth_hdr; + struct ipv4_hdr *ipv4_hdr; + struct tcp_hdr *tcp_hdr; + uint32_t sent_seq; + uint16_t tcp_dl, ip_id, hdr_len, frag_off; + uint8_t is_atomic; + + struct tcp4_flow_key key; + uint32_t cur_idx, prev_idx, item_idx; + uint32_t i, max_flow_num, remaining_flow_num; + int cmp; + uint8_t find; + + eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *); + ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len); + tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len); + hdr_len = pkt->l2_len + pkt->l3_len + pkt->l4_len; + + /* + * Don't process the packet which has FIN, SYN, RST, PSH, URG, ECE + * or CWR set. + */ + if (tcp_hdr->tcp_flags != TCP_ACK_FLAG) + return -1; + /* + * Don't process the packet whose payload length is less than or + * equal to 0. + */ + tcp_dl = pkt->pkt_len - hdr_len; + if (tcp_dl <= 0) + return -1; + + /* + * Save IPv4 ID for the packet whose DF bit is 0. For the packet + * whose DF bit is 1, IPv4 ID is ignored. 
+ */ + frag_off = rte_be_to_cpu_16(ipv4_hdr->fragment_offset); + is_atomic = (frag_off & IPV4_HDR_DF_FLAG) == IPV4_HDR_DF_FLAG; + ip_id = is_atomic ? 0 : rte_be_to_cpu_16(ipv4_hdr->packet_id); + sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq); + + ether_addr_copy(&(eth_hdr->s_addr), &(key.eth_saddr)); + ether_addr_copy(&(eth_hdr->d_addr), &(key.eth_daddr)); + key.ip_src_addr = ipv4_hdr->src_addr; + key.ip_dst_addr = ipv4_hdr->dst_addr; + key.src_port = tcp_hdr->src_port; + key.dst_port = tcp_hdr->dst_port; + key.recv_ack = tcp_hdr->recv_ack; + + /* Search for a matched flow. */ + max_flow_num = tbl->max_flow_num; + remaining_flow_num = tbl->flow_num; + find = 0; + for (i = 0; i < max_flow_num && remaining_flow_num; i++) { + if (tbl->flows[i].start_index != INVALID_ARRAY_INDEX) { + if (is_same_tcp4_flow(tbl->flows[i].key, key)) { + find = 1; + break; + } + remaining_flow_num--; + } + } + + /* + * Fail to find a matched flow. Insert a new flow and store the + * packet into the flow. + */ + if (find == 0) { + item_idx = insert_new_item(tbl, pkt, start_time, + INVALID_ARRAY_INDEX, sent_seq, ip_id, + is_atomic); + if (item_idx == INVALID_ARRAY_INDEX) + return -1; + if (insert_new_flow(tbl, &key, item_idx) == + INVALID_ARRAY_INDEX) { + /* + * Fail to insert a new flow, so delete the + * stored packet. + */ + delete_item(tbl, item_idx, INVALID_ARRAY_INDEX); + return -1; + } + return 0; + } + + /* + * Check all packets in the flow and try to find a neighbor for + * the input packet. + */ + cur_idx = tbl->flows[i].start_index; + prev_idx = cur_idx; + do { + cmp = check_seq_option(&(tbl->items[cur_idx]), tcp_hdr, + sent_seq, ip_id, pkt->l4_len, tcp_dl, 0, + is_atomic); + if (cmp) { + if (merge_two_tcp4_packets(&(tbl->items[cur_idx]), + pkt, cmp, sent_seq, ip_id, 0)) + return 1; + /* + * Fail to merge the two packets, as the packet + * length is greater than the max value. Store + * the packet into the flow. + */ + if (insert_new_item(tbl, pkt, start_time, prev_idx, + sent_seq, ip_id, is_atomic) == + INVALID_ARRAY_INDEX) + return -1; + return 0; + } + prev_idx = cur_idx; + cur_idx = tbl->items[cur_idx].next_pkt_idx; + } while (cur_idx != INVALID_ARRAY_INDEX); + + /* Fail to find a neighbor, so store the packet into the flow. */ + if (insert_new_item(tbl, pkt, start_time, prev_idx, sent_seq, + ip_id, is_atomic) == INVALID_ARRAY_INDEX) + return -1; + + return 0; +} + +uint16_t +gro_tcp4_tbl_timeout_flush(struct gro_tcp4_tbl *tbl, + uint64_t flush_timestamp, + struct rte_mbuf **out, + uint16_t nb_out) +{ + uint16_t k = 0; + uint32_t i, j; + uint32_t max_flow_num = tbl->max_flow_num; + + for (i = 0; i < max_flow_num; i++) { + if (unlikely(tbl->flow_num == 0)) + return k; + + j = tbl->flows[i].start_index; + while (j != INVALID_ARRAY_INDEX) { + if (tbl->items[j].start_time <= flush_timestamp) { + out[k++] = tbl->items[j].firstseg; + if (tbl->items[j].nb_merged > 1) + update_header(&(tbl->items[j])); + /* + * Delete the packet and get the next + * packet in the flow. + */ + j = delete_item(tbl, j, INVALID_ARRAY_INDEX); + tbl->flows[i].start_index = j; + if (j == INVALID_ARRAY_INDEX) + tbl->flow_num--; + + if (unlikely(k == nb_out)) + return k; + } else + /* + * The left packets in this flow won't be + * timeout. Go to check other flows. 
+ */ + break; + } + } + return k; +} + +uint32_t +gro_tcp4_tbl_pkt_count(void *tbl) +{ + struct gro_tcp4_tbl *gro_tbl = tbl; + + if (gro_tbl) + return gro_tbl->item_num; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_gro/gro_tcp4.h b/src/spdk/dpdk/lib/librte_gro/gro_tcp4.h new file mode 100644 index 00000000..6bb30cdb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/gro_tcp4.h @@ -0,0 +1,301 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _GRO_TCP4_H_ +#define _GRO_TCP4_H_ + +#include +#include + +#define INVALID_ARRAY_INDEX 0xffffffffUL +#define GRO_TCP4_TBL_MAX_ITEM_NUM (1024UL * 1024UL) + +/* + * The max length of a IPv4 packet, which includes the length of the L3 + * header, the L4 header and the data payload. + */ +#define MAX_IPV4_PKT_LENGTH UINT16_MAX + +/* Header fields representing a TCP/IPv4 flow */ +struct tcp4_flow_key { + struct ether_addr eth_saddr; + struct ether_addr eth_daddr; + uint32_t ip_src_addr; + uint32_t ip_dst_addr; + + uint32_t recv_ack; + uint16_t src_port; + uint16_t dst_port; +}; + +struct gro_tcp4_flow { + struct tcp4_flow_key key; + /* + * The index of the first packet in the flow. + * INVALID_ARRAY_INDEX indicates an empty flow. + */ + uint32_t start_index; +}; + +struct gro_tcp4_item { + /* + * The first MBUF segment of the packet. If the value + * is NULL, it means the item is empty. + */ + struct rte_mbuf *firstseg; + /* The last MBUF segment of the packet */ + struct rte_mbuf *lastseg; + /* + * The time when the first packet is inserted into the table. + * This value won't be updated, even if the packet is merged + * with other packets. + */ + uint64_t start_time; + /* + * next_pkt_idx is used to chain the packets that + * are in the same flow but can't be merged together + * (e.g. caused by packet reordering). + */ + uint32_t next_pkt_idx; + /* TCP sequence number of the packet */ + uint32_t sent_seq; + /* IPv4 ID of the packet */ + uint16_t ip_id; + /* the number of merged packets */ + uint16_t nb_merged; + /* Indicate if IPv4 ID can be ignored */ + uint8_t is_atomic; +}; + +/* + * TCP/IPv4 reassembly table structure. + */ +struct gro_tcp4_tbl { + /* item array */ + struct gro_tcp4_item *items; + /* flow array */ + struct gro_tcp4_flow *flows; + /* current item number */ + uint32_t item_num; + /* current flow num */ + uint32_t flow_num; + /* item array size */ + uint32_t max_item_num; + /* flow array size */ + uint32_t max_flow_num; +}; + +/** + * This function creates a TCP/IPv4 reassembly table. + * + * @param socket_id + * Socket index for allocating the TCP/IPv4 reassemble table + * @param max_flow_num + * The maximum number of flows in the TCP/IPv4 GRO table + * @param max_item_per_flow + * The maximum number of packets per flow + * + * @return + * - Return the table pointer on success. + * - Return NULL on failure. + */ +void *gro_tcp4_tbl_create(uint16_t socket_id, + uint16_t max_flow_num, + uint16_t max_item_per_flow); + +/** + * This function destroys a TCP/IPv4 reassembly table. + * + * @param tbl + * Pointer pointing to the TCP/IPv4 reassembly table. + */ +void gro_tcp4_tbl_destroy(void *tbl); + +/** + * This function merges a TCP/IPv4 packet. It doesn't process the packet, + * which has SYN, FIN, RST, PSH, CWR, ECE or URG set, or doesn't have + * payload. + * + * This function doesn't check if the packet has correct checksums and + * doesn't re-calculate checksums for the merged packet. 
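A rough caller-side sketch of how the create/reassemble/flush entry points in this header fit together is given below; the port, queue, burst size and flush period are assumptions, packets are expected to arrive with l2_len/l3_len/l4_len already set, and flushed packets would normally be transmitted rather than freed:

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include "gro_tcp4.h"

#define EX_BURST 32

/* Hypothetical polling loop driving the TCP/IPv4 table directly. */
static void
example_gro_tcp4_loop(uint16_t port_id, uint16_t queue_id)
{
	struct gro_tcp4_tbl *tbl;
	struct rte_mbuf *rx[EX_BURST], *flushed[EX_BURST];
	uint64_t flush_cycles = rte_get_tsc_hz() / 1000; /* ~1 ms */
	uint16_t nb_rx, nb_flushed, i;
	uint64_t now;

	tbl = gro_tcp4_tbl_create(rte_socket_id(), 4, EX_BURST);
	if (tbl == NULL)
		return;

	for (;;) {
		nb_rx = rte_eth_rx_burst(port_id, queue_id, rx, EX_BURST);
		now = rte_rdtsc();
		for (i = 0; i < nb_rx; i++) {
			/* <0: not GRO-able; the caller keeps ownership. */
			if (gro_tcp4_reassemble(rx[i], tbl, now) < 0)
				rte_pktmbuf_free(rx[i]);
		}
		/* Flush packets held in the table longer than ~1 ms. */
		nb_flushed = gro_tcp4_tbl_timeout_flush(tbl,
				now - flush_cycles, flushed, EX_BURST);
		for (i = 0; i < nb_flushed; i++)
			rte_pktmbuf_free(flushed[i]);
	}
}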
Additionally, + * it assumes the packets are complete (i.e., MF==0 && frag_off==0), + * when IP fragmentation is possible (i.e., DF==0). It returns the + * packet, if the packet has invalid parameters (e.g. SYN bit is set) + * or there is no available space in the table. + * + * @param pkt + * Packet to reassemble + * @param tbl + * Pointer pointing to the TCP/IPv4 reassembly table + * @start_time + * The time when the packet is inserted into the table + * + * @return + * - Return a positive value if the packet is merged. + * - Return zero if the packet isn't merged but stored in the table. + * - Return a negative value for invalid parameters or no available + * space in the table. + */ +int32_t gro_tcp4_reassemble(struct rte_mbuf *pkt, + struct gro_tcp4_tbl *tbl, + uint64_t start_time); + +/** + * This function flushes timeout packets in a TCP/IPv4 reassembly table, + * and without updating checksums. + * + * @param tbl + * TCP/IPv4 reassembly table pointer + * @param flush_timestamp + * Flush packets which are inserted into the table before or at the + * flush_timestamp. + * @param out + * Pointer array used to keep flushed packets + * @param nb_out + * The element number in 'out'. It also determines the maximum number of + * packets that can be flushed finally. + * + * @return + * The number of flushed packets + */ +uint16_t gro_tcp4_tbl_timeout_flush(struct gro_tcp4_tbl *tbl, + uint64_t flush_timestamp, + struct rte_mbuf **out, + uint16_t nb_out); + +/** + * This function returns the number of the packets in a TCP/IPv4 + * reassembly table. + * + * @param tbl + * TCP/IPv4 reassembly table pointer + * + * @return + * The number of packets in the table + */ +uint32_t gro_tcp4_tbl_pkt_count(void *tbl); + +/* + * Check if two TCP/IPv4 packets belong to the same flow. + */ +static inline int +is_same_tcp4_flow(struct tcp4_flow_key k1, struct tcp4_flow_key k2) +{ + return (is_same_ether_addr(&k1.eth_saddr, &k2.eth_saddr) && + is_same_ether_addr(&k1.eth_daddr, &k2.eth_daddr) && + (k1.ip_src_addr == k2.ip_src_addr) && + (k1.ip_dst_addr == k2.ip_dst_addr) && + (k1.recv_ack == k2.recv_ack) && + (k1.src_port == k2.src_port) && + (k1.dst_port == k2.dst_port)); +} + +/* + * Merge two TCP/IPv4 packets without updating checksums. + * If cmp is larger than 0, append the new packet to the + * original packet. Otherwise, pre-pend the new packet to + * the original packet. + */ +static inline int +merge_two_tcp4_packets(struct gro_tcp4_item *item, + struct rte_mbuf *pkt, + int cmp, + uint32_t sent_seq, + uint16_t ip_id, + uint16_t l2_offset) +{ + struct rte_mbuf *pkt_head, *pkt_tail, *lastseg; + uint16_t hdr_len, l2_len; + + if (cmp > 0) { + pkt_head = item->firstseg; + pkt_tail = pkt; + } else { + pkt_head = pkt; + pkt_tail = item->firstseg; + } + + /* check if the IPv4 packet length is greater than the max value */ + hdr_len = l2_offset + pkt_head->l2_len + pkt_head->l3_len + + pkt_head->l4_len; + l2_len = l2_offset > 0 ? 
pkt_head->outer_l2_len : pkt_head->l2_len; + if (unlikely(pkt_head->pkt_len - l2_len + pkt_tail->pkt_len - + hdr_len > MAX_IPV4_PKT_LENGTH)) + return 0; + + /* remove the packet header for the tail packet */ + rte_pktmbuf_adj(pkt_tail, hdr_len); + + /* chain two packets together */ + if (cmp > 0) { + item->lastseg->next = pkt; + item->lastseg = rte_pktmbuf_lastseg(pkt); + /* update IP ID to the larger value */ + item->ip_id = ip_id; + } else { + lastseg = rte_pktmbuf_lastseg(pkt); + lastseg->next = item->firstseg; + item->firstseg = pkt; + /* update sent_seq to the smaller value */ + item->sent_seq = sent_seq; + item->ip_id = ip_id; + } + item->nb_merged++; + + /* update MBUF metadata for the merged packet */ + pkt_head->nb_segs += pkt_tail->nb_segs; + pkt_head->pkt_len += pkt_tail->pkt_len; + + return 1; +} + +/* + * Check if two TCP/IPv4 packets are neighbors. + */ +static inline int +check_seq_option(struct gro_tcp4_item *item, + struct tcp_hdr *tcph, + uint32_t sent_seq, + uint16_t ip_id, + uint16_t tcp_hl, + uint16_t tcp_dl, + uint16_t l2_offset, + uint8_t is_atomic) +{ + struct rte_mbuf *pkt_orig = item->firstseg; + struct ipv4_hdr *iph_orig; + struct tcp_hdr *tcph_orig; + uint16_t len, tcp_hl_orig; + + iph_orig = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt_orig, char *) + + l2_offset + pkt_orig->l2_len); + tcph_orig = (struct tcp_hdr *)((char *)iph_orig + pkt_orig->l3_len); + tcp_hl_orig = pkt_orig->l4_len; + + /* Check if TCP option fields equal */ + len = RTE_MAX(tcp_hl, tcp_hl_orig) - sizeof(struct tcp_hdr); + if ((tcp_hl != tcp_hl_orig) || ((len > 0) && + (memcmp(tcph + 1, tcph_orig + 1, + len) != 0))) + return 0; + + /* Don't merge packets whose DF bits are different */ + if (unlikely(item->is_atomic ^ is_atomic)) + return 0; + + /* check if the two packets are neighbors */ + len = pkt_orig->pkt_len - l2_offset - pkt_orig->l2_len - + pkt_orig->l3_len - tcp_hl_orig; + if ((sent_seq == item->sent_seq + len) && (is_atomic || + (ip_id == item->ip_id + 1))) + /* append the new packet */ + return 1; + else if ((sent_seq + tcp_dl == item->sent_seq) && (is_atomic || + (ip_id + item->nb_merged == item->ip_id))) + /* pre-pend the new packet */ + return -1; + + return 0; +} +#endif diff --git a/src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.c b/src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.c new file mode 100644 index 00000000..ca86f010 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.c @@ -0,0 +1,494 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include "gro_vxlan_tcp4.h" + +void * +gro_vxlan_tcp4_tbl_create(uint16_t socket_id, + uint16_t max_flow_num, + uint16_t max_item_per_flow) +{ + struct gro_vxlan_tcp4_tbl *tbl; + size_t size; + uint32_t entries_num, i; + + entries_num = max_flow_num * max_item_per_flow; + entries_num = RTE_MIN(entries_num, GRO_VXLAN_TCP4_TBL_MAX_ITEM_NUM); + + if (entries_num == 0) + return NULL; + + tbl = rte_zmalloc_socket(__func__, + sizeof(struct gro_vxlan_tcp4_tbl), + RTE_CACHE_LINE_SIZE, + socket_id); + if (tbl == NULL) + return NULL; + + size = sizeof(struct gro_vxlan_tcp4_item) * entries_num; + tbl->items = rte_zmalloc_socket(__func__, + size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (tbl->items == NULL) { + rte_free(tbl); + return NULL; + } + tbl->max_item_num = entries_num; + + size = sizeof(struct gro_vxlan_tcp4_flow) * entries_num; + tbl->flows = rte_zmalloc_socket(__func__, + size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (tbl->flows == NULL) { 
+ rte_free(tbl->items); + rte_free(tbl); + return NULL; + } + + for (i = 0; i < entries_num; i++) + tbl->flows[i].start_index = INVALID_ARRAY_INDEX; + tbl->max_flow_num = entries_num; + + return tbl; +} + +void +gro_vxlan_tcp4_tbl_destroy(void *tbl) +{ + struct gro_vxlan_tcp4_tbl *vxlan_tbl = tbl; + + if (vxlan_tbl) { + rte_free(vxlan_tbl->items); + rte_free(vxlan_tbl->flows); + } + rte_free(vxlan_tbl); +} + +static inline uint32_t +find_an_empty_item(struct gro_vxlan_tcp4_tbl *tbl) +{ + uint32_t max_item_num = tbl->max_item_num, i; + + for (i = 0; i < max_item_num; i++) + if (tbl->items[i].inner_item.firstseg == NULL) + return i; + return INVALID_ARRAY_INDEX; +} + +static inline uint32_t +find_an_empty_flow(struct gro_vxlan_tcp4_tbl *tbl) +{ + uint32_t max_flow_num = tbl->max_flow_num, i; + + for (i = 0; i < max_flow_num; i++) + if (tbl->flows[i].start_index == INVALID_ARRAY_INDEX) + return i; + return INVALID_ARRAY_INDEX; +} + +static inline uint32_t +insert_new_item(struct gro_vxlan_tcp4_tbl *tbl, + struct rte_mbuf *pkt, + uint64_t start_time, + uint32_t prev_idx, + uint32_t sent_seq, + uint16_t outer_ip_id, + uint16_t ip_id, + uint8_t outer_is_atomic, + uint8_t is_atomic) +{ + uint32_t item_idx; + + item_idx = find_an_empty_item(tbl); + if (unlikely(item_idx == INVALID_ARRAY_INDEX)) + return INVALID_ARRAY_INDEX; + + tbl->items[item_idx].inner_item.firstseg = pkt; + tbl->items[item_idx].inner_item.lastseg = rte_pktmbuf_lastseg(pkt); + tbl->items[item_idx].inner_item.start_time = start_time; + tbl->items[item_idx].inner_item.next_pkt_idx = INVALID_ARRAY_INDEX; + tbl->items[item_idx].inner_item.sent_seq = sent_seq; + tbl->items[item_idx].inner_item.ip_id = ip_id; + tbl->items[item_idx].inner_item.nb_merged = 1; + tbl->items[item_idx].inner_item.is_atomic = is_atomic; + tbl->items[item_idx].outer_ip_id = outer_ip_id; + tbl->items[item_idx].outer_is_atomic = outer_is_atomic; + tbl->item_num++; + + /* If the previous packet exists, chain the new one with it. */ + if (prev_idx != INVALID_ARRAY_INDEX) { + tbl->items[item_idx].inner_item.next_pkt_idx = + tbl->items[prev_idx].inner_item.next_pkt_idx; + tbl->items[prev_idx].inner_item.next_pkt_idx = item_idx; + } + + return item_idx; +} + +static inline uint32_t +delete_item(struct gro_vxlan_tcp4_tbl *tbl, + uint32_t item_idx, + uint32_t prev_item_idx) +{ + uint32_t next_idx = tbl->items[item_idx].inner_item.next_pkt_idx; + + /* NULL indicates an empty item. 
*/ + tbl->items[item_idx].inner_item.firstseg = NULL; + tbl->item_num--; + if (prev_item_idx != INVALID_ARRAY_INDEX) + tbl->items[prev_item_idx].inner_item.next_pkt_idx = next_idx; + + return next_idx; +} + +static inline uint32_t +insert_new_flow(struct gro_vxlan_tcp4_tbl *tbl, + struct vxlan_tcp4_flow_key *src, + uint32_t item_idx) +{ + struct vxlan_tcp4_flow_key *dst; + uint32_t flow_idx; + + flow_idx = find_an_empty_flow(tbl); + if (unlikely(flow_idx == INVALID_ARRAY_INDEX)) + return INVALID_ARRAY_INDEX; + + dst = &(tbl->flows[flow_idx].key); + + ether_addr_copy(&(src->inner_key.eth_saddr), + &(dst->inner_key.eth_saddr)); + ether_addr_copy(&(src->inner_key.eth_daddr), + &(dst->inner_key.eth_daddr)); + dst->inner_key.ip_src_addr = src->inner_key.ip_src_addr; + dst->inner_key.ip_dst_addr = src->inner_key.ip_dst_addr; + dst->inner_key.recv_ack = src->inner_key.recv_ack; + dst->inner_key.src_port = src->inner_key.src_port; + dst->inner_key.dst_port = src->inner_key.dst_port; + + dst->vxlan_hdr.vx_flags = src->vxlan_hdr.vx_flags; + dst->vxlan_hdr.vx_vni = src->vxlan_hdr.vx_vni; + ether_addr_copy(&(src->outer_eth_saddr), &(dst->outer_eth_saddr)); + ether_addr_copy(&(src->outer_eth_daddr), &(dst->outer_eth_daddr)); + dst->outer_ip_src_addr = src->outer_ip_src_addr; + dst->outer_ip_dst_addr = src->outer_ip_dst_addr; + dst->outer_src_port = src->outer_src_port; + dst->outer_dst_port = src->outer_dst_port; + + tbl->flows[flow_idx].start_index = item_idx; + tbl->flow_num++; + + return flow_idx; +} + +static inline int +is_same_vxlan_tcp4_flow(struct vxlan_tcp4_flow_key k1, + struct vxlan_tcp4_flow_key k2) +{ + return (is_same_ether_addr(&k1.outer_eth_saddr, &k2.outer_eth_saddr) && + is_same_ether_addr(&k1.outer_eth_daddr, + &k2.outer_eth_daddr) && + (k1.outer_ip_src_addr == k2.outer_ip_src_addr) && + (k1.outer_ip_dst_addr == k2.outer_ip_dst_addr) && + (k1.outer_src_port == k2.outer_src_port) && + (k1.outer_dst_port == k2.outer_dst_port) && + (k1.vxlan_hdr.vx_flags == k2.vxlan_hdr.vx_flags) && + (k1.vxlan_hdr.vx_vni == k2.vxlan_hdr.vx_vni) && + is_same_tcp4_flow(k1.inner_key, k2.inner_key)); +} + +static inline int +check_vxlan_seq_option(struct gro_vxlan_tcp4_item *item, + struct tcp_hdr *tcp_hdr, + uint32_t sent_seq, + uint16_t outer_ip_id, + uint16_t ip_id, + uint16_t tcp_hl, + uint16_t tcp_dl, + uint8_t outer_is_atomic, + uint8_t is_atomic) +{ + struct rte_mbuf *pkt = item->inner_item.firstseg; + int cmp; + uint16_t l2_offset; + + /* Don't merge packets whose outer DF bits are different. */ + if (unlikely(item->outer_is_atomic ^ outer_is_atomic)) + return 0; + + l2_offset = pkt->outer_l2_len + pkt->outer_l3_len; + cmp = check_seq_option(&item->inner_item, tcp_hdr, sent_seq, ip_id, + tcp_hl, tcp_dl, l2_offset, is_atomic); + if ((cmp > 0) && (outer_is_atomic || + (outer_ip_id == item->outer_ip_id + 1))) + /* Append the new packet. */ + return 1; + else if ((cmp < 0) && (outer_is_atomic || + (outer_ip_id + item->inner_item.nb_merged == + item->outer_ip_id))) + /* Prepend the new packet. */ + return -1; + + return 0; +} + +static inline int +merge_two_vxlan_tcp4_packets(struct gro_vxlan_tcp4_item *item, + struct rte_mbuf *pkt, + int cmp, + uint32_t sent_seq, + uint16_t outer_ip_id, + uint16_t ip_id) +{ + if (merge_two_tcp4_packets(&item->inner_item, pkt, cmp, sent_seq, + ip_id, pkt->outer_l2_len + + pkt->outer_l3_len)) { + /* Update the outer IPv4 ID to the large value. */ + item->outer_ip_id = cmp > 0 ? 
outer_ip_id : item->outer_ip_id; + return 1; + } + + return 0; +} + +static inline void +update_vxlan_header(struct gro_vxlan_tcp4_item *item) +{ + struct ipv4_hdr *ipv4_hdr; + struct udp_hdr *udp_hdr; + struct rte_mbuf *pkt = item->inner_item.firstseg; + uint16_t len; + + /* Update the outer IPv4 header. */ + len = pkt->pkt_len - pkt->outer_l2_len; + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + pkt->outer_l2_len); + ipv4_hdr->total_length = rte_cpu_to_be_16(len); + + /* Update the outer UDP header. */ + len -= pkt->outer_l3_len; + udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr + pkt->outer_l3_len); + udp_hdr->dgram_len = rte_cpu_to_be_16(len); + + /* Update the inner IPv4 header. */ + len -= pkt->l2_len; + ipv4_hdr = (struct ipv4_hdr *)((char *)udp_hdr + pkt->l2_len); + ipv4_hdr->total_length = rte_cpu_to_be_16(len); +} + +int32_t +gro_vxlan_tcp4_reassemble(struct rte_mbuf *pkt, + struct gro_vxlan_tcp4_tbl *tbl, + uint64_t start_time) +{ + struct ether_hdr *outer_eth_hdr, *eth_hdr; + struct ipv4_hdr *outer_ipv4_hdr, *ipv4_hdr; + struct tcp_hdr *tcp_hdr; + struct udp_hdr *udp_hdr; + struct vxlan_hdr *vxlan_hdr; + uint32_t sent_seq; + uint16_t tcp_dl, frag_off, outer_ip_id, ip_id; + uint8_t outer_is_atomic, is_atomic; + + struct vxlan_tcp4_flow_key key; + uint32_t cur_idx, prev_idx, item_idx; + uint32_t i, max_flow_num, remaining_flow_num; + int cmp; + uint16_t hdr_len; + uint8_t find; + + outer_eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *); + outer_ipv4_hdr = (struct ipv4_hdr *)((char *)outer_eth_hdr + + pkt->outer_l2_len); + udp_hdr = (struct udp_hdr *)((char *)outer_ipv4_hdr + + pkt->outer_l3_len); + vxlan_hdr = (struct vxlan_hdr *)((char *)udp_hdr + + sizeof(struct udp_hdr)); + eth_hdr = (struct ether_hdr *)((char *)vxlan_hdr + + sizeof(struct vxlan_hdr)); + ipv4_hdr = (struct ipv4_hdr *)((char *)udp_hdr + pkt->l2_len); + tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len); + + /* + * Don't process the packet which has FIN, SYN, RST, PSH, URG, + * ECE or CWR set. + */ + if (tcp_hdr->tcp_flags != TCP_ACK_FLAG) + return -1; + + hdr_len = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len + + pkt->l3_len + pkt->l4_len; + /* + * Don't process the packet whose payload length is less than or + * equal to 0. + */ + tcp_dl = pkt->pkt_len - hdr_len; + if (tcp_dl <= 0) + return -1; + + /* + * Save IPv4 ID for the packet whose DF bit is 0. For the packet + * whose DF bit is 1, IPv4 ID is ignored. + */ + frag_off = rte_be_to_cpu_16(outer_ipv4_hdr->fragment_offset); + outer_is_atomic = (frag_off & IPV4_HDR_DF_FLAG) == IPV4_HDR_DF_FLAG; + outer_ip_id = outer_is_atomic ? 0 : + rte_be_to_cpu_16(outer_ipv4_hdr->packet_id); + frag_off = rte_be_to_cpu_16(ipv4_hdr->fragment_offset); + is_atomic = (frag_off & IPV4_HDR_DF_FLAG) == IPV4_HDR_DF_FLAG; + ip_id = is_atomic ? 
0 : rte_be_to_cpu_16(ipv4_hdr->packet_id); + + sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq); + + ether_addr_copy(&(eth_hdr->s_addr), &(key.inner_key.eth_saddr)); + ether_addr_copy(&(eth_hdr->d_addr), &(key.inner_key.eth_daddr)); + key.inner_key.ip_src_addr = ipv4_hdr->src_addr; + key.inner_key.ip_dst_addr = ipv4_hdr->dst_addr; + key.inner_key.recv_ack = tcp_hdr->recv_ack; + key.inner_key.src_port = tcp_hdr->src_port; + key.inner_key.dst_port = tcp_hdr->dst_port; + + key.vxlan_hdr.vx_flags = vxlan_hdr->vx_flags; + key.vxlan_hdr.vx_vni = vxlan_hdr->vx_vni; + ether_addr_copy(&(outer_eth_hdr->s_addr), &(key.outer_eth_saddr)); + ether_addr_copy(&(outer_eth_hdr->d_addr), &(key.outer_eth_daddr)); + key.outer_ip_src_addr = outer_ipv4_hdr->src_addr; + key.outer_ip_dst_addr = outer_ipv4_hdr->dst_addr; + key.outer_src_port = udp_hdr->src_port; + key.outer_dst_port = udp_hdr->dst_port; + + /* Search for a matched flow. */ + max_flow_num = tbl->max_flow_num; + remaining_flow_num = tbl->flow_num; + find = 0; + for (i = 0; i < max_flow_num && remaining_flow_num; i++) { + if (tbl->flows[i].start_index != INVALID_ARRAY_INDEX) { + if (is_same_vxlan_tcp4_flow(tbl->flows[i].key, key)) { + find = 1; + break; + } + remaining_flow_num--; + } + } + + /* + * Can't find a matched flow. Insert a new flow and store the + * packet into the flow. + */ + if (find == 0) { + item_idx = insert_new_item(tbl, pkt, start_time, + INVALID_ARRAY_INDEX, sent_seq, outer_ip_id, + ip_id, outer_is_atomic, is_atomic); + if (item_idx == INVALID_ARRAY_INDEX) + return -1; + if (insert_new_flow(tbl, &key, item_idx) == + INVALID_ARRAY_INDEX) { + /* + * Fail to insert a new flow, so + * delete the inserted packet. + */ + delete_item(tbl, item_idx, INVALID_ARRAY_INDEX); + return -1; + } + return 0; + } + + /* Check all packets in the flow and try to find a neighbor. */ + cur_idx = tbl->flows[i].start_index; + prev_idx = cur_idx; + do { + cmp = check_vxlan_seq_option(&(tbl->items[cur_idx]), tcp_hdr, + sent_seq, outer_ip_id, ip_id, pkt->l4_len, + tcp_dl, outer_is_atomic, is_atomic); + if (cmp) { + if (merge_two_vxlan_tcp4_packets(&(tbl->items[cur_idx]), + pkt, cmp, sent_seq, + outer_ip_id, ip_id)) + return 1; + /* + * Can't merge two packets, as the packet + * length will be greater than the max value. + * Insert the packet into the flow. + */ + if (insert_new_item(tbl, pkt, start_time, prev_idx, + sent_seq, outer_ip_id, + ip_id, outer_is_atomic, + is_atomic) == + INVALID_ARRAY_INDEX) + return -1; + return 0; + } + prev_idx = cur_idx; + cur_idx = tbl->items[cur_idx].inner_item.next_pkt_idx; + } while (cur_idx != INVALID_ARRAY_INDEX); + + /* Can't find neighbor. Insert the packet into the flow. */ + if (insert_new_item(tbl, pkt, start_time, prev_idx, sent_seq, + outer_ip_id, ip_id, outer_is_atomic, + is_atomic) == INVALID_ARRAY_INDEX) + return -1; + + return 0; +} + +uint16_t +gro_vxlan_tcp4_tbl_timeout_flush(struct gro_vxlan_tcp4_tbl *tbl, + uint64_t flush_timestamp, + struct rte_mbuf **out, + uint16_t nb_out) +{ + uint16_t k = 0; + uint32_t i, j; + uint32_t max_flow_num = tbl->max_flow_num; + + for (i = 0; i < max_flow_num; i++) { + if (unlikely(tbl->flow_num == 0)) + return k; + + j = tbl->flows[i].start_index; + while (j != INVALID_ARRAY_INDEX) { + if (tbl->items[j].inner_item.start_time <= + flush_timestamp) { + out[k++] = tbl->items[j].inner_item.firstseg; + if (tbl->items[j].inner_item.nb_merged > 1) + update_vxlan_header(&(tbl->items[j])); + /* + * Delete the item and get the next packet + * index. 
+ */ + j = delete_item(tbl, j, INVALID_ARRAY_INDEX); + tbl->flows[i].start_index = j; + if (j == INVALID_ARRAY_INDEX) + tbl->flow_num--; + + if (unlikely(k == nb_out)) + return k; + } else + /* + * The left packets in the flow won't be + * timeout. Go to check other flows. + */ + break; + } + } + return k; +} + +uint32_t +gro_vxlan_tcp4_tbl_pkt_count(void *tbl) +{ + struct gro_vxlan_tcp4_tbl *gro_tbl = tbl; + + if (gro_tbl) + return gro_tbl->item_num; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.h b/src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.h new file mode 100644 index 00000000..0cafb921 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/gro_vxlan_tcp4.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _GRO_VXLAN_TCP4_H_ +#define _GRO_VXLAN_TCP4_H_ + +#include "gro_tcp4.h" + +#define GRO_VXLAN_TCP4_TBL_MAX_ITEM_NUM (1024UL * 1024UL) + +/* Header fields representing a VxLAN flow */ +struct vxlan_tcp4_flow_key { + struct tcp4_flow_key inner_key; + struct vxlan_hdr vxlan_hdr; + + struct ether_addr outer_eth_saddr; + struct ether_addr outer_eth_daddr; + + uint32_t outer_ip_src_addr; + uint32_t outer_ip_dst_addr; + + /* Outer UDP ports */ + uint16_t outer_src_port; + uint16_t outer_dst_port; + +}; + +struct gro_vxlan_tcp4_flow { + struct vxlan_tcp4_flow_key key; + /* + * The index of the first packet in the flow. INVALID_ARRAY_INDEX + * indicates an empty flow. + */ + uint32_t start_index; +}; + +struct gro_vxlan_tcp4_item { + struct gro_tcp4_item inner_item; + /* IPv4 ID in the outer IPv4 header */ + uint16_t outer_ip_id; + /* Indicate if outer IPv4 ID can be ignored */ + uint8_t outer_is_atomic; +}; + +/* + * VxLAN (with an outer IPv4 header and an inner TCP/IPv4 packet) + * reassembly table structure + */ +struct gro_vxlan_tcp4_tbl { + /* item array */ + struct gro_vxlan_tcp4_item *items; + /* flow array */ + struct gro_vxlan_tcp4_flow *flows; + /* current item number */ + uint32_t item_num; + /* current flow number */ + uint32_t flow_num; + /* the maximum item number */ + uint32_t max_item_num; + /* the maximum flow number */ + uint32_t max_flow_num; +}; + +/** + * This function creates a VxLAN reassembly table for VxLAN packets + * which have an outer IPv4 header and an inner TCP/IPv4 packet. + * + * @param socket_id + * Socket index for allocating the table + * @param max_flow_num + * The maximum number of flows in the table + * @param max_item_per_flow + * The maximum number of packets per flow + * + * @return + * - Return the table pointer on success. + * - Return NULL on failure. + */ +void *gro_vxlan_tcp4_tbl_create(uint16_t socket_id, + uint16_t max_flow_num, + uint16_t max_item_per_flow); + +/** + * This function destroys a VxLAN reassembly table. + * + * @param tbl + * Pointer pointing to the VxLAN reassembly table + */ +void gro_vxlan_tcp4_tbl_destroy(void *tbl); + +/** + * This function merges a VxLAN packet which has an outer IPv4 header and + * an inner TCP/IPv4 packet. It doesn't process the packet, whose TCP + * header has SYN, FIN, RST, PSH, CWR, ECE or URG bit set, or which + * doesn't have payload. + * + * This function doesn't check if the packet has correct checksums and + * doesn't re-calculate checksums for the merged packet. Additionally, + * it assumes the packets are complete (i.e., MF==0 && frag_off==0), when + * IP fragmentation is possible (i.e., DF==0). It returns the packet, if + * the packet has invalid parameters (e.g. 
SYN bit is set) or there is no + * available space in the table. + * + * @param pkt + * Packet to reassemble + * @param tbl + * Pointer pointing to the VxLAN reassembly table + * @start_time + * The time when the packet is inserted into the table + * + * @return + * - Return a positive value if the packet is merged. + * - Return zero if the packet isn't merged but stored in the table. + * - Return a negative value for invalid parameters or no available + * space in the table. + */ +int32_t gro_vxlan_tcp4_reassemble(struct rte_mbuf *pkt, + struct gro_vxlan_tcp4_tbl *tbl, + uint64_t start_time); + +/** + * This function flushes timeout packets in the VxLAN reassembly table, + * and without updating checksums. + * + * @param tbl + * Pointer pointing to a VxLAN GRO table + * @param flush_timestamp + * This function flushes packets which are inserted into the table + * before or at the flush_timestamp. + * @param out + * Pointer array used to keep flushed packets + * @param nb_out + * The element number in 'out'. It also determines the maximum number of + * packets that can be flushed finally. + * + * @return + * The number of flushed packets + */ +uint16_t gro_vxlan_tcp4_tbl_timeout_flush(struct gro_vxlan_tcp4_tbl *tbl, + uint64_t flush_timestamp, + struct rte_mbuf **out, + uint16_t nb_out); + +/** + * This function returns the number of the packets in a VxLAN + * reassembly table. + * + * @param tbl + * Pointer pointing to the VxLAN reassembly table + * + * @return + * The number of packets in the table + */ +uint32_t gro_vxlan_tcp4_tbl_pkt_count(void *tbl); +#endif diff --git a/src/spdk/dpdk/lib/librte_gro/meson.build b/src/spdk/dpdk/lib/librte_gro/meson.build new file mode 100644 index 00000000..501668c8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_gro.c', 'gro_tcp4.c', 'gro_vxlan_tcp4.c') +headers = files('rte_gro.h') +deps += ['ethdev'] diff --git a/src/spdk/dpdk/lib/librte_gro/rte_gro.c b/src/spdk/dpdk/lib/librte_gro/rte_gro.c new file mode 100644 index 00000000..6618f4d3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/rte_gro.c @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include +#include + +#include "rte_gro.h" +#include "gro_tcp4.h" +#include "gro_vxlan_tcp4.h" + +typedef void *(*gro_tbl_create_fn)(uint16_t socket_id, + uint16_t max_flow_num, + uint16_t max_item_per_flow); +typedef void (*gro_tbl_destroy_fn)(void *tbl); +typedef uint32_t (*gro_tbl_pkt_count_fn)(void *tbl); + +static gro_tbl_create_fn tbl_create_fn[RTE_GRO_TYPE_MAX_NUM] = { + gro_tcp4_tbl_create, gro_vxlan_tcp4_tbl_create, NULL}; +static gro_tbl_destroy_fn tbl_destroy_fn[RTE_GRO_TYPE_MAX_NUM] = { + gro_tcp4_tbl_destroy, gro_vxlan_tcp4_tbl_destroy, + NULL}; +static gro_tbl_pkt_count_fn tbl_pkt_count_fn[RTE_GRO_TYPE_MAX_NUM] = { + gro_tcp4_tbl_pkt_count, gro_vxlan_tcp4_tbl_pkt_count, + NULL}; + +#define IS_IPV4_TCP_PKT(ptype) (RTE_ETH_IS_IPV4_HDR(ptype) && \ + ((ptype & RTE_PTYPE_L4_TCP) == RTE_PTYPE_L4_TCP)) + +#define IS_IPV4_VXLAN_TCP4_PKT(ptype) (RTE_ETH_IS_IPV4_HDR(ptype) && \ + ((ptype & RTE_PTYPE_L4_UDP) == RTE_PTYPE_L4_UDP) && \ + ((ptype & RTE_PTYPE_TUNNEL_VXLAN) == \ + RTE_PTYPE_TUNNEL_VXLAN) && \ + ((ptype & RTE_PTYPE_INNER_L4_TCP) == \ + RTE_PTYPE_INNER_L4_TCP) && \ + (((ptype & RTE_PTYPE_INNER_L3_MASK) & \ + (RTE_PTYPE_INNER_L3_IPV4 | \ + RTE_PTYPE_INNER_L3_IPV4_EXT | \ + 
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN)) != 0)) + +/* + * GRO context structure. It keeps the table structures, which are + * used to merge packets, for different GRO types. Before using + * rte_gro_reassemble(), applications need to create the GRO context + * first. + */ +struct gro_ctx { + /* GRO types to perform */ + uint64_t gro_types; + /* reassembly tables */ + void *tbls[RTE_GRO_TYPE_MAX_NUM]; +}; + +void * +rte_gro_ctx_create(const struct rte_gro_param *param) +{ + struct gro_ctx *gro_ctx; + gro_tbl_create_fn create_tbl_fn; + uint64_t gro_type_flag = 0; + uint64_t gro_types = 0; + uint8_t i; + + gro_ctx = rte_zmalloc_socket(__func__, + sizeof(struct gro_ctx), + RTE_CACHE_LINE_SIZE, + param->socket_id); + if (gro_ctx == NULL) + return NULL; + + for (i = 0; i < RTE_GRO_TYPE_MAX_NUM; i++) { + gro_type_flag = 1ULL << i; + if ((param->gro_types & gro_type_flag) == 0) + continue; + + create_tbl_fn = tbl_create_fn[i]; + if (create_tbl_fn == NULL) + continue; + + gro_ctx->tbls[i] = create_tbl_fn(param->socket_id, + param->max_flow_num, + param->max_item_per_flow); + if (gro_ctx->tbls[i] == NULL) { + /* destroy all created tables */ + gro_ctx->gro_types = gro_types; + rte_gro_ctx_destroy(gro_ctx); + return NULL; + } + gro_types |= gro_type_flag; + } + gro_ctx->gro_types = param->gro_types; + + return gro_ctx; +} + +void +rte_gro_ctx_destroy(void *ctx) +{ + gro_tbl_destroy_fn destroy_tbl_fn; + struct gro_ctx *gro_ctx = ctx; + uint64_t gro_type_flag; + uint8_t i; + + for (i = 0; i < RTE_GRO_TYPE_MAX_NUM; i++) { + gro_type_flag = 1ULL << i; + if ((gro_ctx->gro_types & gro_type_flag) == 0) + continue; + destroy_tbl_fn = tbl_destroy_fn[i]; + if (destroy_tbl_fn) + destroy_tbl_fn(gro_ctx->tbls[i]); + } + rte_free(gro_ctx); +} + +uint16_t +rte_gro_reassemble_burst(struct rte_mbuf **pkts, + uint16_t nb_pkts, + const struct rte_gro_param *param) +{ + /* allocate a reassembly table for TCP/IPv4 GRO */ + struct gro_tcp4_tbl tcp_tbl; + struct gro_tcp4_flow tcp_flows[RTE_GRO_MAX_BURST_ITEM_NUM]; + struct gro_tcp4_item tcp_items[RTE_GRO_MAX_BURST_ITEM_NUM] = {{0} }; + + /* Allocate a reassembly table for VXLAN GRO */ + struct gro_vxlan_tcp4_tbl vxlan_tbl; + struct gro_vxlan_tcp4_flow vxlan_flows[RTE_GRO_MAX_BURST_ITEM_NUM]; + struct gro_vxlan_tcp4_item vxlan_items[RTE_GRO_MAX_BURST_ITEM_NUM] = { + {{0}, 0, 0} }; + + struct rte_mbuf *unprocess_pkts[nb_pkts]; + uint32_t item_num; + int32_t ret; + uint16_t i, unprocess_num = 0, nb_after_gro = nb_pkts; + uint8_t do_tcp4_gro = 0, do_vxlan_gro = 0; + + if (unlikely((param->gro_types & (RTE_GRO_IPV4_VXLAN_TCP_IPV4 | + RTE_GRO_TCP_IPV4)) == 0)) + return nb_pkts; + + /* Get the maximum number of packets */ + item_num = RTE_MIN(nb_pkts, (param->max_flow_num * + param->max_item_per_flow)); + item_num = RTE_MIN(item_num, RTE_GRO_MAX_BURST_ITEM_NUM); + + if (param->gro_types & RTE_GRO_IPV4_VXLAN_TCP_IPV4) { + for (i = 0; i < item_num; i++) + vxlan_flows[i].start_index = INVALID_ARRAY_INDEX; + + vxlan_tbl.flows = vxlan_flows; + vxlan_tbl.items = vxlan_items; + vxlan_tbl.flow_num = 0; + vxlan_tbl.item_num = 0; + vxlan_tbl.max_flow_num = item_num; + vxlan_tbl.max_item_num = item_num; + do_vxlan_gro = 1; + } + + if (param->gro_types & RTE_GRO_TCP_IPV4) { + for (i = 0; i < item_num; i++) + tcp_flows[i].start_index = INVALID_ARRAY_INDEX; + + tcp_tbl.flows = tcp_flows; + tcp_tbl.items = tcp_items; + tcp_tbl.flow_num = 0; + tcp_tbl.item_num = 0; + tcp_tbl.max_flow_num = item_num; + tcp_tbl.max_item_num = item_num; + do_tcp4_gro = 1; + } + + for (i = 0; i < nb_pkts; i++) { + /* + 
* The timestamp is ignored, since all packets + * will be flushed from the tables. + */ + if (IS_IPV4_VXLAN_TCP4_PKT(pkts[i]->packet_type) && + do_vxlan_gro) { + ret = gro_vxlan_tcp4_reassemble(pkts[i], &vxlan_tbl, 0); + if (ret > 0) + /* Merge successfully */ + nb_after_gro--; + else if (ret < 0) + unprocess_pkts[unprocess_num++] = pkts[i]; + } else if (IS_IPV4_TCP_PKT(pkts[i]->packet_type) && + do_tcp4_gro) { + ret = gro_tcp4_reassemble(pkts[i], &tcp_tbl, 0); + if (ret > 0) + /* merge successfully */ + nb_after_gro--; + else if (ret < 0) + unprocess_pkts[unprocess_num++] = pkts[i]; + } else + unprocess_pkts[unprocess_num++] = pkts[i]; + } + + if (nb_after_gro < nb_pkts) { + i = 0; + /* Flush all packets from the tables */ + if (do_vxlan_gro) { + i = gro_vxlan_tcp4_tbl_timeout_flush(&vxlan_tbl, + 0, pkts, nb_pkts); + } + if (do_tcp4_gro) { + i += gro_tcp4_tbl_timeout_flush(&tcp_tbl, 0, + &pkts[i], nb_pkts - i); + } + /* Copy unprocessed packets */ + if (unprocess_num > 0) { + memcpy(&pkts[i], unprocess_pkts, + sizeof(struct rte_mbuf *) * + unprocess_num); + } + } + + return nb_after_gro; +} + +uint16_t +rte_gro_reassemble(struct rte_mbuf **pkts, + uint16_t nb_pkts, + void *ctx) +{ + struct rte_mbuf *unprocess_pkts[nb_pkts]; + struct gro_ctx *gro_ctx = ctx; + void *tcp_tbl, *vxlan_tbl; + uint64_t current_time; + uint16_t i, unprocess_num = 0; + uint8_t do_tcp4_gro, do_vxlan_gro; + + if (unlikely((gro_ctx->gro_types & (RTE_GRO_IPV4_VXLAN_TCP_IPV4 | + RTE_GRO_TCP_IPV4)) == 0)) + return nb_pkts; + + tcp_tbl = gro_ctx->tbls[RTE_GRO_TCP_IPV4_INDEX]; + vxlan_tbl = gro_ctx->tbls[RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX]; + + do_tcp4_gro = (gro_ctx->gro_types & RTE_GRO_TCP_IPV4) == + RTE_GRO_TCP_IPV4; + do_vxlan_gro = (gro_ctx->gro_types & RTE_GRO_IPV4_VXLAN_TCP_IPV4) == + RTE_GRO_IPV4_VXLAN_TCP_IPV4; + + current_time = rte_rdtsc(); + + for (i = 0; i < nb_pkts; i++) { + if (IS_IPV4_VXLAN_TCP4_PKT(pkts[i]->packet_type) && + do_vxlan_gro) { + if (gro_vxlan_tcp4_reassemble(pkts[i], vxlan_tbl, + current_time) < 0) + unprocess_pkts[unprocess_num++] = pkts[i]; + } else if (IS_IPV4_TCP_PKT(pkts[i]->packet_type) && + do_tcp4_gro) { + if (gro_tcp4_reassemble(pkts[i], tcp_tbl, + current_time) < 0) + unprocess_pkts[unprocess_num++] = pkts[i]; + } else + unprocess_pkts[unprocess_num++] = pkts[i]; + } + if (unprocess_num > 0) { + memcpy(pkts, unprocess_pkts, sizeof(struct rte_mbuf *) * + unprocess_num); + } + + return unprocess_num; +} + +uint16_t +rte_gro_timeout_flush(void *ctx, + uint64_t timeout_cycles, + uint64_t gro_types, + struct rte_mbuf **out, + uint16_t max_nb_out) +{ + struct gro_ctx *gro_ctx = ctx; + uint64_t flush_timestamp; + uint16_t num = 0; + + gro_types = gro_types & gro_ctx->gro_types; + flush_timestamp = rte_rdtsc() - timeout_cycles; + + if (gro_types & RTE_GRO_IPV4_VXLAN_TCP_IPV4) { + num = gro_vxlan_tcp4_tbl_timeout_flush(gro_ctx->tbls[ + RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX], + flush_timestamp, out, max_nb_out); + max_nb_out -= num; + } + + /* If no available space in 'out', stop flushing. 
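+	 * The VxLAN flush above has already consumed part of 'out' and
+	 * decremented max_nb_out accordingly, so only the remaining room
+	 * is offered to the TCP/IPv4 flush below.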
*/ + if ((gro_types & RTE_GRO_TCP_IPV4) && max_nb_out > 0) { + num += gro_tcp4_tbl_timeout_flush( + gro_ctx->tbls[RTE_GRO_TCP_IPV4_INDEX], + flush_timestamp, + &out[num], max_nb_out); + } + + return num; +} + +uint64_t +rte_gro_get_pkt_count(void *ctx) +{ + struct gro_ctx *gro_ctx = ctx; + gro_tbl_pkt_count_fn pkt_count_fn; + uint64_t gro_types = gro_ctx->gro_types, flag; + uint64_t item_num = 0; + uint8_t i; + + for (i = 0; i < RTE_GRO_TYPE_MAX_NUM && gro_types; i++) { + flag = 1ULL << i; + if ((gro_types & flag) == 0) + continue; + + gro_types ^= flag; + pkt_count_fn = tbl_pkt_count_fn[i]; + if (pkt_count_fn) + item_num += pkt_count_fn(gro_ctx->tbls[i]); + } + + return item_num; +} diff --git a/src/spdk/dpdk/lib/librte_gro/rte_gro.h b/src/spdk/dpdk/lib/librte_gro/rte_gro.h new file mode 100644 index 00000000..8d781b5f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/rte_gro.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_GRO_H_ +#define _RTE_GRO_H_ + +/** + * @file + * Interface to GRO library + */ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_GRO_MAX_BURST_ITEM_NUM 128U +/**< the max number of packets that rte_gro_reassemble_burst() + * can process in each invocation. + */ +#define RTE_GRO_TYPE_MAX_NUM 64 +/**< the max number of supported GRO types */ +#define RTE_GRO_TYPE_SUPPORT_NUM 2 +/**< the number of currently supported GRO types */ + +#define RTE_GRO_TCP_IPV4_INDEX 0 +#define RTE_GRO_TCP_IPV4 (1ULL << RTE_GRO_TCP_IPV4_INDEX) +/**< TCP/IPv4 GRO flag */ +#define RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX 1 +#define RTE_GRO_IPV4_VXLAN_TCP_IPV4 (1ULL << RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX) +/**< VxLAN GRO flag. */ + +/** + * Structure used to create GRO context objects or used to pass + * application-determined parameters to rte_gro_reassemble_burst(). + */ +struct rte_gro_param { + uint64_t gro_types; + /**< desired GRO types */ + uint16_t max_flow_num; + /**< max flow number */ + uint16_t max_item_per_flow; + /**< max packet number per flow */ + uint16_t socket_id; + /**< socket index for allocating GRO related data structures, + * like reassembly tables. When use rte_gro_reassemble_burst(), + * applications don't need to set this value. + */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * This function create a GRO context object, which is used to merge + * packets in rte_gro_reassemble(). + * + * @param param + * applications use it to pass needed parameters to create a GRO + * context object. + * + * @return + * if create successfully, return a pointer which points to the GRO + * context object. Otherwise, return NULL. + */ +void *rte_gro_ctx_create(const struct rte_gro_param *param); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * This function destroys a GRO context object. + * + * @param ctx + * pointer points to a GRO context object. + */ +void rte_gro_ctx_destroy(void *ctx); + +/** + * This is one of the main reassembly APIs, which merges numbers of + * packets at a time. It doesn't check if input packets have correct + * checksums and doesn't re-calculate checksums for merged packets. + * It assumes the packets are complete (i.e., MF==0 && frag_off==0), + * when IP fragmentation is possible (i.e., DF==0). The GROed packets + * are returned as soon as the function finishes. + * + * @param pkts + * Pointer array pointing to the packets to reassemble. 
Besides, it + * keeps MBUF addresses for the GROed packets. + * @param nb_pkts + * The number of packets to reassemble + * @param param + * Application-determined parameters for reassembling packets. + * + * @return + * The number of packets after been GROed. If no packets are merged, + * the return value is equals to nb_pkts. + */ +uint16_t rte_gro_reassemble_burst(struct rte_mbuf **pkts, + uint16_t nb_pkts, + const struct rte_gro_param *param); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reassembly function, which tries to merge input packets with the + * existed packets in the reassembly tables of a given GRO context. + * It doesn't check if input packets have correct checksums and doesn't + * re-calculate checksums for merged packets. Additionally, it assumes + * the packets are complete (i.e., MF==0 && frag_off==0), when IP + * fragmentation is possible (i.e., DF==0). + * + * If the input packets have invalid parameters (e.g. no data payload, + * unsupported GRO types), they are returned to applications. Otherwise, + * they are either merged or inserted into the table. Applications need + * to flush packets from the tables by flush API, if they want to get the + * GROed packets. + * + * @param pkts + * Packets to reassemble. It's also used to store the unprocessed packets. + * @param nb_pkts + * The number of packets to reassemble + * @param ctx + * GRO context object pointer + * + * @return + * The number of unprocessed packets. + */ +uint16_t rte_gro_reassemble(struct rte_mbuf **pkts, + uint16_t nb_pkts, + void *ctx); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * This function flushes the timeout packets from the reassembly tables + * of desired GRO types. The max number of flushed packets is the + * element number of 'out'. + * + * Additionally, the flushed packets may have incorrect checksums, since + * this function doesn't re-calculate checksums for merged packets. + * + * @param ctx + * GRO context object pointer. + * @param timeout_cycles + * The max TTL for packets in reassembly tables, measured in nanosecond. + * @param gro_types + * This function flushes packets whose GRO types are specified by + * gro_types. + * @param out + * Pointer array used to keep flushed packets. + * @param max_nb_out + * The element number of 'out'. It's also the max number of timeout + * packets that can be flushed finally. + * + * @return + * The number of flushed packets. + */ +uint16_t rte_gro_timeout_flush(void *ctx, + uint64_t timeout_cycles, + uint64_t gro_types, + struct rte_mbuf **out, + uint16_t max_nb_out); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * This function returns the number of packets in all reassembly tables + * of a given GRO context. + * + * @param ctx + * GRO context object pointer. + * + * @return + * The number of packets in the tables. 
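+ *
+ * For reference, a minimal sketch of the context-based GRO flow using
+ * the APIs declared above (port_id, queue_id and timeout_cycles are
+ * assumed to be defined by the application; the array sizes and
+ * parameter values are illustrative, not requirements):
+ *
+ *   struct rte_mbuf *pkts[32], *flushed[32];
+ *   struct rte_gro_param param = {
+ *           .gro_types = RTE_GRO_TCP_IPV4,
+ *           .max_flow_num = 16,
+ *           .max_item_per_flow = 16,
+ *           .socket_id = rte_socket_id(),
+ *   };
+ *   void *ctx = rte_gro_ctx_create(&param);
+ *   uint16_t nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
+ *   uint16_t nb_left = rte_gro_reassemble(pkts, nb_rx, ctx);
+ *   uint16_t nb_flushed = rte_gro_timeout_flush(ctx, timeout_cycles,
+ *                   RTE_GRO_TCP_IPV4, flushed, RTE_DIM(flushed));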
+ */ +uint64_t rte_gro_get_pkt_count(void *ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_GRO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_gro/rte_gro_version.map b/src/spdk/dpdk/lib/librte_gro/rte_gro_version.map new file mode 100644 index 00000000..1606b6dc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gro/rte_gro_version.map @@ -0,0 +1,12 @@ +DPDK_17.08 { + global: + + rte_gro_ctx_create; + rte_gro_ctx_destroy; + rte_gro_get_pkt_count; + rte_gro_reassemble; + rte_gro_reassemble_burst; + rte_gro_timeout_flush; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_gso/Makefile b/src/spdk/dpdk/lib/librte_gso/Makefile new file mode 100644 index 00000000..1fac53a8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_gso.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +LDLIBS += -lrte_eal -lrte_mbuf -lrte_ethdev -lrte_net +LDLIBS += -lrte_mempool + +EXPORT_MAP := rte_gso_version.map + +LIBABIVER := 1 + +#source files +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp4.c +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tunnel_tcp4.c +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_udp4.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_gso/gso_common.c b/src/spdk/dpdk/lib/librte_gso/gso_common.c new file mode 100644 index 00000000..0fad1132 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/gso_common.c @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include + +#include +#include + +#include "gso_common.h" + +static inline void +hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt, + uint16_t pkt_hdr_offset) +{ + /* Copy MBUF metadata */ + hdr_segment->nb_segs = 1; + hdr_segment->port = pkt->port; + hdr_segment->ol_flags = pkt->ol_flags; + hdr_segment->packet_type = pkt->packet_type; + hdr_segment->pkt_len = pkt_hdr_offset; + hdr_segment->data_len = pkt_hdr_offset; + hdr_segment->tx_offload = pkt->tx_offload; + + /* Copy the packet header */ + rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *), + rte_pktmbuf_mtod(pkt, char *), + pkt_hdr_offset); +} + +static inline void +free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts) +{ + uint16_t i; + + for (i = 0; i < nb_pkts; i++) + rte_pktmbuf_free(pkts[i]); +} + +int +gso_do_segment(struct rte_mbuf *pkt, + uint16_t pkt_hdr_offset, + uint16_t pyld_unit_size, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out) +{ + struct rte_mbuf *pkt_in; + struct rte_mbuf *hdr_segment, *pyld_segment, *prev_segment; + uint16_t pkt_in_data_pos, segment_bytes_remaining; + uint16_t pyld_len, nb_segs; + bool more_in_pkt, more_out_segs; + + pkt_in = pkt; + nb_segs = 0; + more_in_pkt = 1; + pkt_in_data_pos = pkt_hdr_offset; + + while (more_in_pkt) { + if (unlikely(nb_segs >= nb_pkts_out)) { + free_gso_segment(pkts_out, nb_segs); + return -EINVAL; + } + + /* Allocate a direct MBUF */ + hdr_segment = rte_pktmbuf_alloc(direct_pool); + if (unlikely(hdr_segment == NULL)) { + free_gso_segment(pkts_out, nb_segs); + return -ENOMEM; + } + /* Fill the packet header */ + hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset); + + prev_segment = hdr_segment; + segment_bytes_remaining = 
pyld_unit_size; + more_out_segs = 1; + + while (more_out_segs && more_in_pkt) { + /* Allocate an indirect MBUF */ + pyld_segment = rte_pktmbuf_alloc(indirect_pool); + if (unlikely(pyld_segment == NULL)) { + rte_pktmbuf_free(hdr_segment); + free_gso_segment(pkts_out, nb_segs); + return -ENOMEM; + } + /* Attach to current MBUF segment of pkt */ + rte_pktmbuf_attach(pyld_segment, pkt_in); + + prev_segment->next = pyld_segment; + prev_segment = pyld_segment; + + pyld_len = segment_bytes_remaining; + if (pyld_len + pkt_in_data_pos > pkt_in->data_len) + pyld_len = pkt_in->data_len - pkt_in_data_pos; + + pyld_segment->data_off = pkt_in_data_pos + + pkt_in->data_off; + pyld_segment->data_len = pyld_len; + + /* Update header segment */ + hdr_segment->pkt_len += pyld_len; + hdr_segment->nb_segs++; + + pkt_in_data_pos += pyld_len; + segment_bytes_remaining -= pyld_len; + + /* Finish processing a MBUF segment of pkt */ + if (pkt_in_data_pos == pkt_in->data_len) { + pkt_in = pkt_in->next; + pkt_in_data_pos = 0; + if (pkt_in == NULL) + more_in_pkt = 0; + } + + /* Finish generating a GSO segment */ + if (segment_bytes_remaining == 0) + more_out_segs = 0; + } + pkts_out[nb_segs++] = hdr_segment; + } + return nb_segs; +} diff --git a/src/spdk/dpdk/lib/librte_gso/gso_common.h b/src/spdk/dpdk/lib/librte_gso/gso_common.h new file mode 100644 index 00000000..6cd764ff --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/gso_common.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _GSO_COMMON_H_ +#define _GSO_COMMON_H_ + +#include + +#include +#include +#include +#include + +#define IS_FRAGMENTED(frag_off) (((frag_off) & IPV4_HDR_OFFSET_MASK) != 0 \ + || ((frag_off) & IPV4_HDR_MF_FLAG) == IPV4_HDR_MF_FLAG) + +#define TCP_HDR_PSH_MASK ((uint8_t)0x08) +#define TCP_HDR_FIN_MASK ((uint8_t)0x01) + +#define IS_IPV4_TCP(flag) (((flag) & (PKT_TX_TCP_SEG | PKT_TX_IPV4)) == \ + (PKT_TX_TCP_SEG | PKT_TX_IPV4)) + +#define IS_IPV4_VXLAN_TCP4(flag) (((flag) & (PKT_TX_TCP_SEG | PKT_TX_IPV4 | \ + PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN)) == \ + (PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | \ + PKT_TX_TUNNEL_VXLAN)) + +#define IS_IPV4_GRE_TCP4(flag) (((flag) & (PKT_TX_TCP_SEG | PKT_TX_IPV4 | \ + PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_GRE)) == \ + (PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | \ + PKT_TX_TUNNEL_GRE)) + +#define IS_IPV4_UDP(flag) (((flag) & (PKT_TX_UDP_SEG | PKT_TX_IPV4)) == \ + (PKT_TX_UDP_SEG | PKT_TX_IPV4)) + +/** + * Internal function which updates the UDP header of a packet, following + * segmentation. This is required to update the header's datagram length field. + * + * @param pkt + * The packet containing the UDP header. + * @param udp_offset + * The offset of the UDP header from the start of the packet. + */ +static inline void +update_udp_header(struct rte_mbuf *pkt, uint16_t udp_offset) +{ + struct udp_hdr *udp_hdr; + + udp_hdr = (struct udp_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + udp_offset); + udp_hdr->dgram_len = rte_cpu_to_be_16(pkt->pkt_len - udp_offset); +} + +/** + * Internal function which updates the TCP header of a packet, following + * segmentation. This is required to update the header's 'sent' sequence + * number, and also to clear 'PSH' and 'FIN' flags for non-tail segments. + * + * @param pkt + * The packet containing the TCP header. + * @param l4_offset + * The offset of the TCP header from the start of the packet. + * @param sent_seq + * The sent sequence number. 
+ * @param non-tail + * Indicates whether or not this is a tail segment. + */ +static inline void +update_tcp_header(struct rte_mbuf *pkt, uint16_t l4_offset, uint32_t sent_seq, + uint8_t non_tail) +{ + struct tcp_hdr *tcp_hdr; + + tcp_hdr = (struct tcp_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + l4_offset); + tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq); + if (likely(non_tail)) + tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | + TCP_HDR_FIN_MASK)); +} + +/** + * Internal function which updates the IPv4 header of a packet, following + * segmentation. This is required to update the header's 'total_length' field, + * to reflect the reduced length of the now-segmented packet. Furthermore, the + * header's 'packet_id' field must be updated to reflect the new ID of the + * now-segmented packet. + * + * @param pkt + * The packet containing the IPv4 header. + * @param l3_offset + * The offset of the IPv4 header from the start of the packet. + * @param id + * The new ID of the packet. + */ +static inline void +update_ipv4_header(struct rte_mbuf *pkt, uint16_t l3_offset, uint16_t id) +{ + struct ipv4_hdr *ipv4_hdr; + + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + l3_offset); + ipv4_hdr->total_length = rte_cpu_to_be_16(pkt->pkt_len - l3_offset); + ipv4_hdr->packet_id = rte_cpu_to_be_16(id); +} + +/** + * Internal function which divides the input packet into small segments. + * Each of the newly-created segments is organized as a two-segment MBUF, + * where the first segment is a standard mbuf, which stores a copy of + * packet header, and the second is an indirect mbuf which points to a + * section of data in the input packet. + * + * @param pkt + * Packet to segment. + * @param pkt_hdr_offset + * Packet header offset, measured in bytes. + * @param pyld_unit_size + * The max payload length of a GSO segment. + * @param direct_pool + * MBUF pool used for allocating direct buffers for output segments. + * @param indirect_pool + * MBUF pool used for allocating indirect buffers for output segments. + * @param pkts_out + * Pointer array used to keep the mbuf addresses of output segments. If + * the memory space in pkts_out is insufficient, gso_do_segment() fails + * and returns -EINVAL. + * @param nb_pkts_out + * The max number of items that pkts_out can keep. + * + * @return + * - The number of segments created in the event of success. + * - Return -ENOMEM if run out of memory in MBUF pools. + * - Return -EINVAL for invalid parameters. 
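+ *
+ * As an illustration only, the TCP/IPv4 path in this library derives
+ * its arguments as follows before calling this helper:
+ *
+ *   hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ *   pyld_unit_size = gso_size - hdr_offset;
+ *   ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size,
+ *                   direct_pool, indirect_pool, pkts_out, nb_pkts_out);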
+ */ +int gso_do_segment(struct rte_mbuf *pkt, + uint16_t pkt_hdr_offset, + uint16_t pyld_unit_size, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out); +#endif diff --git a/src/spdk/dpdk/lib/librte_gso/gso_tcp4.c b/src/spdk/dpdk/lib/librte_gso/gso_tcp4.c new file mode 100644 index 00000000..fbd95f8f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/gso_tcp4.c @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include "gso_common.h" +#include "gso_tcp4.h" + +static void +update_ipv4_tcp_headers(struct rte_mbuf *pkt, uint8_t ipid_delta, + struct rte_mbuf **segs, uint16_t nb_segs) +{ + struct ipv4_hdr *ipv4_hdr; + struct tcp_hdr *tcp_hdr; + uint32_t sent_seq; + uint16_t id, tail_idx, i; + uint16_t l3_offset = pkt->l2_len; + uint16_t l4_offset = l3_offset + pkt->l3_len; + + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char*) + + l3_offset); + tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len); + id = rte_be_to_cpu_16(ipv4_hdr->packet_id); + sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq); + tail_idx = nb_segs - 1; + + for (i = 0; i < nb_segs; i++) { + update_ipv4_header(segs[i], l3_offset, id); + update_tcp_header(segs[i], l4_offset, sent_seq, i < tail_idx); + id += ipid_delta; + sent_seq += (segs[i]->pkt_len - segs[i]->data_len); + } +} + +int +gso_tcp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + uint8_t ipid_delta, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out) +{ + struct ipv4_hdr *ipv4_hdr; + uint16_t pyld_unit_size, hdr_offset; + uint16_t frag_off; + int ret; + + /* Don't process the fragmented packet */ + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + pkt->l2_len); + frag_off = rte_be_to_cpu_16(ipv4_hdr->fragment_offset); + if (unlikely(IS_FRAGMENTED(frag_off))) { + pkts_out[0] = pkt; + return 1; + } + + /* Don't process the packet without data */ + hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len; + if (unlikely(hdr_offset >= pkt->pkt_len)) { + pkts_out[0] = pkt; + return 1; + } + + pyld_unit_size = gso_size - hdr_offset; + + /* Segment the payload */ + ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool, + indirect_pool, pkts_out, nb_pkts_out); + if (ret > 1) + update_ipv4_tcp_headers(pkt, ipid_delta, pkts_out, ret); + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_gso/gso_tcp4.h b/src/spdk/dpdk/lib/librte_gso/gso_tcp4.h new file mode 100644 index 00000000..10d4b3ac --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/gso_tcp4.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _GSO_TCP4_H_ +#define _GSO_TCP4_H_ + +#include +#include + +/** + * Segment an IPv4/TCP packet. This function doesn't check if the input + * packet has correct checksums, and doesn't update checksums for output + * GSO segments. Furthermore, it doesn't process IP fragment packets. + * + * @param pkt + * The packet mbuf to segment. + * @param gso_size + * The max length of a GSO segment, measured in bytes. + * @param ipid_delta + * The increasing unit of IP ids. + * @param direct_pool + * MBUF pool used for allocating direct buffers for output segments. + * @param indirect_pool + * MBUF pool used for allocating indirect buffers for output segments. + * @param pkts_out + * Pointer array used to store the MBUF addresses of output GSO + * segments, when the function succeeds. 
If the memory space in + * pkts_out is insufficient, it fails and returns -EINVAL. + * @param nb_pkts_out + * The max number of items that 'pkts_out' can keep. + * + * @return + * - The number of GSO segments filled in pkts_out on success. + * - Return -ENOMEM if run out of memory in MBUF pools. + * - Return -EINVAL for invalid parameters. + */ +int gso_tcp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + uint8_t ip_delta, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out); +#endif diff --git a/src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.c b/src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.c new file mode 100644 index 00000000..d39b4686 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.c @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include "gso_common.h" +#include "gso_tunnel_tcp4.h" + +static void +update_tunnel_ipv4_tcp_headers(struct rte_mbuf *pkt, uint8_t ipid_delta, + struct rte_mbuf **segs, uint16_t nb_segs) +{ + struct ipv4_hdr *ipv4_hdr; + struct tcp_hdr *tcp_hdr; + uint32_t sent_seq; + uint16_t outer_id, inner_id, tail_idx, i; + uint16_t outer_ipv4_offset, inner_ipv4_offset; + uint16_t udp_gre_offset, tcp_offset; + uint8_t update_udp_hdr; + + outer_ipv4_offset = pkt->outer_l2_len; + udp_gre_offset = outer_ipv4_offset + pkt->outer_l3_len; + inner_ipv4_offset = udp_gre_offset + pkt->l2_len; + tcp_offset = inner_ipv4_offset + pkt->l3_len; + + /* Outer IPv4 header. */ + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + outer_ipv4_offset); + outer_id = rte_be_to_cpu_16(ipv4_hdr->packet_id); + + /* Inner IPv4 header. */ + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + inner_ipv4_offset); + inner_id = rte_be_to_cpu_16(ipv4_hdr->packet_id); + + tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len); + sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq); + tail_idx = nb_segs - 1; + + /* Only update UDP header for VxLAN packets. */ + update_udp_hdr = (pkt->ol_flags & PKT_TX_TUNNEL_VXLAN) ? 1 : 0; + + for (i = 0; i < nb_segs; i++) { + update_ipv4_header(segs[i], outer_ipv4_offset, outer_id); + if (update_udp_hdr) + update_udp_header(segs[i], udp_gre_offset); + update_ipv4_header(segs[i], inner_ipv4_offset, inner_id); + update_tcp_header(segs[i], tcp_offset, sent_seq, i < tail_idx); + outer_id++; + inner_id += ipid_delta; + sent_seq += (segs[i]->pkt_len - segs[i]->data_len); + } +} + +int +gso_tunnel_tcp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + uint8_t ipid_delta, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out) +{ + struct ipv4_hdr *inner_ipv4_hdr; + uint16_t pyld_unit_size, hdr_offset, frag_off; + int ret = 1; + + hdr_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len; + inner_ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + hdr_offset); + /* + * Don't process the packet whose MF bit or offset in the inner + * IPv4 header are non-zero. 
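+	 * Such fragmented packets are returned to the caller unchanged.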
+ */ + frag_off = rte_be_to_cpu_16(inner_ipv4_hdr->fragment_offset); + if (unlikely(IS_FRAGMENTED(frag_off))) { + pkts_out[0] = pkt; + return 1; + } + + hdr_offset += pkt->l3_len + pkt->l4_len; + /* Don't process the packet without data */ + if (hdr_offset >= pkt->pkt_len) { + pkts_out[0] = pkt; + return 1; + } + pyld_unit_size = gso_size - hdr_offset; + + /* Segment the payload */ + ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool, + indirect_pool, pkts_out, nb_pkts_out); + if (ret <= 1) + return ret; + + update_tunnel_ipv4_tcp_headers(pkt, ipid_delta, pkts_out, ret); + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.h b/src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.h new file mode 100644 index 00000000..c522ed27 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/gso_tunnel_tcp4.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _GSO_TUNNEL_TCP4_H_ +#define _GSO_TUNNEL_TCP4_H_ + +#include +#include + +/** + * Segment a tunneling packet with inner TCP/IPv4 headers. This function + * doesn't check if the input packet has correct checksums, and doesn't + * update checksums for output GSO segments. Furthermore, it doesn't + * process IP fragment packets. + * + * @param pkt + * The packet mbuf to segment. + * @param gso_size + * The max length of a GSO segment, measured in bytes. + * @param ipid_delta + * The increasing unit of IP ids. + * @param direct_pool + * MBUF pool used for allocating direct buffers for output segments. + * @param indirect_pool + * MBUF pool used for allocating indirect buffers for output segments. + * @param pkts_out + * Pointer array used to store the MBUF addresses of output GSO + * segments, when it succeeds. If the memory space in pkts_out is + * insufficient, it fails and returns -EINVAL. + * @param nb_pkts_out + * The max number of items that 'pkts_out' can keep. + * + * @return + * - The number of GSO segments filled in pkts_out on success. + * - Return -ENOMEM if run out of memory in MBUF pools. + * - Return -EINVAL for invalid parameters. + */ +int gso_tunnel_tcp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + uint8_t ipid_delta, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out); +#endif diff --git a/src/spdk/dpdk/lib/librte_gso/gso_udp4.c b/src/spdk/dpdk/lib/librte_gso/gso_udp4.c new file mode 100644 index 00000000..927dee12 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/gso_udp4.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include "gso_common.h" +#include "gso_udp4.h" + +#define IPV4_HDR_MF_BIT (1U << 13) + +static inline void +update_ipv4_udp_headers(struct rte_mbuf *pkt, struct rte_mbuf **segs, + uint16_t nb_segs) +{ + struct ipv4_hdr *ipv4_hdr; + uint16_t frag_offset = 0, is_mf; + uint16_t l2_hdrlen = pkt->l2_len, l3_hdrlen = pkt->l3_len; + uint16_t tail_idx = nb_segs - 1, length, i; + + /* + * Update IP header fields for output segments. Specifically, + * keep the same IP id, update fragment offset and total + * length. + */ + for (i = 0; i < nb_segs; i++) { + ipv4_hdr = rte_pktmbuf_mtod_offset(segs[i], struct ipv4_hdr *, + l2_hdrlen); + length = segs[i]->pkt_len - l2_hdrlen; + ipv4_hdr->total_length = rte_cpu_to_be_16(length); + + is_mf = i < tail_idx ? 
IPV4_HDR_MF_BIT : 0; + ipv4_hdr->fragment_offset = + rte_cpu_to_be_16(frag_offset | is_mf); + frag_offset += ((length - l3_hdrlen) >> 3); + } +} + +int +gso_udp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out) +{ + struct ipv4_hdr *ipv4_hdr; + uint16_t pyld_unit_size, hdr_offset; + uint16_t frag_off; + int ret; + + /* Don't process the fragmented packet */ + ipv4_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *, + pkt->l2_len); + frag_off = rte_be_to_cpu_16(ipv4_hdr->fragment_offset); + if (unlikely(IS_FRAGMENTED(frag_off))) { + pkts_out[0] = pkt; + return 1; + } + + /* + * UDP fragmentation is the same as IP fragmentation. + * Except the first one, other output packets just have l2 + * and l3 headers. + */ + hdr_offset = pkt->l2_len + pkt->l3_len; + + /* Don't process the packet without data. */ + if (unlikely(hdr_offset + pkt->l4_len >= pkt->pkt_len)) { + pkts_out[0] = pkt; + return 1; + } + + pyld_unit_size = gso_size - hdr_offset; + + /* Segment the payload */ + ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool, + indirect_pool, pkts_out, nb_pkts_out); + if (ret > 1) + update_ipv4_udp_headers(pkt, pkts_out, ret); + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_gso/gso_udp4.h b/src/spdk/dpdk/lib/librte_gso/gso_udp4.h new file mode 100644 index 00000000..b2a2908e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/gso_udp4.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _GSO_UDP4_H_ +#define _GSO_UDP4_H_ + +#include +#include + +/** + * Segment an UDP/IPv4 packet. This function doesn't check if the input + * packet has correct checksums, and doesn't update checksums for output + * GSO segments. Furthermore, it doesn't process IP fragment packets. + * + * @param pkt + * The packet mbuf to segment. + * @param gso_size + * The max length of a GSO segment, measured in bytes. + * @param direct_pool + * MBUF pool used for allocating direct buffers for output segments. + * @param indirect_pool + * MBUF pool used for allocating indirect buffers for output segments. + * @param pkts_out + * Pointer array used to store the MBUF addresses of output GSO + * segments, when the function succeeds. If the memory space in + * pkts_out is insufficient, it fails and returns -EINVAL. + * @param nb_pkts_out + * The max number of items that 'pkts_out' can keep. + * + * @return + * - The number of GSO segments filled in pkts_out on success. + * - Return -ENOMEM if run out of memory in MBUF pools. + * - Return -EINVAL for invalid parameters. 
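+ *
+ * This function is normally reached through rte_gso_segment() rather
+ * than called directly. A sketch of the setup that routes a packet
+ * here (flag names as used by rte_gso.c; creation of gso_ctx and the
+ * mbuf pools is assumed to have been done elsewhere):
+ *
+ *   gso_ctx.gso_types |= DEV_TX_OFFLOAD_UDP_TSO;
+ *   pkt->ol_flags |= PKT_TX_UDP_SEG | PKT_TX_IPV4;
+ *   nb = rte_gso_segment(pkt, &gso_ctx, pkts_out, nb_pkts_out);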
+ */ +int gso_udp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out); +#endif diff --git a/src/spdk/dpdk/lib/librte_gso/meson.build b/src/spdk/dpdk/lib/librte_gso/meson.build new file mode 100644 index 00000000..ad8dd858 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/meson.build @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('gso_common.c', 'gso_tcp4.c', 'gso_udp4.c', + 'gso_tunnel_tcp4.c', 'rte_gso.c') +headers = files('rte_gso.h') +deps += ['ethdev'] diff --git a/src/spdk/dpdk/lib/librte_gso/rte_gso.c b/src/spdk/dpdk/lib/librte_gso/rte_gso.c new file mode 100644 index 00000000..751b5b62 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/rte_gso.c @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include + +#include +#include + +#include "rte_gso.h" +#include "gso_common.h" +#include "gso_tcp4.h" +#include "gso_tunnel_tcp4.h" +#include "gso_udp4.h" + +#define ILLEGAL_UDP_GSO_CTX(ctx) \ + ((((ctx)->gso_types & DEV_TX_OFFLOAD_UDP_TSO) == 0) || \ + (ctx)->gso_size < RTE_GSO_UDP_SEG_SIZE_MIN) + +#define ILLEGAL_TCP_GSO_CTX(ctx) \ + ((((ctx)->gso_types & (DEV_TX_OFFLOAD_TCP_TSO | \ + DEV_TX_OFFLOAD_VXLAN_TNL_TSO | \ + DEV_TX_OFFLOAD_GRE_TNL_TSO)) == 0) || \ + (ctx)->gso_size < RTE_GSO_SEG_SIZE_MIN) + +int +rte_gso_segment(struct rte_mbuf *pkt, + const struct rte_gso_ctx *gso_ctx, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out) +{ + struct rte_mempool *direct_pool, *indirect_pool; + struct rte_mbuf *pkt_seg; + uint64_t ol_flags; + uint16_t gso_size; + uint8_t ipid_delta; + int ret = 1; + + if (pkt == NULL || pkts_out == NULL || gso_ctx == NULL || + nb_pkts_out < 1 || + (ILLEGAL_UDP_GSO_CTX(gso_ctx) && + ILLEGAL_TCP_GSO_CTX(gso_ctx))) + return -EINVAL; + + if (gso_ctx->gso_size >= pkt->pkt_len) { + pkt->ol_flags &= (~(PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)); + pkts_out[0] = pkt; + return 1; + } + + direct_pool = gso_ctx->direct_pool; + indirect_pool = gso_ctx->indirect_pool; + gso_size = gso_ctx->gso_size; + ipid_delta = (gso_ctx->flag != RTE_GSO_FLAG_IPID_FIXED); + ol_flags = pkt->ol_flags; + + if ((IS_IPV4_VXLAN_TCP4(pkt->ol_flags) && + (gso_ctx->gso_types & DEV_TX_OFFLOAD_VXLAN_TNL_TSO)) || + ((IS_IPV4_GRE_TCP4(pkt->ol_flags) && + (gso_ctx->gso_types & DEV_TX_OFFLOAD_GRE_TNL_TSO)))) { + pkt->ol_flags &= (~PKT_TX_TCP_SEG); + ret = gso_tunnel_tcp4_segment(pkt, gso_size, ipid_delta, + direct_pool, indirect_pool, + pkts_out, nb_pkts_out); + } else if (IS_IPV4_TCP(pkt->ol_flags) && + (gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO)) { + pkt->ol_flags &= (~PKT_TX_TCP_SEG); + ret = gso_tcp4_segment(pkt, gso_size, ipid_delta, + direct_pool, indirect_pool, + pkts_out, nb_pkts_out); + } else if (IS_IPV4_UDP(pkt->ol_flags) && + (gso_ctx->gso_types & DEV_TX_OFFLOAD_UDP_TSO)) { + pkt->ol_flags &= (~PKT_TX_UDP_SEG); + ret = gso_udp4_segment(pkt, gso_size, direct_pool, + indirect_pool, pkts_out, nb_pkts_out); + } else { + /* unsupported packet, skip */ + pkts_out[0] = pkt; + RTE_LOG(DEBUG, GSO, "Unsupported packet type\n"); + return 1; + } + + if (ret > 1) { + pkt_seg = pkt; + while (pkt_seg) { + rte_mbuf_refcnt_update(pkt_seg, -1); + pkt_seg = pkt_seg->next; + } + } else if (ret < 0) { + /* Revert the ol_flags in the event of failure. 
*/ + pkt->ol_flags = ol_flags; + } + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_gso/rte_gso.h b/src/spdk/dpdk/lib/librte_gso/rte_gso.h new file mode 100644 index 00000000..a626a11e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/rte_gso.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_GSO_H_ +#define _RTE_GSO_H_ + +/** + * @file + * Interface to GSO library + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* Minimum GSO segment size for TCP based packets. */ +#define RTE_GSO_SEG_SIZE_MIN (sizeof(struct ether_hdr) + \ + sizeof(struct ipv4_hdr) + sizeof(struct tcp_hdr) + 1) + +/* Minimum GSO segment size for UDP based packets. */ +#define RTE_GSO_UDP_SEG_SIZE_MIN (sizeof(struct ether_hdr) + \ + sizeof(struct ipv4_hdr) + sizeof(struct udp_hdr) + 1) + +/* GSO flags for rte_gso_ctx. */ +#define RTE_GSO_FLAG_IPID_FIXED (1ULL << 0) +/**< Use fixed IP ids for output GSO segments. Setting + * 0 indicates using incremental IP ids. + */ + +/** + * GSO context structure. + */ +struct rte_gso_ctx { + struct rte_mempool *direct_pool; + /**< MBUF pool for allocating direct buffers, which are used + * to store packet headers for GSO segments. + */ + struct rte_mempool *indirect_pool; + /**< MBUF pool for allocating indirect buffers, which are used + * to locate packet payloads for GSO segments. The indirect + * buffer doesn't contain any data, but simply points to an + * offset within the packet to segment. + */ + uint64_t flag; + /**< flag that controls specific attributes of output segments, + * such as the type of IP ID generated (i.e. fixed or incremental). + */ + uint32_t gso_types; + /**< the bit mask of required GSO types. The GSO library + * uses the same macros as that of describing device TX + * offloading capabilities (i.e. DEV_TX_OFFLOAD_*_TSO) for + * gso_types. + * + * For example, if applications want to segment TCP/IPv4 + * packets, set DEV_TX_OFFLOAD_TCP_TSO in gso_types. + */ + uint16_t gso_size; + /**< maximum size of an output GSO segment, including packet + * header and payload, measured in bytes. Must exceed + * RTE_GSO_SEG_SIZE_MIN. + */ +}; + +/** + * Segmentation function, which supports processing of both single- and + * multi- MBUF packets. + * + * Note that we refer to the packets that are segmented from the input + * packet as 'GSO segments'. rte_gso_segment() doesn't check if the + * input packet has correct checksums, and doesn't update checksums for + * output GSO segments. Additionally, it doesn't process IP fragment + * packets. + * + * Before calling rte_gso_segment(), applications must set proper ol_flags + * for the packet. The GSO library uses the same macros as that of TSO. + * For example, set PKT_TX_TCP_SEG and PKT_TX_IPV4 in ol_flags to segment + * a TCP/IPv4 packet. If rte_gso_segment() succeeds, the PKT_TX_TCP_SEG + * flag is removed for all GSO segments and the input packet. + * + * Each of the newly-created GSO segments is organized as a two-segment + * MBUF, where the first segment is a standard MBUF, which stores a copy + * of packet header, and the second is an indirect MBUF which points to + * a section of data in the input packet. Since each GSO segment has + * multiple MBUFs (i.e. typically 2 MBUFs), the driver of the interface which + * the GSO segments are sent to should support transmission of multi-segment + * packets. + * + * If the input packet is GSO'd, its mbuf refcnt reduces by 1. 
Therefore, + * when all GSO segments are freed, the input packet is freed automatically. + * + * If the memory space in pkts_out or MBUF pools is insufficient, this + * function fails, and it returns (-1) * errno. Otherwise, GSO succeeds, + * and this function returns the number of output GSO segments filled in + * pkts_out. + * + * @param pkt + * The packet mbuf to segment. + * @param ctx + * GSO context object pointer. + * @param pkts_out + * Pointer array used to store the MBUF addresses of output GSO + * segments, when rte_gso_segment() succeeds. + * @param nb_pkts_out + * The max number of items that pkts_out can keep. + * + * @return + * - The number of GSO segments filled in pkts_out on success. + * - Return -ENOMEM if run out of memory in MBUF pools. + * - Return -EINVAL for invalid parameters. + */ +int rte_gso_segment(struct rte_mbuf *pkt, + const struct rte_gso_ctx *ctx, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out); +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_GSO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_gso/rte_gso_version.map b/src/spdk/dpdk/lib/librte_gso/rte_gso_version.map new file mode 100644 index 00000000..e1fd453e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_gso/rte_gso_version.map @@ -0,0 +1,7 @@ +DPDK_17.11 { + global: + + rte_gso_segment; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_hash/Makefile b/src/spdk/dpdk/lib/librte_hash/Makefile new file mode 100644 index 00000000..c8c435df --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2015 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_hash.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal -lrte_ring + +EXPORT_MAP := rte_hash_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_HASH) := rte_cuckoo_hash.c +SRCS-$(CONFIG_RTE_LIBRTE_HASH) += rte_fbk_hash.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_HASH)-include := rte_hash.h +SYMLINK-$(CONFIG_RTE_LIBRTE_HASH)-include += rte_hash_crc.h +ifeq ($(CONFIG_RTE_ARCH_ARM64),y) +ifneq ($(findstring RTE_MACHINE_CPUFLAG_CRC32,$(CFLAGS)),) +SYMLINK-$(CONFIG_RTE_LIBRTE_HASH)-include += rte_crc_arm64.h +endif +endif +SYMLINK-$(CONFIG_RTE_LIBRTE_HASH)-include += rte_jhash.h +SYMLINK-$(CONFIG_RTE_LIBRTE_HASH)-include += rte_thash.h +SYMLINK-$(CONFIG_RTE_LIBRTE_HASH)-include += rte_fbk_hash.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_hash/meson.build b/src/spdk/dpdk/lib/librte_hash/meson.build new file mode 100644 index 00000000..efc06ede --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/meson.build @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 2 +headers = files('rte_cmp_arm64.h', + 'rte_cmp_x86.h', + 'rte_crc_arm64.h', + 'rte_cuckoo_hash.h', + 'rte_fbk_hash.h', + 'rte_hash_crc.h', + 'rte_hash.h', + 'rte_jhash.h', + 'rte_thash.h') + +sources = files('rte_cuckoo_hash.c', 'rte_fbk_hash.c') +deps += ['ring'] diff --git a/src/spdk/dpdk/lib/librte_hash/rte_cmp_arm64.h b/src/spdk/dpdk/lib/librte_hash/rte_cmp_arm64.h new file mode 100644 index 00000000..e9e26f9a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_cmp_arm64.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +/* Functions to compare multiple of 16 byte keys (up to 128 bytes) */ +static int +rte_hash_k16_cmp_eq(const void *key1, const void *key2, + 
size_t key_len __rte_unused) +{ + uint64_t x0, x1, y0, y1; + + asm volatile( + "ldp %x[x1], %x[x0], [%x[p1]]" + : [x1]"=r"(x1), [x0]"=r"(x0) + : [p1]"r"(key1) + ); + asm volatile( + "ldp %x[y1], %x[y0], [%x[p2]]" + : [y1]"=r"(y1), [y0]"=r"(y0) + : [p2]"r"(key2) + ); + x0 ^= y0; + x1 ^= y1; + return !(x0 == 0 && x1 == 0); +} + +static int +rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k16_cmp_eq(key1, key2, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 16, + (const char *) key2 + 16, key_len); +} + +static int +rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k16_cmp_eq(key1, key2, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 16, + (const char *) key2 + 16, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 32, + (const char *) key2 + 32, key_len); +} + +static int +rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k32_cmp_eq(key1, key2, key_len) || + rte_hash_k32_cmp_eq((const char *) key1 + 32, + (const char *) key2 + 32, key_len); +} + +static int +rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k64_cmp_eq(key1, key2, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 64, + (const char *) key2 + 64, key_len); +} + +static int +rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k64_cmp_eq(key1, key2, key_len) || + rte_hash_k32_cmp_eq((const char *) key1 + 64, + (const char *) key2 + 64, key_len); +} + +static int +rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k64_cmp_eq(key1, key2, key_len) || + rte_hash_k32_cmp_eq((const char *) key1 + 64, + (const char *) key2 + 64, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 96, + (const char *) key2 + 96, key_len); +} + +static int +rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k64_cmp_eq(key1, key2, key_len) || + rte_hash_k64_cmp_eq((const char *) key1 + 64, + (const char *) key2 + 64, key_len); +} diff --git a/src/spdk/dpdk/lib/librte_hash/rte_cmp_x86.h b/src/spdk/dpdk/lib/librte_hash/rte_cmp_x86.h new file mode 100644 index 00000000..e82b4c08 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_cmp_x86.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Intel Corporation + */ + +/* Functions to compare multiple of 16 byte keys (up to 128 bytes) */ +static int +rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused) +{ + const __m128i k1 = _mm_loadu_si128((const __m128i *) key1); + const __m128i k2 = _mm_loadu_si128((const __m128i *) key2); + const __m128i x = _mm_xor_si128(k1, k2); + + return !_mm_test_all_zeros(x, x); +} + +static int +rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k16_cmp_eq(key1, key2, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 16, + (const char *) key2 + 16, key_len); +} + +static int +rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k16_cmp_eq(key1, key2, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 16, + (const char *) key2 + 16, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 32, + (const char *) key2 + 32, key_len); +} + +static int +rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k32_cmp_eq(key1, key2, key_len) || + rte_hash_k32_cmp_eq((const 
char *) key1 + 32, + (const char *) key2 + 32, key_len); +} + +static int +rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k64_cmp_eq(key1, key2, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 64, + (const char *) key2 + 64, key_len); +} + +static int +rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k64_cmp_eq(key1, key2, key_len) || + rte_hash_k32_cmp_eq((const char *) key1 + 64, + (const char *) key2 + 64, key_len); +} + +static int +rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k64_cmp_eq(key1, key2, key_len) || + rte_hash_k32_cmp_eq((const char *) key1 + 64, + (const char *) key2 + 64, key_len) || + rte_hash_k16_cmp_eq((const char *) key1 + 96, + (const char *) key2 + 96, key_len); +} + +static int +rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len) +{ + return rte_hash_k64_cmp_eq(key1, key2, key_len) || + rte_hash_k64_cmp_eq((const char *) key1 + 64, + (const char *) key2 + 64, key_len); +} diff --git a/src/spdk/dpdk/lib/librte_hash/rte_crc_arm64.h b/src/spdk/dpdk/lib/librte_hash/rte_crc_arm64.h new file mode 100644 index 00000000..b4628cfc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_crc_arm64.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Cavium, Inc + */ + +#ifndef _RTE_CRC_ARM64_H_ +#define _RTE_CRC_ARM64_H_ + +/** + * @file + * + * RTE CRC arm64 Hash + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +static inline uint32_t +crc32c_arm64_u8(uint8_t data, uint32_t init_val) +{ + __asm__ volatile( + "crc32cb %w[crc], %w[crc], %w[value]" + : [crc] "+r" (init_val) + : [value] "r" (data)); + return init_val; +} + +static inline uint32_t +crc32c_arm64_u16(uint16_t data, uint32_t init_val) +{ + __asm__ volatile( + "crc32ch %w[crc], %w[crc], %w[value]" + : [crc] "+r" (init_val) + : [value] "r" (data)); + return init_val; +} + +static inline uint32_t +crc32c_arm64_u32(uint32_t data, uint32_t init_val) +{ + __asm__ volatile( + "crc32cw %w[crc], %w[crc], %w[value]" + : [crc] "+r" (init_val) + : [value] "r" (data)); + return init_val; +} + +static inline uint32_t +crc32c_arm64_u64(uint64_t data, uint32_t init_val) +{ + __asm__ volatile( + "crc32cx %w[crc], %w[crc], %x[value]" + : [crc] "+r" (init_val) + : [value] "r" (data)); + return init_val; +} + +/** + * Allow or disallow use of arm64 SIMD instrinsics for CRC32 hash + * calculation. + * + * @param alg + * An OR of following flags: + * - (CRC32_SW) Don't use arm64 crc intrinsics + * - (CRC32_ARM64) Use ARMv8 CRC intrinsic if available + * + */ +static inline void +rte_hash_crc_set_alg(uint8_t alg) +{ + switch (alg) { + case CRC32_ARM64: + if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_CRC32)) + alg = CRC32_SW; + /* fall-through */ + case CRC32_SW: + crc32_alg = alg; + /* fall-through */ + default: + break; + } +} + +/* Setting the best available algorithm */ +RTE_INIT(rte_hash_crc_init_alg) +{ + rte_hash_crc_set_alg(CRC32_ARM64); +} + +/** + * Use single crc32 instruction to perform a hash on a 1 byte value. + * Fall back to software crc32 implementation in case arm64 crc intrinsics is + * not supported + * + * @param data + * Data to perform hash on. + * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. 
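+ *
+ * Example (the all-ones seed is a common convention, not a
+ * requirement of this API):
+ *
+ *   uint32_t crc = rte_hash_crc_1byte(0xAB, 0xFFFFFFFF);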
+ */ +static inline uint32_t +rte_hash_crc_1byte(uint8_t data, uint32_t init_val) +{ + if (likely(crc32_alg & CRC32_ARM64)) + return crc32c_arm64_u8(data, init_val); + + return crc32c_1byte(data, init_val); +} + +/** + * Use single crc32 instruction to perform a hash on a 2 bytes value. + * Fall back to software crc32 implementation in case arm64 crc intrinsics is + * not supported + * + * @param data + * Data to perform hash on. + * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. + */ +static inline uint32_t +rte_hash_crc_2byte(uint16_t data, uint32_t init_val) +{ + if (likely(crc32_alg & CRC32_ARM64)) + return crc32c_arm64_u16(data, init_val); + + return crc32c_2bytes(data, init_val); +} + +/** + * Use single crc32 instruction to perform a hash on a 4 byte value. + * Fall back to software crc32 implementation in case arm64 crc intrinsics is + * not supported + * + * @param data + * Data to perform hash on. + * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. + */ +static inline uint32_t +rte_hash_crc_4byte(uint32_t data, uint32_t init_val) +{ + if (likely(crc32_alg & CRC32_ARM64)) + return crc32c_arm64_u32(data, init_val); + + return crc32c_1word(data, init_val); +} + +/** + * Use single crc32 instruction to perform a hash on a 8 byte value. + * Fall back to software crc32 implementation in case arm64 crc intrinsics is + * not supported + * + * @param data + * Data to perform hash on. + * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. + */ +static inline uint32_t +rte_hash_crc_8byte(uint64_t data, uint32_t init_val) +{ + if (likely(crc32_alg == CRC32_ARM64)) + return crc32c_arm64_u64(data, init_val); + + return crc32c_2words(data, init_val); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_CRC_ARM64_H_ */ diff --git a/src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.c b/src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.c new file mode 100644 index 00000000..f7b86c8c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.c @@ -0,0 +1,1344 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2016 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include /* for definition of RTE_CACHE_LINE_SIZE */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_hash.h" +#include "rte_cuckoo_hash.h" + + +TAILQ_HEAD(rte_hash_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_hash_tailq = { + .name = "RTE_HASH", +}; +EAL_REGISTER_TAILQ(rte_hash_tailq) + +struct rte_hash * +rte_hash_find_existing(const char *name) +{ + struct rte_hash *h = NULL; + struct rte_tailq_entry *te; + struct rte_hash_list *hash_list; + + hash_list = RTE_TAILQ_CAST(rte_hash_tailq.head, rte_hash_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, hash_list, next) { + h = (struct rte_hash *) te->data; + if (strncmp(name, h->name, RTE_HASH_NAMESIZE) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + return h; +} + +void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func) +{ + h->cmp_jump_table_idx = KEY_CUSTOM; + h->rte_hash_custom_cmp_eq = func; +} + +static inline int +rte_hash_cmp_eq(const void *key1, const void *key2, const struct rte_hash *h) +{ + if 
(h->cmp_jump_table_idx == KEY_CUSTOM) + return h->rte_hash_custom_cmp_eq(key1, key2, h->key_len); + else + return cmp_jump_table[h->cmp_jump_table_idx](key1, key2, h->key_len); +} + +struct rte_hash * +rte_hash_create(const struct rte_hash_parameters *params) +{ + struct rte_hash *h = NULL; + struct rte_tailq_entry *te = NULL; + struct rte_hash_list *hash_list; + struct rte_ring *r = NULL; + char hash_name[RTE_HASH_NAMESIZE]; + void *k = NULL; + void *buckets = NULL; + char ring_name[RTE_RING_NAMESIZE]; + unsigned num_key_slots; + unsigned i; + unsigned int hw_trans_mem_support = 0, multi_writer_support = 0; + unsigned int readwrite_concur_support = 0; + + rte_hash_function default_hash_func = (rte_hash_function)rte_jhash; + + hash_list = RTE_TAILQ_CAST(rte_hash_tailq.head, rte_hash_list); + + if (params == NULL) { + RTE_LOG(ERR, HASH, "rte_hash_create has no parameters\n"); + return NULL; + } + + /* Check for valid parameters */ + if ((params->entries > RTE_HASH_ENTRIES_MAX) || + (params->entries < RTE_HASH_BUCKET_ENTRIES) || + (params->key_len == 0)) { + rte_errno = EINVAL; + RTE_LOG(ERR, HASH, "rte_hash_create has invalid parameters\n"); + return NULL; + } + + /* Check extra flags field to check extra options. */ + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT) + hw_trans_mem_support = 1; + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD) + multi_writer_support = 1; + + if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY) { + readwrite_concur_support = 1; + multi_writer_support = 1; + } + + /* Store all keys and leave the first entry as a dummy entry for lookup_bulk */ + if (multi_writer_support) + /* + * Increase number of slots by total number of indices + * that can be stored in the lcore caches + * except for the first cache + */ + num_key_slots = params->entries + (RTE_MAX_LCORE - 1) * + (LCORE_CACHE_SIZE - 1) + 1; + else + num_key_slots = params->entries + 1; + + snprintf(ring_name, sizeof(ring_name), "HT_%s", params->name); + /* Create ring (Dummy slot index is not enqueued) */ + r = rte_ring_create(ring_name, rte_align32pow2(num_key_slots), + params->socket_id, 0); + if (r == NULL) { + RTE_LOG(ERR, HASH, "memory allocation failed\n"); + goto err; + } + + snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* guarantee there's no existing: this is normally already checked + * by ring creation above */ + TAILQ_FOREACH(te, hash_list, next) { + h = (struct rte_hash *) te->data; + if (strncmp(params->name, h->name, RTE_HASH_NAMESIZE) == 0) + break; + } + h = NULL; + if (te != NULL) { + rte_errno = EEXIST; + te = NULL; + goto err_unlock; + } + + te = rte_zmalloc("HASH_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, HASH, "tailq entry allocation failed\n"); + goto err_unlock; + } + + h = (struct rte_hash *)rte_zmalloc_socket(hash_name, sizeof(struct rte_hash), + RTE_CACHE_LINE_SIZE, params->socket_id); + + if (h == NULL) { + RTE_LOG(ERR, HASH, "memory allocation failed\n"); + goto err_unlock; + } + + const uint32_t num_buckets = rte_align32pow2(params->entries) + / RTE_HASH_BUCKET_ENTRIES; + + buckets = rte_zmalloc_socket(NULL, + num_buckets * sizeof(struct rte_hash_bucket), + RTE_CACHE_LINE_SIZE, params->socket_id); + + if (buckets == NULL) { + RTE_LOG(ERR, HASH, "memory allocation failed\n"); + goto err_unlock; + } + + const uint32_t key_entry_size = sizeof(struct rte_hash_key) + params->key_len; + const uint64_t key_tbl_size = (uint64_t) key_entry_size * 
num_key_slots; + + k = rte_zmalloc_socket(NULL, key_tbl_size, + RTE_CACHE_LINE_SIZE, params->socket_id); + + if (k == NULL) { + RTE_LOG(ERR, HASH, "memory allocation failed\n"); + goto err_unlock; + } + +/* + * If x86 architecture is used, select appropriate compare function, + * which may use x86 intrinsics, otherwise use memcmp + */ +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) + /* Select function to compare keys */ + switch (params->key_len) { + case 16: + h->cmp_jump_table_idx = KEY_16_BYTES; + break; + case 32: + h->cmp_jump_table_idx = KEY_32_BYTES; + break; + case 48: + h->cmp_jump_table_idx = KEY_48_BYTES; + break; + case 64: + h->cmp_jump_table_idx = KEY_64_BYTES; + break; + case 80: + h->cmp_jump_table_idx = KEY_80_BYTES; + break; + case 96: + h->cmp_jump_table_idx = KEY_96_BYTES; + break; + case 112: + h->cmp_jump_table_idx = KEY_112_BYTES; + break; + case 128: + h->cmp_jump_table_idx = KEY_128_BYTES; + break; + default: + /* If key is not multiple of 16, use generic memcmp */ + h->cmp_jump_table_idx = KEY_OTHER_BYTES; + } +#else + h->cmp_jump_table_idx = KEY_OTHER_BYTES; +#endif + + if (multi_writer_support) { + h->local_free_slots = rte_zmalloc_socket(NULL, + sizeof(struct lcore_cache) * RTE_MAX_LCORE, + RTE_CACHE_LINE_SIZE, params->socket_id); + } + + /* Default hash function */ +#if defined(RTE_ARCH_X86) + default_hash_func = (rte_hash_function)rte_hash_crc; +#elif defined(RTE_ARCH_ARM64) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_CRC32)) + default_hash_func = (rte_hash_function)rte_hash_crc; +#endif + /* Setup hash context */ + snprintf(h->name, sizeof(h->name), "%s", params->name); + h->entries = params->entries; + h->key_len = params->key_len; + h->key_entry_size = key_entry_size; + h->hash_func_init_val = params->hash_func_init_val; + + h->num_buckets = num_buckets; + h->bucket_bitmask = h->num_buckets - 1; + h->buckets = buckets; + h->hash_func = (params->hash_func == NULL) ? + default_hash_func : params->hash_func; + h->key_store = k; + h->free_slots = r; + h->hw_trans_mem_support = hw_trans_mem_support; + h->multi_writer_support = multi_writer_support; + h->readwrite_concur_support = readwrite_concur_support; + +#if defined(RTE_ARCH_X86) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) + h->sig_cmp_fn = RTE_HASH_COMPARE_AVX2; + else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2)) + h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; + else +#endif + h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; + + /* Turn on multi-writer only with explicit flag from user and TM + * support. + */ + if (h->multi_writer_support) { + h->readwrite_lock = rte_malloc(NULL, sizeof(rte_rwlock_t), + RTE_CACHE_LINE_SIZE); + if (h->readwrite_lock == NULL) + goto err_unlock; + + rte_rwlock_init(h->readwrite_lock); + } + + /* Populate free slots ring. Entry zero is reserved for key misses. 
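/*
 * A minimal recap sketch of the sizing arithmetic performed by
 * rte_hash_create() above (illustration only): the bucket count is
 * rounded up to a power of two so a bitmask can replace a modulo, and one
 * extra key slot (index 0) is reserved as the dummy "miss" entry. The
 * helper name is illustrative; RTE_HASH_BUCKET_ENTRIES and
 * LCORE_CACHE_SIZE come from rte_cuckoo_hash.h.
 */
#include <rte_common.h>
#include <rte_lcore.h>

static inline void
example_cuckoo_sizing(uint32_t entries, int multi_writer_support)
{
	uint32_t num_buckets = rte_align32pow2(entries) /
					RTE_HASH_BUCKET_ENTRIES;
	uint32_t bucket_bitmask = num_buckets - 1;	/* bucket = sig & mask */
	uint32_t num_key_slots = multi_writer_support ?
		entries + (RTE_MAX_LCORE - 1) * (LCORE_CACHE_SIZE - 1) + 1 :
		entries + 1;				/* +1 for the dummy slot */

	(void)bucket_bitmask;
	(void)num_key_slots;
}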
*/ + for (i = 1; i < num_key_slots; i++) + rte_ring_sp_enqueue(r, (void *)((uintptr_t) i)); + + te->data = (void *) h; + TAILQ_INSERT_TAIL(hash_list, te, next); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return h; +err_unlock: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); +err: + rte_ring_free(r); + rte_free(te); + rte_free(h); + rte_free(buckets); + rte_free(k); + return NULL; +} + +void +rte_hash_free(struct rte_hash *h) +{ + struct rte_tailq_entry *te; + struct rte_hash_list *hash_list; + + if (h == NULL) + return; + + hash_list = RTE_TAILQ_CAST(rte_hash_tailq.head, rte_hash_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find out tailq entry */ + TAILQ_FOREACH(te, hash_list, next) { + if (te->data == (void *) h) + break; + } + + if (te == NULL) { + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return; + } + + TAILQ_REMOVE(hash_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (h->multi_writer_support) { + rte_free(h->local_free_slots); + rte_free(h->readwrite_lock); + } + rte_ring_free(h->free_slots); + rte_free(h->key_store); + rte_free(h->buckets); + rte_free(h); + rte_free(te); +} + +hash_sig_t +rte_hash_hash(const struct rte_hash *h, const void *key) +{ + /* calc hash result by key */ + return h->hash_func(key, h->key_len, h->hash_func_init_val); +} + +/* Calc the secondary hash value from the primary hash value of a given key */ +static inline hash_sig_t +rte_hash_secondary_hash(const hash_sig_t primary_hash) +{ + static const unsigned all_bits_shift = 12; + static const unsigned alt_bits_xor = 0x5bd1e995; + + uint32_t tag = primary_hash >> all_bits_shift; + + return primary_hash ^ ((tag + 1) * alt_bits_xor); +} + +int32_t +rte_hash_count(const struct rte_hash *h) +{ + uint32_t tot_ring_cnt, cached_cnt = 0; + uint32_t i, ret; + + if (h == NULL) + return -EINVAL; + + if (h->multi_writer_support) { + tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) * + (LCORE_CACHE_SIZE - 1); + for (i = 0; i < RTE_MAX_LCORE; i++) + cached_cnt += h->local_free_slots[i].len; + + ret = tot_ring_cnt - rte_ring_count(h->free_slots) - + cached_cnt; + } else { + tot_ring_cnt = h->entries; + ret = tot_ring_cnt - rte_ring_count(h->free_slots); + } + return ret; +} + +/* Read write locks implemented using rte_rwlock */ +static inline void +__hash_rw_writer_lock(const struct rte_hash *h) +{ + if (h->multi_writer_support && h->hw_trans_mem_support) + rte_rwlock_write_lock_tm(h->readwrite_lock); + else if (h->multi_writer_support) + rte_rwlock_write_lock(h->readwrite_lock); +} + + +static inline void +__hash_rw_reader_lock(const struct rte_hash *h) +{ + if (h->readwrite_concur_support && h->hw_trans_mem_support) + rte_rwlock_read_lock_tm(h->readwrite_lock); + else if (h->readwrite_concur_support) + rte_rwlock_read_lock(h->readwrite_lock); +} + +static inline void +__hash_rw_writer_unlock(const struct rte_hash *h) +{ + if (h->multi_writer_support && h->hw_trans_mem_support) + rte_rwlock_write_unlock_tm(h->readwrite_lock); + else if (h->multi_writer_support) + rte_rwlock_write_unlock(h->readwrite_lock); +} + +static inline void +__hash_rw_reader_unlock(const struct rte_hash *h) +{ + if (h->readwrite_concur_support && h->hw_trans_mem_support) + rte_rwlock_read_unlock_tm(h->readwrite_lock); + else if (h->readwrite_concur_support) + rte_rwlock_read_unlock(h->readwrite_lock); +} + +void +rte_hash_reset(struct rte_hash *h) +{ + void *ptr; + uint32_t tot_ring_cnt, i; + + if (h == NULL) + return; + + __hash_rw_writer_lock(h); + memset(h->buckets, 0, h->num_buckets * 
sizeof(struct rte_hash_bucket)); + memset(h->key_store, 0, h->key_entry_size * (h->entries + 1)); + + /* clear the free ring */ + while (rte_ring_dequeue(h->free_slots, &ptr) == 0) + rte_pause(); + + /* Repopulate the free slots ring. Entry zero is reserved for key misses */ + if (h->multi_writer_support) + tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) * + (LCORE_CACHE_SIZE - 1); + else + tot_ring_cnt = h->entries; + + for (i = 1; i < tot_ring_cnt + 1; i++) + rte_ring_sp_enqueue(h->free_slots, (void *)((uintptr_t) i)); + + if (h->multi_writer_support) { + /* Reset local caches per lcore */ + for (i = 0; i < RTE_MAX_LCORE; i++) + h->local_free_slots[i].len = 0; + } + __hash_rw_writer_unlock(h); +} + +/* + * Function called to enqueue back an index in the cache/ring, + * as slot has not being used and it can be used in the + * next addition attempt. + */ +static inline void +enqueue_slot_back(const struct rte_hash *h, + struct lcore_cache *cached_free_slots, + void *slot_id) +{ + if (h->multi_writer_support) { + cached_free_slots->objs[cached_free_slots->len] = slot_id; + cached_free_slots->len++; + } else + rte_ring_sp_enqueue(h->free_slots, slot_id); +} + +/* Search a key from bucket and update its data */ +static inline int32_t +search_and_update(const struct rte_hash *h, void *data, const void *key, + struct rte_hash_bucket *bkt, hash_sig_t sig, hash_sig_t alt_hash) +{ + int i; + struct rte_hash_key *k, *keys = h->key_store; + + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + if (bkt->sig_current[i] == sig && + bkt->sig_alt[i] == alt_hash) { + k = (struct rte_hash_key *) ((char *)keys + + bkt->key_idx[i] * h->key_entry_size); + if (rte_hash_cmp_eq(key, k->key, h) == 0) { + /* Update data */ + k->pdata = data; + /* + * Return index where key is stored, + * subtracting the first dummy index + */ + return bkt->key_idx[i] - 1; + } + } + } + return -1; +} + +/* Only tries to insert at one bucket (@prim_bkt) without trying to push + * buckets around. + * return 1 if matching existing key, return 0 if succeeds, return -1 for no + * empty entry. + */ +static inline int32_t +rte_hash_cuckoo_insert_mw(const struct rte_hash *h, + struct rte_hash_bucket *prim_bkt, + struct rte_hash_bucket *sec_bkt, + const struct rte_hash_key *key, void *data, + hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx, + int32_t *ret_val) +{ + unsigned int i; + struct rte_hash_bucket *cur_bkt = prim_bkt; + int32_t ret; + + __hash_rw_writer_lock(h); + /* Check if key was inserted after last check but before this + * protected region in case of inserting duplicated keys. + */ + ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } + ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } + + /* Insert new entry if there is room in the primary + * bucket. 
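/*
 * A minimal usage sketch of the position convention used above: key slot
 * zero is a dummy, so the value returned to callers is key_idx - 1 and
 * can index a caller-owned array directly. The struct, array size and
 * function names here are illustrative only.
 */
#include <stdint.h>
#include <rte_hash.h>

struct example_flow_stats {
	uint64_t pkts;
};

/* one statistics record per possible table position */
static struct example_flow_stats example_stats[1 << 10];

static inline void
example_count_packet(const struct rte_hash *h, const void *flow_key)
{
	int32_t pos = rte_hash_lookup(h, flow_key);

	if (pos >= 0)
		example_stats[pos].pkts++; /* position is stable for the key */
}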
+ */ + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + /* Check if slot is available */ + if (likely(prim_bkt->key_idx[i] == EMPTY_SLOT)) { + prim_bkt->sig_current[i] = sig; + prim_bkt->sig_alt[i] = alt_hash; + prim_bkt->key_idx[i] = new_idx; + break; + } + } + __hash_rw_writer_unlock(h); + + if (i != RTE_HASH_BUCKET_ENTRIES) + return 0; + + /* no empty entry */ + return -1; +} + +/* Shift buckets along provided cuckoo_path (@leaf and @leaf_slot) and fill + * the path head with new entry (sig, alt_hash, new_idx) + * return 1 if matched key found, return -1 if cuckoo path invalided and fail, + * return 0 if succeeds. + */ +static inline int +rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h, + struct rte_hash_bucket *bkt, + struct rte_hash_bucket *alt_bkt, + const struct rte_hash_key *key, void *data, + struct queue_node *leaf, uint32_t leaf_slot, + hash_sig_t sig, hash_sig_t alt_hash, uint32_t new_idx, + int32_t *ret_val) +{ + uint32_t prev_alt_bkt_idx; + struct rte_hash_bucket *cur_bkt = bkt; + struct queue_node *prev_node, *curr_node = leaf; + struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt; + uint32_t prev_slot, curr_slot = leaf_slot; + int32_t ret; + + __hash_rw_writer_lock(h); + + /* In case empty slot was gone before entering protected region */ + if (curr_bkt->key_idx[curr_slot] != EMPTY_SLOT) { + __hash_rw_writer_unlock(h); + return -1; + } + + /* Check if key was inserted after last check but before this + * protected region. + */ + ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } + + ret = search_and_update(h, data, key, alt_bkt, alt_hash, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + *ret_val = ret; + return 1; + } + + while (likely(curr_node->prev != NULL)) { + prev_node = curr_node->prev; + prev_bkt = prev_node->bkt; + prev_slot = curr_node->prev_slot; + + prev_alt_bkt_idx = + prev_bkt->sig_alt[prev_slot] & h->bucket_bitmask; + + if (unlikely(&h->buckets[prev_alt_bkt_idx] + != curr_bkt)) { + /* revert it to empty, otherwise duplicated keys */ + curr_bkt->key_idx[curr_slot] = EMPTY_SLOT; + __hash_rw_writer_unlock(h); + return -1; + } + + /* Need to swap current/alt sig to allow later + * Cuckoo insert to move elements back to its + * primary bucket if available + */ + curr_bkt->sig_alt[curr_slot] = + prev_bkt->sig_current[prev_slot]; + curr_bkt->sig_current[curr_slot] = + prev_bkt->sig_alt[prev_slot]; + curr_bkt->key_idx[curr_slot] = + prev_bkt->key_idx[prev_slot]; + + curr_slot = prev_slot; + curr_node = prev_node; + curr_bkt = curr_node->bkt; + } + + curr_bkt->sig_current[curr_slot] = sig; + curr_bkt->sig_alt[curr_slot] = alt_hash; + curr_bkt->key_idx[curr_slot] = new_idx; + + __hash_rw_writer_unlock(h); + + return 0; + +} + +/* + * Make space for new key, using bfs Cuckoo Search and Multi-Writer safe + * Cuckoo + */ +static inline int +rte_hash_cuckoo_make_space_mw(const struct rte_hash *h, + struct rte_hash_bucket *bkt, + struct rte_hash_bucket *sec_bkt, + const struct rte_hash_key *key, void *data, + hash_sig_t sig, hash_sig_t alt_hash, + uint32_t new_idx, int32_t *ret_val) +{ + unsigned int i; + struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN]; + struct queue_node *tail, *head; + struct rte_hash_bucket *curr_bkt, *alt_bkt; + + tail = queue; + head = queue + 1; + tail->bkt = bkt; + tail->prev = NULL; + tail->prev_slot = -1; + + /* Cuckoo bfs Search */ + while (likely(tail != head && head < + queue + RTE_HASH_BFS_QUEUE_MAX_LEN - + 
RTE_HASH_BUCKET_ENTRIES)) { + curr_bkt = tail->bkt; + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + if (curr_bkt->key_idx[i] == EMPTY_SLOT) { + int32_t ret = rte_hash_cuckoo_move_insert_mw(h, + bkt, sec_bkt, key, data, + tail, i, sig, alt_hash, + new_idx, ret_val); + if (likely(ret != -1)) + return ret; + } + + /* Enqueue new node and keep prev node info */ + alt_bkt = &(h->buckets[curr_bkt->sig_alt[i] + & h->bucket_bitmask]); + head->bkt = alt_bkt; + head->prev = tail; + head->prev_slot = i; + head++; + } + tail++; + } + + return -ENOSPC; +} + +static inline int32_t +__rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, + hash_sig_t sig, void *data) +{ + hash_sig_t alt_hash; + uint32_t prim_bucket_idx, sec_bucket_idx; + struct rte_hash_bucket *prim_bkt, *sec_bkt; + struct rte_hash_key *new_k, *keys = h->key_store; + void *slot_id = NULL; + uint32_t new_idx; + int ret; + unsigned n_slots; + unsigned lcore_id; + struct lcore_cache *cached_free_slots = NULL; + int32_t ret_val; + + prim_bucket_idx = sig & h->bucket_bitmask; + prim_bkt = &h->buckets[prim_bucket_idx]; + rte_prefetch0(prim_bkt); + + alt_hash = rte_hash_secondary_hash(sig); + sec_bucket_idx = alt_hash & h->bucket_bitmask; + sec_bkt = &h->buckets[sec_bucket_idx]; + rte_prefetch0(sec_bkt); + + /* Check if key is already inserted in primary location */ + __hash_rw_writer_lock(h); + ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } + + /* Check if key is already inserted in secondary location */ + ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } + __hash_rw_writer_unlock(h); + + /* Did not find a match, so get a new slot for storing the new key */ + if (h->multi_writer_support) { + lcore_id = rte_lcore_id(); + cached_free_slots = &h->local_free_slots[lcore_id]; + /* Try to get a free slot from the local cache */ + if (cached_free_slots->len == 0) { + /* Need to get another burst of free slots from global ring */ + n_slots = rte_ring_mc_dequeue_burst(h->free_slots, + cached_free_slots->objs, + LCORE_CACHE_SIZE, NULL); + if (n_slots == 0) { + return -ENOSPC; + } + + cached_free_slots->len += n_slots; + } + + /* Get a free slot from the local cache */ + cached_free_slots->len--; + slot_id = cached_free_slots->objs[cached_free_slots->len]; + } else { + if (rte_ring_sc_dequeue(h->free_slots, &slot_id) != 0) { + return -ENOSPC; + } + } + + new_k = RTE_PTR_ADD(keys, (uintptr_t)slot_id * h->key_entry_size); + new_idx = (uint32_t)((uintptr_t) slot_id); + /* Copy key */ + rte_memcpy(new_k->key, key, h->key_len); + new_k->pdata = data; + + + /* Find an empty slot and insert */ + ret = rte_hash_cuckoo_insert_mw(h, prim_bkt, sec_bkt, key, data, + sig, alt_hash, new_idx, &ret_val); + if (ret == 0) + return new_idx - 1; + else if (ret == 1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + return ret_val; + } + + /* Primary bucket full, need to make space for new entry */ + ret = rte_hash_cuckoo_make_space_mw(h, prim_bkt, sec_bkt, key, data, + sig, alt_hash, new_idx, &ret_val); + if (ret == 0) + return new_idx - 1; + else if (ret == 1) { + enqueue_slot_back(h, cached_free_slots, slot_id); + return ret_val; + } + + /* Also search secondary bucket to get better occupancy */ + ret = rte_hash_cuckoo_make_space_mw(h, sec_bkt, prim_bkt, key, data, + alt_hash, sig, new_idx, &ret_val); + + if (ret == 0) + return new_idx - 1; + else if (ret == 1) { + enqueue_slot_back(h, 
cached_free_slots, slot_id); + return ret_val; + } else { + enqueue_slot_back(h, cached_free_slots, slot_id); + return ret; + } +} + +int32_t +rte_hash_add_key_with_hash(const struct rte_hash *h, + const void *key, hash_sig_t sig) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + return __rte_hash_add_key_with_hash(h, key, sig, 0); +} + +int32_t +rte_hash_add_key(const struct rte_hash *h, const void *key) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + return __rte_hash_add_key_with_hash(h, key, rte_hash_hash(h, key), 0); +} + +int +rte_hash_add_key_with_hash_data(const struct rte_hash *h, + const void *key, hash_sig_t sig, void *data) +{ + int ret; + + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + ret = __rte_hash_add_key_with_hash(h, key, sig, data); + if (ret >= 0) + return 0; + else + return ret; +} + +int +rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data) +{ + int ret; + + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + + ret = __rte_hash_add_key_with_hash(h, key, rte_hash_hash(h, key), data); + if (ret >= 0) + return 0; + else + return ret; +} + +/* Search one bucket to find the match key */ +static inline int32_t +search_one_bucket(const struct rte_hash *h, const void *key, hash_sig_t sig, + void **data, const struct rte_hash_bucket *bkt) +{ + int i; + struct rte_hash_key *k, *keys = h->key_store; + + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + if (bkt->sig_current[i] == sig && + bkt->key_idx[i] != EMPTY_SLOT) { + k = (struct rte_hash_key *) ((char *)keys + + bkt->key_idx[i] * h->key_entry_size); + if (rte_hash_cmp_eq(key, k->key, h) == 0) { + if (data != NULL) + *data = k->pdata; + /* + * Return index where key is stored, + * subtracting the first dummy index + */ + return bkt->key_idx[i] - 1; + } + } + } + return -1; +} + +static inline int32_t +__rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key, + hash_sig_t sig, void **data) +{ + uint32_t bucket_idx; + hash_sig_t alt_hash; + struct rte_hash_bucket *bkt; + int ret; + + bucket_idx = sig & h->bucket_bitmask; + bkt = &h->buckets[bucket_idx]; + + __hash_rw_reader_lock(h); + + /* Check if key is in primary location */ + ret = search_one_bucket(h, key, sig, data, bkt); + if (ret != -1) { + __hash_rw_reader_unlock(h); + return ret; + } + /* Calculate secondary hash */ + alt_hash = rte_hash_secondary_hash(sig); + bucket_idx = alt_hash & h->bucket_bitmask; + bkt = &h->buckets[bucket_idx]; + + /* Check if key is in secondary location */ + ret = search_one_bucket(h, key, alt_hash, data, bkt); + if (ret != -1) { + __hash_rw_reader_unlock(h); + return ret; + } + __hash_rw_reader_unlock(h); + return -ENOENT; +} + +int32_t +rte_hash_lookup_with_hash(const struct rte_hash *h, + const void *key, hash_sig_t sig) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + return __rte_hash_lookup_with_hash(h, key, sig, NULL); +} + +int32_t +rte_hash_lookup(const struct rte_hash *h, const void *key) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + return __rte_hash_lookup_with_hash(h, key, rte_hash_hash(h, key), NULL); +} + +int +rte_hash_lookup_with_hash_data(const struct rte_hash *h, + const void *key, hash_sig_t sig, void **data) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + return __rte_hash_lookup_with_hash(h, key, sig, data); +} + +int +rte_hash_lookup_data(const struct rte_hash *h, const void *key, void **data) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + return 
__rte_hash_lookup_with_hash(h, key, rte_hash_hash(h, key), data); +} + +static inline void +remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i) +{ + unsigned lcore_id, n_slots; + struct lcore_cache *cached_free_slots; + + bkt->sig_current[i] = NULL_SIGNATURE; + bkt->sig_alt[i] = NULL_SIGNATURE; + if (h->multi_writer_support) { + lcore_id = rte_lcore_id(); + cached_free_slots = &h->local_free_slots[lcore_id]; + /* Cache full, need to free it. */ + if (cached_free_slots->len == LCORE_CACHE_SIZE) { + /* Need to enqueue the free slots in global ring. */ + n_slots = rte_ring_mp_enqueue_burst(h->free_slots, + cached_free_slots->objs, + LCORE_CACHE_SIZE, NULL); + cached_free_slots->len -= n_slots; + } + /* Put index of new free slot in cache. */ + cached_free_slots->objs[cached_free_slots->len] = + (void *)((uintptr_t)bkt->key_idx[i]); + cached_free_slots->len++; + } else { + rte_ring_sp_enqueue(h->free_slots, + (void *)((uintptr_t)bkt->key_idx[i])); + } +} + +/* Search one bucket and remove the matched key */ +static inline int32_t +search_and_remove(const struct rte_hash *h, const void *key, + struct rte_hash_bucket *bkt, hash_sig_t sig) +{ + struct rte_hash_key *k, *keys = h->key_store; + unsigned int i; + int32_t ret; + + /* Check if key is in primary location */ + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + if (bkt->sig_current[i] == sig && + bkt->key_idx[i] != EMPTY_SLOT) { + k = (struct rte_hash_key *) ((char *)keys + + bkt->key_idx[i] * h->key_entry_size); + if (rte_hash_cmp_eq(key, k->key, h) == 0) { + remove_entry(h, bkt, i); + + /* + * Return index where key is stored, + * subtracting the first dummy index + */ + ret = bkt->key_idx[i] - 1; + bkt->key_idx[i] = EMPTY_SLOT; + return ret; + } + } + } + return -1; +} + +static inline int32_t +__rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, + hash_sig_t sig) +{ + uint32_t bucket_idx; + hash_sig_t alt_hash; + struct rte_hash_bucket *bkt; + int32_t ret; + + bucket_idx = sig & h->bucket_bitmask; + bkt = &h->buckets[bucket_idx]; + + __hash_rw_writer_lock(h); + /* look for key in primary bucket */ + ret = search_and_remove(h, key, bkt, sig); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } + + /* Calculate secondary hash */ + alt_hash = rte_hash_secondary_hash(sig); + bucket_idx = alt_hash & h->bucket_bitmask; + bkt = &h->buckets[bucket_idx]; + + /* look for key in secondary bucket */ + ret = search_and_remove(h, key, bkt, alt_hash); + if (ret != -1) { + __hash_rw_writer_unlock(h); + return ret; + } + + __hash_rw_writer_unlock(h); + return -ENOENT; +} + +int32_t +rte_hash_del_key_with_hash(const struct rte_hash *h, + const void *key, hash_sig_t sig) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + return __rte_hash_del_key_with_hash(h, key, sig); +} + +int32_t +rte_hash_del_key(const struct rte_hash *h, const void *key) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + return __rte_hash_del_key_with_hash(h, key, rte_hash_hash(h, key)); +} + +int +rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, + void **key) +{ + RETURN_IF_TRUE(((h == NULL) || (key == NULL)), -EINVAL); + + struct rte_hash_key *k, *keys = h->key_store; + k = (struct rte_hash_key *) ((char *) keys + (position + 1) * + h->key_entry_size); + *key = k->key; + + if (position != + __rte_hash_lookup_with_hash(h, *key, rte_hash_hash(h, *key), + NULL)) { + return -ENOENT; + } + + return 0; +} + +static inline void +compare_signatures(uint32_t 
*prim_hash_matches, uint32_t *sec_hash_matches, + const struct rte_hash_bucket *prim_bkt, + const struct rte_hash_bucket *sec_bkt, + hash_sig_t prim_hash, hash_sig_t sec_hash, + enum rte_hash_sig_compare_function sig_cmp_fn) +{ + unsigned int i; + + switch (sig_cmp_fn) { +#ifdef RTE_MACHINE_CPUFLAG_AVX2 + case RTE_HASH_COMPARE_AVX2: + *prim_hash_matches = _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32( + _mm256_load_si256( + (__m256i const *)prim_bkt->sig_current), + _mm256_set1_epi32(prim_hash))); + *sec_hash_matches = _mm256_movemask_ps((__m256)_mm256_cmpeq_epi32( + _mm256_load_si256( + (__m256i const *)sec_bkt->sig_current), + _mm256_set1_epi32(sec_hash))); + break; +#endif +#ifdef RTE_MACHINE_CPUFLAG_SSE2 + case RTE_HASH_COMPARE_SSE: + /* Compare the first 4 signatures in the bucket */ + *prim_hash_matches = _mm_movemask_ps((__m128)_mm_cmpeq_epi16( + _mm_load_si128( + (__m128i const *)prim_bkt->sig_current), + _mm_set1_epi32(prim_hash))); + *prim_hash_matches |= (_mm_movemask_ps((__m128)_mm_cmpeq_epi16( + _mm_load_si128( + (__m128i const *)&prim_bkt->sig_current[4]), + _mm_set1_epi32(prim_hash)))) << 4; + /* Compare the first 4 signatures in the bucket */ + *sec_hash_matches = _mm_movemask_ps((__m128)_mm_cmpeq_epi16( + _mm_load_si128( + (__m128i const *)sec_bkt->sig_current), + _mm_set1_epi32(sec_hash))); + *sec_hash_matches |= (_mm_movemask_ps((__m128)_mm_cmpeq_epi16( + _mm_load_si128( + (__m128i const *)&sec_bkt->sig_current[4]), + _mm_set1_epi32(sec_hash)))) << 4; + break; +#endif + default: + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + *prim_hash_matches |= + ((prim_hash == prim_bkt->sig_current[i]) << i); + *sec_hash_matches |= + ((sec_hash == sec_bkt->sig_current[i]) << i); + } + } + +} + +#define PREFETCH_OFFSET 4 +static inline void +__rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, + int32_t num_keys, int32_t *positions, + uint64_t *hit_mask, void *data[]) +{ + uint64_t hits = 0; + int32_t i; + uint32_t prim_hash[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t sec_hash[RTE_HASH_LOOKUP_BULK_MAX]; + const struct rte_hash_bucket *primary_bkt[RTE_HASH_LOOKUP_BULK_MAX]; + const struct rte_hash_bucket *secondary_bkt[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; + uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; + + /* Prefetch first keys */ + for (i = 0; i < PREFETCH_OFFSET && i < num_keys; i++) + rte_prefetch0(keys[i]); + + /* + * Prefetch rest of the keys, calculate primary and + * secondary bucket and prefetch them + */ + for (i = 0; i < (num_keys - PREFETCH_OFFSET); i++) { + rte_prefetch0(keys[i + PREFETCH_OFFSET]); + + prim_hash[i] = rte_hash_hash(h, keys[i]); + sec_hash[i] = rte_hash_secondary_hash(prim_hash[i]); + + primary_bkt[i] = &h->buckets[prim_hash[i] & h->bucket_bitmask]; + secondary_bkt[i] = &h->buckets[sec_hash[i] & h->bucket_bitmask]; + + rte_prefetch0(primary_bkt[i]); + rte_prefetch0(secondary_bkt[i]); + } + + /* Calculate and prefetch rest of the buckets */ + for (; i < num_keys; i++) { + prim_hash[i] = rte_hash_hash(h, keys[i]); + sec_hash[i] = rte_hash_secondary_hash(prim_hash[i]); + + primary_bkt[i] = &h->buckets[prim_hash[i] & h->bucket_bitmask]; + secondary_bkt[i] = &h->buckets[sec_hash[i] & h->bucket_bitmask]; + + rte_prefetch0(primary_bkt[i]); + rte_prefetch0(secondary_bkt[i]); + } + + __hash_rw_reader_lock(h); + /* Compare signatures and prefetch key slot of first hit */ + for (i = 0; i < num_keys; i++) { + compare_signatures(&prim_hitmask[i], &sec_hitmask[i], + primary_bkt[i], secondary_bkt[i], + 
prim_hash[i], sec_hash[i], h->sig_cmp_fn); + + if (prim_hitmask[i]) { + uint32_t first_hit = __builtin_ctzl(prim_hitmask[i]); + uint32_t key_idx = primary_bkt[i]->key_idx[first_hit]; + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + rte_prefetch0(key_slot); + continue; + } + + if (sec_hitmask[i]) { + uint32_t first_hit = __builtin_ctzl(sec_hitmask[i]); + uint32_t key_idx = secondary_bkt[i]->key_idx[first_hit]; + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + rte_prefetch0(key_slot); + } + } + + /* Compare keys, first hits in primary first */ + for (i = 0; i < num_keys; i++) { + positions[i] = -ENOENT; + while (prim_hitmask[i]) { + uint32_t hit_index = __builtin_ctzl(prim_hitmask[i]); + + uint32_t key_idx = primary_bkt[i]->key_idx[hit_index]; + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + /* + * If key index is 0, do not compare key, + * as it is checking the dummy slot + */ + if (!!key_idx & !rte_hash_cmp_eq(key_slot->key, keys[i], h)) { + if (data != NULL) + data[i] = key_slot->pdata; + + hits |= 1ULL << i; + positions[i] = key_idx - 1; + goto next_key; + } + prim_hitmask[i] &= ~(1 << (hit_index)); + } + + while (sec_hitmask[i]) { + uint32_t hit_index = __builtin_ctzl(sec_hitmask[i]); + + uint32_t key_idx = secondary_bkt[i]->key_idx[hit_index]; + const struct rte_hash_key *key_slot = + (const struct rte_hash_key *)( + (const char *)h->key_store + + key_idx * h->key_entry_size); + /* + * If key index is 0, do not compare key, + * as it is checking the dummy slot + */ + + if (!!key_idx & !rte_hash_cmp_eq(key_slot->key, keys[i], h)) { + if (data != NULL) + data[i] = key_slot->pdata; + + hits |= 1ULL << i; + positions[i] = key_idx - 1; + goto next_key; + } + sec_hitmask[i] &= ~(1 << (hit_index)); + } + +next_key: + continue; + } + + __hash_rw_reader_unlock(h); + + if (hit_mask != NULL) + *hit_mask = hits; +} + +int +rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, + uint32_t num_keys, int32_t *positions) +{ + RETURN_IF_TRUE(((h == NULL) || (keys == NULL) || (num_keys == 0) || + (num_keys > RTE_HASH_LOOKUP_BULK_MAX) || + (positions == NULL)), -EINVAL); + + __rte_hash_lookup_bulk(h, keys, num_keys, positions, NULL, NULL); + return 0; +} + +int +rte_hash_lookup_bulk_data(const struct rte_hash *h, const void **keys, + uint32_t num_keys, uint64_t *hit_mask, void *data[]) +{ + RETURN_IF_TRUE(((h == NULL) || (keys == NULL) || (num_keys == 0) || + (num_keys > RTE_HASH_LOOKUP_BULK_MAX) || + (hit_mask == NULL)), -EINVAL); + + int32_t positions[num_keys]; + + __rte_hash_lookup_bulk(h, keys, num_keys, positions, hit_mask, data); + + /* Return number of hits */ + return __builtin_popcountl(*hit_mask); +} + +int32_t +rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32_t *next) +{ + uint32_t bucket_idx, idx, position; + struct rte_hash_key *next_key; + + RETURN_IF_TRUE(((h == NULL) || (next == NULL)), -EINVAL); + + const uint32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES; + /* Out of bounds */ + if (*next >= total_entries) + return -ENOENT; + + /* Calculate bucket and index of current iterator */ + bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES; + idx = *next % RTE_HASH_BUCKET_ENTRIES; + + /* If current position is empty, go to the next one */ + while 
(h->buckets[bucket_idx].key_idx[idx] == EMPTY_SLOT) { + (*next)++; + /* End of table */ + if (*next == total_entries) + return -ENOENT; + bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES; + idx = *next % RTE_HASH_BUCKET_ENTRIES; + } + __hash_rw_reader_lock(h); + /* Get position of entry in key table */ + position = h->buckets[bucket_idx].key_idx[idx]; + next_key = (struct rte_hash_key *) ((char *)h->key_store + + position * h->key_entry_size); + /* Return key and data */ + *key = next_key->key; + *data = next_key->pdata; + + __hash_rw_reader_unlock(h); + + /* Increment iterator */ + (*next)++; + + return position - 1; +} diff --git a/src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.h b/src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.h new file mode 100644 index 00000000..b43f467d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_cuckoo_hash.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Intel Corporation + */ + +/* rte_cuckoo_hash.h + * This file hold Cuckoo Hash private data structures to allows include from + * platform specific files like rte_cuckoo_hash_x86.h + */ + +#ifndef _RTE_CUCKOO_HASH_H_ +#define _RTE_CUCKOO_HASH_H_ + +#if defined(RTE_ARCH_X86) +#include "rte_cmp_x86.h" +#endif + +#if defined(RTE_ARCH_ARM64) +#include "rte_cmp_arm64.h" +#endif + +/* Macro to enable/disable run-time checking of function parameters */ +#if defined(RTE_LIBRTE_HASH_DEBUG) +#define RETURN_IF_TRUE(cond, retval) do { \ + if (cond) \ + return retval; \ +} while (0) +#else +#define RETURN_IF_TRUE(cond, retval) +#endif + +#include +#include + +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) +/* + * All different options to select a key compare function, + * based on the key size and custom function. + */ +enum cmp_jump_table_case { + KEY_CUSTOM = 0, + KEY_16_BYTES, + KEY_32_BYTES, + KEY_48_BYTES, + KEY_64_BYTES, + KEY_80_BYTES, + KEY_96_BYTES, + KEY_112_BYTES, + KEY_128_BYTES, + KEY_OTHER_BYTES, + NUM_KEY_CMP_CASES, +}; + +/* + * Table storing all different key compare functions + * (multi-process supported) + */ +const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { + NULL, + rte_hash_k16_cmp_eq, + rte_hash_k32_cmp_eq, + rte_hash_k48_cmp_eq, + rte_hash_k64_cmp_eq, + rte_hash_k80_cmp_eq, + rte_hash_k96_cmp_eq, + rte_hash_k112_cmp_eq, + rte_hash_k128_cmp_eq, + memcmp +}; +#else +/* + * All different options to select a key compare function, + * based on the key size and custom function. + */ +enum cmp_jump_table_case { + KEY_CUSTOM = 0, + KEY_OTHER_BYTES, + NUM_KEY_CMP_CASES, +}; + +/* + * Table storing all different key compare functions + * (multi-process supported) + */ +const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = { + NULL, + memcmp +}; + +#endif + + +/** Number of items per bucket. 
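/*
 * A minimal sketch of walking a table with rte_hash_iterate() as
 * implemented above: 'next' starts at zero and the call returns a
 * negative value once every occupied slot has been visited. The function
 * name is illustrative.
 */
#include <rte_hash.h>

static inline uint32_t
example_walk_table(const struct rte_hash *h)
{
	const void *key;
	void *data;
	uint32_t next = 0;
	uint32_t visited = 0;

	while (rte_hash_iterate(h, &key, &data, &next) >= 0)
		visited++;	/* key/data point into the table's key store */

	return visited;
}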
*/ +#define RTE_HASH_BUCKET_ENTRIES 8 + +#if !RTE_IS_POWER_OF_2(RTE_HASH_BUCKET_ENTRIES) +#error RTE_HASH_BUCKET_ENTRIES must be a power of 2 +#endif + +#define NULL_SIGNATURE 0 + +#define EMPTY_SLOT 0 + +#define KEY_ALIGNMENT 16 + +#define LCORE_CACHE_SIZE 64 + +#define RTE_HASH_MAX_PUSHES 100 + +#define RTE_HASH_BFS_QUEUE_MAX_LEN 1000 + +#define RTE_XABORT_CUCKOO_PATH_INVALIDED 0x4 + +#define RTE_HASH_TSX_MAX_RETRY 10 + +struct lcore_cache { + unsigned len; /**< Cache len */ + void *objs[LCORE_CACHE_SIZE]; /**< Cache objects */ +} __rte_cache_aligned; + +/* Structure that stores key-value pair */ +struct rte_hash_key { + union { + uintptr_t idata; + void *pdata; + }; + /* Variable key size */ + char key[0]; +} __attribute__((aligned(KEY_ALIGNMENT))); + +/* All different signature compare functions */ +enum rte_hash_sig_compare_function { + RTE_HASH_COMPARE_SCALAR = 0, + RTE_HASH_COMPARE_SSE, + RTE_HASH_COMPARE_AVX2, + RTE_HASH_COMPARE_NUM +}; + +/** Bucket structure */ +struct rte_hash_bucket { + hash_sig_t sig_current[RTE_HASH_BUCKET_ENTRIES]; + + uint32_t key_idx[RTE_HASH_BUCKET_ENTRIES]; + + hash_sig_t sig_alt[RTE_HASH_BUCKET_ENTRIES]; + + uint8_t flag[RTE_HASH_BUCKET_ENTRIES]; +} __rte_cache_aligned; + +/** A hash table structure. */ +struct rte_hash { + char name[RTE_HASH_NAMESIZE]; /**< Name of the hash. */ + uint32_t entries; /**< Total table entries. */ + uint32_t num_buckets; /**< Number of buckets in table. */ + + struct rte_ring *free_slots; + /**< Ring that stores all indexes of the free slots in the key table */ + + struct lcore_cache *local_free_slots; + /**< Local cache per lcore, storing some indexes of the free slots */ + + /* Fields used in lookup */ + + uint32_t key_len __rte_cache_aligned; + /**< Length of hash key. */ + uint8_t hw_trans_mem_support; + /**< If hardware transactional memory is used. */ + uint8_t multi_writer_support; + /**< If multi-writer support is enabled. */ + uint8_t readwrite_concur_support; + /**< If read-write concurrency support is enabled */ + rte_hash_function hash_func; /**< Function used to calculate hash. */ + uint32_t hash_func_init_val; /**< Init value used by hash_func. */ + rte_hash_cmp_eq_t rte_hash_custom_cmp_eq; + /**< Custom function used to compare keys. */ + enum cmp_jump_table_case cmp_jump_table_idx; + /**< Indicates which compare function to use. */ + enum rte_hash_sig_compare_function sig_cmp_fn; + /**< Indicates which signature compare function to use. */ + uint32_t bucket_bitmask; + /**< Bitmask for getting bucket index from hash signature. */ + uint32_t key_entry_size; /**< Size of each key entry. */ + + void *key_store; /**< Table storing all keys and data */ + struct rte_hash_bucket *buckets; + /**< Table with buckets storing all the hash values and key indexes + * to the key table. + */ + rte_rwlock_t *readwrite_lock; /**< Read-write lock thread-safety. 
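/*
 * An illustrative sketch of how one 32-bit signature selects the two
 * candidate buckets used by the structures above: the primary index is a
 * mask of the signature, and the alternative index reuses the secondary
 * hash mix from rte_cuckoo_hash.c (all_bits_shift = 12,
 * alt_bits_xor = 0x5bd1e995). The helper names are illustrative.
 */
#include <stdint.h>

static inline uint32_t
example_primary_bucket(uint32_t sig, uint32_t bucket_bitmask)
{
	return sig & bucket_bitmask;
}

static inline uint32_t
example_alternative_bucket(uint32_t sig, uint32_t bucket_bitmask)
{
	const uint32_t tag = sig >> 12;

	return (sig ^ ((tag + 1) * 0x5bd1e995)) & bucket_bitmask;
}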
*/ +} __rte_cache_aligned; + +struct queue_node { + struct rte_hash_bucket *bkt; /* Current bucket on the bfs search */ + + struct queue_node *prev; /* Parent(bucket) in search path */ + int prev_slot; /* Parent(slot) in search path */ +}; + +#endif diff --git a/src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.c b/src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.c new file mode 100644 index 00000000..c9b470d7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.c @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_fbk_hash.h" + +TAILQ_HEAD(rte_fbk_hash_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_fbk_hash_tailq = { + .name = "RTE_FBK_HASH", +}; +EAL_REGISTER_TAILQ(rte_fbk_hash_tailq) + +/** + * Performs a lookup for an existing hash table, and returns a pointer to + * the table if found. + * + * @param name + * Name of the hash table to find + * + * @return + * pointer to hash table structure or NULL on error. + */ +struct rte_fbk_hash_table * +rte_fbk_hash_find_existing(const char *name) +{ + struct rte_fbk_hash_table *h = NULL; + struct rte_tailq_entry *te; + struct rte_fbk_hash_list *fbk_hash_list; + + fbk_hash_list = RTE_TAILQ_CAST(rte_fbk_hash_tailq.head, + rte_fbk_hash_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, fbk_hash_list, next) { + h = (struct rte_fbk_hash_table *) te->data; + if (strncmp(name, h->name, RTE_FBK_HASH_NAMESIZE) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + return h; +} + +/** + * Create a new hash table for use with four byte keys. + * + * @param params + * Parameters used in creation of hash table. + * + * @return + * Pointer to hash table structure that is used in future hash table + * operations, or NULL on error. + */ +struct rte_fbk_hash_table * +rte_fbk_hash_create(const struct rte_fbk_hash_params *params) +{ + struct rte_fbk_hash_table *ht = NULL; + struct rte_tailq_entry *te; + char hash_name[RTE_FBK_HASH_NAMESIZE]; + const uint32_t mem_size = + sizeof(*ht) + (sizeof(ht->t[0]) * params->entries); + uint32_t i; + struct rte_fbk_hash_list *fbk_hash_list; + rte_fbk_hash_fn default_hash_func = (rte_fbk_hash_fn)rte_jhash_1word; + + fbk_hash_list = RTE_TAILQ_CAST(rte_fbk_hash_tailq.head, + rte_fbk_hash_list); + + /* Error checking of parameters. 
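/*
 * An illustrative recap of the allocation size computed just above in
 * rte_fbk_hash_create(): the entry array 't[]' is a flexible array
 * member, so the table header and all bucket entries come from a single
 * allocation. The helper name is illustrative.
 */
#include <stdint.h>
#include <rte_fbk_hash.h>

static inline uint32_t
example_fbk_mem_size(uint32_t entries)
{
	/* each entry is one 64-bit word: key | value | is_entry */
	return sizeof(struct rte_fbk_hash_table) +
		sizeof(union rte_fbk_hash_entry) * entries;
}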
*/ + if ((!rte_is_power_of_2(params->entries)) || + (!rte_is_power_of_2(params->entries_per_bucket)) || + (params->entries == 0) || + (params->entries_per_bucket == 0) || + (params->entries_per_bucket > params->entries) || + (params->entries > RTE_FBK_HASH_ENTRIES_MAX) || + (params->entries_per_bucket > RTE_FBK_HASH_ENTRIES_PER_BUCKET_MAX)){ + rte_errno = EINVAL; + return NULL; + } + + snprintf(hash_name, sizeof(hash_name), "FBK_%s", params->name); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* guarantee there's no existing */ + TAILQ_FOREACH(te, fbk_hash_list, next) { + ht = (struct rte_fbk_hash_table *) te->data; + if (strncmp(params->name, ht->name, RTE_FBK_HASH_NAMESIZE) == 0) + break; + } + ht = NULL; + if (te != NULL) { + rte_errno = EEXIST; + goto exit; + } + + te = rte_zmalloc("FBK_HASH_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, HASH, "Failed to allocate tailq entry\n"); + goto exit; + } + + /* Allocate memory for table. */ + ht = rte_zmalloc_socket(hash_name, mem_size, + 0, params->socket_id); + if (ht == NULL) { + RTE_LOG(ERR, HASH, "Failed to allocate fbk hash table\n"); + rte_free(te); + goto exit; + } + + /* Default hash function */ +#if defined(RTE_ARCH_X86) + default_hash_func = (rte_fbk_hash_fn)rte_hash_crc_4byte; +#elif defined(RTE_ARCH_ARM64) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_CRC32)) + default_hash_func = (rte_fbk_hash_fn)rte_hash_crc_4byte; +#endif + + /* Set up hash table context. */ + snprintf(ht->name, sizeof(ht->name), "%s", params->name); + ht->entries = params->entries; + ht->entries_per_bucket = params->entries_per_bucket; + ht->used_entries = 0; + ht->bucket_mask = (params->entries / params->entries_per_bucket) - 1; + for (ht->bucket_shift = 0, i = 1; + (params->entries_per_bucket & i) == 0; + ht->bucket_shift++, i <<= 1) + ; /* empty loop body */ + + if (params->hash_func != NULL) { + ht->hash_func = params->hash_func; + ht->init_val = params->init_val; + } + else { + ht->hash_func = default_hash_func; + ht->init_val = RTE_FBK_HASH_INIT_VAL_DEFAULT; + } + + te->data = (void *) ht; + + TAILQ_INSERT_TAIL(fbk_hash_list, te, next); + +exit: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return ht; +} + +/** + * Free all memory used by a hash table. + * + * @param ht + * Hash table to deallocate. + */ +void +rte_fbk_hash_free(struct rte_fbk_hash_table *ht) +{ + struct rte_tailq_entry *te; + struct rte_fbk_hash_list *fbk_hash_list; + + if (ht == NULL) + return; + + fbk_hash_list = RTE_TAILQ_CAST(rte_fbk_hash_tailq.head, + rte_fbk_hash_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find out tailq entry */ + TAILQ_FOREACH(te, fbk_hash_list, next) { + if (te->data == (void *) ht) + break; + } + + if (te == NULL) { + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return; + } + + TAILQ_REMOVE(fbk_hash_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_free(ht); + rte_free(te); +} diff --git a/src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.h b/src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.h new file mode 100644 index 00000000..c4d6976d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_fbk_hash.h @@ -0,0 +1,360 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_FBK_HASH_H_ +#define _RTE_FBK_HASH_H_ + +/** + * @file + * + * This is a hash table implementation for four byte keys (fbk). 
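/*
 * An illustrative recap of the bucket arithmetic configured in
 * rte_fbk_hash_create() above (entries and entries_per_bucket are both
 * powers of two): the bucket id is masked out of the hash and shifted
 * into an offset within the flat entry array. __builtin_ctz() yields the
 * same shift value the bit-counting loop above derives. The helper name
 * is illustrative.
 */
#include <stdint.h>

static inline uint32_t
example_fbk_bucket_offset(uint32_t hash, uint32_t entries,
		uint32_t entries_per_bucket)
{
	const uint32_t bucket_mask = (entries / entries_per_bucket) - 1;
	const uint32_t bucket_shift = __builtin_ctz(entries_per_bucket);

	return (hash & bucket_mask) << bucket_shift;
}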
+ * + * Note that the return value of the add function should always be checked as, + * if a bucket is full, the key is not added even if there is space in other + * buckets. This keeps the lookup function very simple and therefore fast. + */ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include + +#ifndef RTE_FBK_HASH_INIT_VAL_DEFAULT +/** Initialising value used when calculating hash. */ +#define RTE_FBK_HASH_INIT_VAL_DEFAULT 0xFFFFFFFF +#endif + +/** The maximum number of entries in the hash table that is supported. */ +#define RTE_FBK_HASH_ENTRIES_MAX (1 << 20) + +/** The maximum number of entries in each bucket that is supported. */ +#define RTE_FBK_HASH_ENTRIES_PER_BUCKET_MAX 256 + +/** Maximum size of string for naming the hash. */ +#define RTE_FBK_HASH_NAMESIZE 32 + +/** Type of function that can be used for calculating the hash value. */ +typedef uint32_t (*rte_fbk_hash_fn)(uint32_t key, uint32_t init_val); + +/** Parameters used when creating four-byte key hash table. */ +struct rte_fbk_hash_params { + const char *name; /**< Name of the hash table. */ + uint32_t entries; /**< Total number of entries. */ + uint32_t entries_per_bucket; /**< Number of entries in a bucket. */ + int socket_id; /**< Socket to allocate memory on. */ + rte_fbk_hash_fn hash_func; /**< The hash function. */ + uint32_t init_val; /**< For initialising hash function. */ +}; + +/** Individual entry in the four-byte key hash table. */ +union rte_fbk_hash_entry { + uint64_t whole_entry; /**< For accessing entire entry. */ + struct { + uint16_t is_entry; /**< Non-zero if entry is active. */ + uint16_t value; /**< Value returned by lookup. */ + uint32_t key; /**< Key used to find value. */ + } entry; /**< For accessing each entry part. */ +}; + + +/** The four-byte key hash table structure. */ +struct rte_fbk_hash_table { + char name[RTE_FBK_HASH_NAMESIZE]; /**< Name of the hash. */ + uint32_t entries; /**< Total number of entries. */ + uint32_t entries_per_bucket; /**< Number of entries in a bucket. */ + uint32_t used_entries; /**< How many entries are used. */ + uint32_t bucket_mask; /**< To find which bucket the key is in. */ + uint32_t bucket_shift; /**< Convert bucket to table offset. */ + rte_fbk_hash_fn hash_func; /**< The hash function. */ + uint32_t init_val; /**< For initialising hash function. */ + + /** A flat table of all buckets. */ + union rte_fbk_hash_entry t[]; +}; + +/** + * Find the offset into hash table of the bucket containing a particular key. + * + * @param ht + * Pointer to hash table. + * @param key + * Key to calculate bucket for. + * @return + * Offset into hash table. + */ +static inline uint32_t +rte_fbk_hash_get_bucket(const struct rte_fbk_hash_table *ht, uint32_t key) +{ + return (ht->hash_func(key, ht->init_val) & ht->bucket_mask) << + ht->bucket_shift; +} + +/** + * Add a key to an existing hash table with bucket id. + * This operation is not multi-thread safe + * and should only be called from one thread. + * + * @param ht + * Hash table to add the key to. + * @param key + * Key to add to the hash table. + * @param value + * Value to associate with key. + * @param bucket + * Bucket to associate with key. + * @return + * 0 if ok, or negative value on error. + */ +static inline int +rte_fbk_hash_add_key_with_bucket(struct rte_fbk_hash_table *ht, + uint32_t key, uint16_t value, uint32_t bucket) +{ + /* + * The writing of a new value to the hash table is done as a single + * 64bit operation. 
This should help prevent individual entries being + * corrupted due to race conditions, but it's still possible to + * overwrite entries that have just been made valid. + */ + const uint64_t new_entry = ((uint64_t)(key) << 32) | + ((uint64_t)(value) << 16) | + 1; /* 1 = is_entry bit. */ + uint32_t i; + + for (i = 0; i < ht->entries_per_bucket; i++) { + /* Set entry if unused. */ + if (! ht->t[bucket + i].entry.is_entry) { + ht->t[bucket + i].whole_entry = new_entry; + ht->used_entries++; + return 0; + } + /* Change value if key already exists. */ + if (ht->t[bucket + i].entry.key == key) { + ht->t[bucket + i].entry.value = value; + return 0; + } + } + + return -ENOSPC; /* No space in bucket. */ +} + +/** + * Add a key to an existing hash table. This operation is not multi-thread safe + * and should only be called from one thread. + * + * @param ht + * Hash table to add the key to. + * @param key + * Key to add to the hash table. + * @param value + * Value to associate with key. + * @return + * 0 if ok, or negative value on error. + */ +static inline int +rte_fbk_hash_add_key(struct rte_fbk_hash_table *ht, + uint32_t key, uint16_t value) +{ + return rte_fbk_hash_add_key_with_bucket(ht, + key, value, rte_fbk_hash_get_bucket(ht, key)); +} + +/** + * Remove a key with a given bucket id from an existing hash table. + * This operation is not multi-thread + * safe and should only be called from one thread. + * + * @param ht + * Hash table to remove the key from. + * @param key + * Key to remove from the hash table. + * @param bucket + * Bucket id associate with key. + * @return + * 0 if ok, or negative value on error. + */ +static inline int +rte_fbk_hash_delete_key_with_bucket(struct rte_fbk_hash_table *ht, + uint32_t key, uint32_t bucket) +{ + uint32_t last_entry = ht->entries_per_bucket - 1; + uint32_t i, j; + + for (i = 0; i < ht->entries_per_bucket; i++) { + if (ht->t[bucket + i].entry.key == key) { + /* Find last key in bucket. */ + for (j = ht->entries_per_bucket - 1; j > i; j-- ) { + if (! ht->t[bucket + j].entry.is_entry) { + last_entry = j - 1; + } + } + /* + * Move the last key to the deleted key's position, and + * delete the last key. lastEntry and i may be same but + * it doesn't matter. + */ + ht->t[bucket + i].whole_entry = + ht->t[bucket + last_entry].whole_entry; + ht->t[bucket + last_entry].whole_entry = 0; + + ht->used_entries--; + return 0; + } + } + + return -ENOENT; /* Key didn't exist. */ +} + +/** + * Remove a key from an existing hash table. This operation is not multi-thread + * safe and should only be called from one thread. + * + * @param ht + * Hash table to remove the key from. + * @param key + * Key to remove from the hash table. + * @return + * 0 if ok, or negative value on error. + */ +static inline int +rte_fbk_hash_delete_key(struct rte_fbk_hash_table *ht, uint32_t key) +{ + return rte_fbk_hash_delete_key_with_bucket(ht, + key, rte_fbk_hash_get_bucket(ht, key)); +} + +/** + * Find a key in the hash table with a given bucketid. + * This operation is multi-thread safe. + * + * @param ht + * Hash table to look in. + * @param key + * Key to find. + * @param bucket + * Bucket associate to the key. + * @return + * The value that was associated with the key, or negative value on error. + */ +static inline int +rte_fbk_hash_lookup_with_bucket(const struct rte_fbk_hash_table *ht, + uint32_t key, uint32_t bucket) +{ + union rte_fbk_hash_entry current_entry; + uint32_t i; + + for (i = 0; i < ht->entries_per_bucket; i++) { + /* Single read of entry, which should be atomic. 
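/*
 * An illustrative sketch of the single-word entry encoding used above:
 * key, value and the is_entry flag are packed into one 64-bit value so a
 * new entry becomes visible to lookups with a single store. The helper
 * names are illustrative.
 */
#include <stdint.h>

static inline uint64_t
example_fbk_pack_entry(uint32_t key, uint16_t value)
{
	return ((uint64_t)key << 32) | ((uint64_t)value << 16) |
		1; /* 1 = is_entry bit */
}

static inline uint16_t
example_fbk_entry_value(uint64_t whole_entry)
{
	return (uint16_t)(whole_entry >> 16);
}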
*/ + current_entry.whole_entry = ht->t[bucket + i].whole_entry; + if (! current_entry.entry.is_entry) { + return -ENOENT; /* Error once we hit an empty field. */ + } + if (current_entry.entry.key == key) { + return current_entry.entry.value; + } + } + return -ENOENT; /* Key didn't exist. */ +} + +/** + * Find a key in the hash table. This operation is multi-thread safe. + * + * @param ht + * Hash table to look in. + * @param key + * Key to find. + * @return + * The value that was associated with the key, or negative value on error. + */ +static inline int +rte_fbk_hash_lookup(const struct rte_fbk_hash_table *ht, uint32_t key) +{ + return rte_fbk_hash_lookup_with_bucket(ht, + key, rte_fbk_hash_get_bucket(ht, key)); +} + +/** + * Delete all entries in a hash table. This operation is not multi-thread + * safe and should only be called from one thread. + * + * @param ht + * Hash table to delete entries in. + */ +static inline void +rte_fbk_hash_clear_all(struct rte_fbk_hash_table *ht) +{ + memset(ht->t, 0, sizeof(ht->t[0]) * ht->entries); + ht->used_entries = 0; +} + +/** + * Find what fraction of entries are being used. + * + * @param ht + * Hash table to find how many entries are being used in. + * @return + * Load factor of the hash table, or negative value on error. + */ +static inline double +rte_fbk_hash_get_load_factor(struct rte_fbk_hash_table *ht) +{ + return (double)ht->used_entries / (double)ht->entries; +} + +/** + * Performs a lookup for an existing hash table, and returns a pointer to + * the table if found. + * + * @param name + * Name of the hash table to find + * + * @return + * pointer to hash table structure or NULL on error with rte_errno + * set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_fbk_hash_table *rte_fbk_hash_find_existing(const char *name); + +/** + * Create a new hash table for use with four byte keys. + * + * @param params + * Parameters used in creation of hash table. + * + * @return + * Pointer to hash table structure that is used in future hash table + * operations, or NULL on error with rte_errno set appropriately. + * Possible rte_errno error values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - invalid parameter value passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_fbk_hash_table * \ +rte_fbk_hash_create(const struct rte_fbk_hash_params *params); + +/** + * Free all memory used by a hash table. + * Has no effect on hash tables allocated in memory zones + * + * @param ht + * Hash table to deallocate. + */ +void rte_fbk_hash_free(struct rte_fbk_hash_table *ht); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_FBK_HASH_H_ */ diff --git a/src/spdk/dpdk/lib/librte_hash/rte_hash.h b/src/spdk/dpdk/lib/librte_hash/rte_hash.h new file mode 100644 index 00000000..9e7d9315 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_hash.h @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation + */ + +#ifndef _RTE_HASH_H_ +#define _RTE_HASH_H_ + +/** + * @file + * + * RTE Hash Table + */ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** Maximum size of hash table that can be created. 
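/*
 * A hypothetical end-to-end sketch of the four-byte-key API declared
 * above; the table name, sizes and key/value numbers are placeholders.
 */
#include <errno.h>
#include <rte_errno.h>
#include <rte_fbk_hash.h>

static inline int
example_fbk_usage(void)
{
	struct rte_fbk_hash_params params = {
		.name = "example_fbk",
		.entries = 1024,		/* must be a power of two */
		.entries_per_bucket = 4,	/* must be a power of two */
		.socket_id = 0,
		/* hash_func left NULL: the default hash function is used */
	};
	struct rte_fbk_hash_table *ht = rte_fbk_hash_create(&params);
	int value;

	if (ht == NULL)
		return -rte_errno;

	if (rte_fbk_hash_add_key(ht, 0x0a000001 /* key */, 42 /* value */) != 0) {
		rte_fbk_hash_free(ht);
		return -ENOSPC;
	}

	value = rte_fbk_hash_lookup(ht, 0x0a000001);	/* 42 on success */

	rte_fbk_hash_free(ht);
	return value;
}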
*/ +#define RTE_HASH_ENTRIES_MAX (1 << 30) + +/** Maximum number of characters in hash name.*/ +#define RTE_HASH_NAMESIZE 32 + +/** Maximum number of keys that can be searched for using rte_hash_lookup_bulk. */ +#define RTE_HASH_LOOKUP_BULK_MAX 64 +#define RTE_HASH_LOOKUP_MULTI_MAX RTE_HASH_LOOKUP_BULK_MAX + +/** Enable Hardware transactional memory support. */ +#define RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT 0x01 + +/** Default behavior of insertion, single writer/multi writer */ +#define RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD 0x02 + +/** Flag to support reader writer concurrency */ +#define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY 0x04 + +/** Signature of key that is stored internally. */ +typedef uint32_t hash_sig_t; + +/** Type of function that can be used for calculating the hash value. */ +typedef uint32_t (*rte_hash_function)(const void *key, uint32_t key_len, + uint32_t init_val); + +/** Type of function used to compare the hash key. */ +typedef int (*rte_hash_cmp_eq_t)(const void *key1, const void *key2, size_t key_len); + +/** + * Parameters used when creating the hash table. + */ +struct rte_hash_parameters { + const char *name; /**< Name of the hash. */ + uint32_t entries; /**< Total hash table entries. */ + uint32_t reserved; /**< Unused field. Should be set to 0 */ + uint32_t key_len; /**< Length of hash key. */ + rte_hash_function hash_func; /**< Primary Hash function used to calculate hash. */ + uint32_t hash_func_init_val; /**< Init value used by hash_func. */ + int socket_id; /**< NUMA Socket ID for memory. */ + uint8_t extra_flag; /**< Indicate if additional parameters are present. */ +}; + +/** @internal A hash table structure. */ +struct rte_hash; + +/** + * Create a new hash table. + * + * @param params + * Parameters used to create and initialise the hash table. + * @return + * Pointer to hash table structure that is used in future hash table + * operations, or NULL on error, with error code set in rte_errno. + * Possible rte_errno errors include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - ENOENT - missing entry + * - EINVAL - invalid parameter passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_hash * +rte_hash_create(const struct rte_hash_parameters *params); + +/** + * Set a new hash compare function other than the default one. + * + * @note Function pointer does not work with multi-process, so do not use it + * in multi-process mode. + * + * @param h + * Hash table for which the function is to be changed + * @param func + * New compare function + */ +void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func); + +/** + * Find an existing hash table object and return a pointer to it. + * + * @param name + * Name of the hash table as passed to rte_hash_create() + * @return + * Pointer to hash table or NULL if object not found + * with rte_errno set appropriately. Possible rte_errno values include: + * - ENOENT - value not available for return + */ +struct rte_hash * +rte_hash_find_existing(const char *name); + +/** + * De-allocate all memory used by hash table. 
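/*
 * A hypothetical creation sketch for the cuckoo hash table declared
 * above; the table name, sizes and flag choice are placeholders, and
 * rte_jhash is one of the hash functions shipped alongside this library.
 */
#include <stdint.h>
#include <rte_hash.h>
#include <rte_jhash.h>

static inline struct rte_hash *
example_hash_create(void)
{
	struct rte_hash_parameters params = {
		.name = "example_flows",
		.entries = 1 << 16,
		.key_len = 4 * sizeof(uint32_t),	/* caller-defined key layout */
		.hash_func = rte_jhash,
		.hash_func_init_val = 0,
		.socket_id = 0,
		.extra_flag = RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY,
	};

	/* NULL is returned on failure, with the cause in rte_errno */
	return rte_hash_create(&params);
}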
+ * @param h + * Hash table to free + */ +void +rte_hash_free(struct rte_hash *h); + +/** + * Reset all hash structure, by zeroing all entries + * @param h + * Hash table to reset + */ +void +rte_hash_reset(struct rte_hash *h); + +/** + * Return the number of keys in the hash table + * @param h + * Hash table to query from + * @return + * - -EINVAL if parameters are invalid + * - A value indicating how many keys were inserted in the table. + */ +int32_t +rte_hash_count(const struct rte_hash *h); + +/** + * Add a key-value pair to an existing hash table. + * This operation is not multi-thread safe + * and should only be called from one thread by default. + * Thread safety can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to add the key to. + * @param key + * Key to add to the hash table. + * @param data + * Data to add to the hash table. + * @return + * - 0 if added successfully + * - -EINVAL if the parameters are invalid. + * - -ENOSPC if there is no space in the hash for this key. + */ +int +rte_hash_add_key_data(const struct rte_hash *h, const void *key, void *data); + +/** + * Add a key-value pair with a pre-computed hash value + * to an existing hash table. + * This operation is not multi-thread safe + * and should only be called from one thread by default. + * Thread safety can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to add the key to. + * @param key + * Key to add to the hash table. + * @param sig + * Precomputed hash value for 'key' + * @param data + * Data to add to the hash table. + * @return + * - 0 if added successfully + * - -EINVAL if the parameters are invalid. + * - -ENOSPC if there is no space in the hash for this key. + */ +int32_t +rte_hash_add_key_with_hash_data(const struct rte_hash *h, const void *key, + hash_sig_t sig, void *data); + +/** + * Add a key to an existing hash table. This operation is not multi-thread safe + * and should only be called from one thread by default. + * Thread safety can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to add the key to. + * @param key + * Key to add to the hash table. + * @return + * - -EINVAL if the parameters are invalid. + * - -ENOSPC if there is no space in the hash for this key. + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key. + */ +int32_t +rte_hash_add_key(const struct rte_hash *h, const void *key); + +/** + * Add a key to an existing hash table. + * This operation is not multi-thread safe + * and should only be called from one thread by default. + * Thread safety can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to add the key to. + * @param key + * Key to add to the hash table. + * @param sig + * Precomputed hash value for 'key'. + * @return + * - -EINVAL if the parameters are invalid. + * - -ENOSPC if there is no space in the hash for this key. + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key. + */ +int32_t +rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t sig); + +/** + * Remove a key from an existing hash table. + * This operation is not multi-thread safe + * and should only be called from one thread by default. + * Thread safety can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to remove the key from. 
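/*
 * A hypothetical round-trip sketch for the add/lookup/delete calls
 * documented above; 'h' is assumed to come from rte_hash_create() and the
 * key/value layout is illustrative.
 */
#include <stdint.h>
#include <rte_hash.h>

struct example_flow {
	uint32_t key[4];
	void *state;
};

static inline int32_t
example_round_trip(const struct rte_hash *h, struct example_flow *flow)
{
	void *found = NULL;
	int ret = rte_hash_add_key_data(h, flow->key, flow->state);

	if (ret < 0)
		return ret;		/* -EINVAL or -ENOSPC */

	if (rte_hash_lookup_data(h, flow->key, &found) < 0 ||
			found != flow->state)
		return -1;

	return rte_hash_del_key(h, flow->key);	/* the key's position */
}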
+ * @param key + * Key to remove from the hash table. + * @return + * - -EINVAL if the parameters are invalid. + * - -ENOENT if the key is not found. + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key, and is the same + * value that was returned when the key was added. + */ +int32_t +rte_hash_del_key(const struct rte_hash *h, const void *key); + +/** + * Remove a key from an existing hash table. + * This operation is not multi-thread safe + * and should only be called from one thread by default. + * Thread safety can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to remove the key from. + * @param key + * Key to remove from the hash table. + * @param sig + * Precomputed hash value for 'key'. + * @return + * - -EINVAL if the parameters are invalid. + * - -ENOENT if the key is not found. + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key, and is the same + * value that was returned when the key was added. + */ +int32_t +rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key, hash_sig_t sig); + +/** + * Find a key in the hash table given the position. + * This operation is multi-thread safe with regarding to other lookup threads. + * Read-write concurrency can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to get the key from. + * @param position + * Position returned when the key was inserted. + * @param key + * Output containing a pointer to the key + * @return + * - 0 if retrieved successfully + * - -EINVAL if the parameters are invalid. + * - -ENOENT if no valid key is found in the given position. + */ +int +rte_hash_get_key_with_position(const struct rte_hash *h, const int32_t position, + void **key); + +/** + * Find a key-value pair in the hash table. + * This operation is multi-thread safe with regarding to other lookup threads. + * Read-write concurrency can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to look in. + * @param key + * Key to find. + * @param data + * Output with pointer to data returned from the hash table. + * @return + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key, and is the same + * value that was returned when the key was added. + * - -EINVAL if the parameters are invalid. + * - -ENOENT if the key is not found. + */ +int +rte_hash_lookup_data(const struct rte_hash *h, const void *key, void **data); + +/** + * Find a key-value pair with a pre-computed hash value + * to an existing hash table. + * This operation is multi-thread safe with regarding to other lookup threads. + * Read-write concurrency can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to look in. + * @param key + * Key to find. + * @param sig + * Precomputed hash value for 'key' + * @param data + * Output with pointer to data returned from the hash table. + * @return + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key, and is the same + * value that was returned when the key was added. + * - -EINVAL if the parameters are invalid. + * - -ENOENT if the key is not found. 
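+ *
+ * Illustrative sketch of computing one signature with rte_hash_hash() and
+ * reusing it for both the add and the lookup. The table 'h', the key value
+ * and the stored pointer are placeholders assumed by this example.
+ *
+ * @code
+ *     uint32_t key = 7;
+ *     void *val = NULL;
+ *     hash_sig_t sig = rte_hash_hash(h, &key);        // compute once
+ *     if (rte_hash_add_key_with_hash_data(h, &key, sig, (void *)"payload") == 0 &&
+ *             rte_hash_lookup_with_hash_data(h, &key, sig, &val) >= 0) {
+ *         // val now points at "payload"
+ *     }
+ * @endcode
+ *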
+ */ +int +rte_hash_lookup_with_hash_data(const struct rte_hash *h, const void *key, + hash_sig_t sig, void **data); + +/** + * Find a key in the hash table. + * This operation is multi-thread safe with regarding to other lookup threads. + * Read-write concurrency can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to look in. + * @param key + * Key to find. + * @return + * - -EINVAL if the parameters are invalid. + * - -ENOENT if the key is not found. + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key, and is the same + * value that was returned when the key was added. + */ +int32_t +rte_hash_lookup(const struct rte_hash *h, const void *key); + +/** + * Find a key in the hash table. + * This operation is multi-thread safe with regarding to other lookup threads. + * Read-write concurrency can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to look in. + * @param key + * Key to find. + * @param sig + * Precomputed hash value for 'key'. + * @return + * - -EINVAL if the parameters are invalid. + * - -ENOENT if the key is not found. + * - A positive value that can be used by the caller as an offset into an + * array of user data. This value is unique for this key, and is the same + * value that was returned when the key was added. + */ +int32_t +rte_hash_lookup_with_hash(const struct rte_hash *h, + const void *key, hash_sig_t sig); + +/** + * Calc a hash value by key. + * This operation is not multi-thread safe. + * + * @param h + * Hash table to look in. + * @param key + * Key to find. + * @return + * - hash value + */ +hash_sig_t +rte_hash_hash(const struct rte_hash *h, const void *key); + +/** + * Find multiple keys in the hash table. + * This operation is multi-thread safe with regarding to other lookup threads. + * Read-write concurrency can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to look in. + * @param keys + * A pointer to a list of keys to look for. + * @param num_keys + * How many keys are in the keys list (less than RTE_HASH_LOOKUP_BULK_MAX). + * @param hit_mask + * Output containing a bitmask with all successful lookups. + * @param data + * Output containing array of data returned from all the successful lookups. + * @return + * -EINVAL if there's an error, otherwise number of successful lookups. + */ +int +rte_hash_lookup_bulk_data(const struct rte_hash *h, const void **keys, + uint32_t num_keys, uint64_t *hit_mask, void *data[]); + +/** + * Find multiple keys in the hash table. + * This operation is multi-thread safe with regarding to other lookup threads. + * Read-write concurrency can be enabled by setting flag during + * table creation. + * + * @param h + * Hash table to look in. + * @param keys + * A pointer to a list of keys to look for. + * @param num_keys + * How many keys are in the keys list (less than RTE_HASH_LOOKUP_BULK_MAX). + * @param positions + * Output containing a list of values, corresponding to the list of keys that + * can be used by the caller as an offset into an array of user data. These + * values are unique for each key, and are the same values that were returned + * when each key was added. If a key in the list was not found, then -ENOENT + * will be the value. + * @return + * -EINVAL if there's an error, otherwise 0. 
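+ *
+ * Illustrative sketch of a bulk lookup; the table 'h' and the two keys are
+ * placeholders assumed by this example, and num_keys is bounded by
+ * RTE_HASH_LOOKUP_BULK_MAX as described above.
+ *
+ * @code
+ *     uint32_t k0 = 1, k1 = 2;
+ *     const void *keys[2] = { &k0, &k1 };
+ *     int32_t positions[2];
+ *     if (rte_hash_lookup_bulk(h, keys, 2, positions) == 0) {
+ *         // positions[i] is the slot index of keys[i], or -ENOENT if absent
+ *     }
+ * @endcode
+ *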
+ */ +int +rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys, + uint32_t num_keys, int32_t *positions); + +/** + * Iterate through the hash table, returning key-value pairs. + * + * @param h + * Hash table to iterate + * @param key + * Output containing the key where current iterator + * was pointing at + * @param data + * Output containing the data associated with key. + * Returns NULL if data was not stored. + * @param next + * Pointer to iterator. Should be 0 to start iterating the hash table. + * Iterator is incremented after each call of this function. + * @return + * Position where key was stored, if successful. + * - -EINVAL if the parameters are invalid. + * - -ENOENT if end of the hash table. + */ +int32_t +rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32_t *next); +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_HASH_H_ */ diff --git a/src/spdk/dpdk/lib/librte_hash/rte_hash_crc.h b/src/spdk/dpdk/lib/librte_hash/rte_hash_crc.h new file mode 100644 index 00000000..cf28031b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_hash_crc.h @@ -0,0 +1,601 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_HASH_CRC_H_ +#define _RTE_HASH_CRC_H_ + +/** + * @file + * + * RTE CRC Hash + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include + +/* Lookup tables for software implementation of CRC32C */ +static const uint32_t crc32c_tables[8][256] = {{ + 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, + 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, + 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, + 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, + 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, + 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, + 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, + 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, + 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, + 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, + 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, + 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, + 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, + 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, + 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, + 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, + 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, + 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, + 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, + 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, + 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 
0x97BAA1BA, 0x84EA524E, 0x7681D14D, + 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, + 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, + 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, + 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, + 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, + 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, + 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, + 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, + 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, + 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, + 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 +}, +{ + 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945, + 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD, + 0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, 0x714F905D, 0x62ED082A, 0x560AA0B3, 0x45A838C4, + 0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C, + 0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B, 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47, + 0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823, 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF, + 0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6, + 0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, 0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E, + 0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D, 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41, + 0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9, + 0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0, + 0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4, 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78, + 0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F, 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43, + 0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB, + 0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2, + 0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A, + 0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC, + 0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004, + 0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1, 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D, + 0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059, 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185, + 0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162, 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE, + 0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306, + 0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F, + 0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287, + 0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, 0x4A5E5D21, 0x59FCC556, 
0x6D1B6DCF, 0x7EB9F5B8, + 0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC, 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600, + 0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5, 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439, + 0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D, 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781, + 0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA, + 0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502, + 0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B, + 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483 +}, +{ + 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469, + 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC, + 0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3, + 0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726, + 0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67, 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D, + 0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2, 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8, + 0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED, 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7, + 0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32, + 0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0, + 0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75, + 0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A, + 0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5, 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF, + 0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE, 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4, + 0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B, 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161, + 0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E, + 0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB, + 0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A, + 0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF, + 0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA, 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0, + 0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F, 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065, + 0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24, 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E, + 0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB, + 0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4, + 0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71, + 0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3, + 0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C, 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36, + 0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63, 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79, + 0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC, + 0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 
0xD9F668E7, + 0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622, + 0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D, + 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8 +}, +{ + 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA, + 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C, + 0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7, + 0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11, + 0xD725148B, 0x0A60BE33, 0x6842370A, 0xB5079DB2, 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41, + 0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54, 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7, + 0x3E41A5B6, 0xE3040F0E, 0x81268637, 0x5C632C8F, 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C, + 0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A, + 0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D, + 0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538, 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB, + 0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610, + 0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405, 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6, + 0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255, 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6, + 0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3, 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040, + 0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B, + 0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D, + 0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5, + 0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213, + 0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B, 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8, + 0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD, 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E, + 0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E, + 0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698, + 0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443, + 0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5, + 0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12, + 0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07, 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4, + 0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC, 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F, + 0xE6264403, 0x3B63EEBB, 0x59416782, 0x8404CD3A, 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9, + 0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A, 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99, + 0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F, + 0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4, + 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842 +}, +{ + 0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, 0xE045BEB0, 0xD854D11C, 0x906761E8, 0xA8760E44, + 
0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, 0x2522B521, 0x1D33DA8D, 0x55006A79, 0x6D1105D5, + 0x8F2261D3, 0xB7330E7F, 0xFF00BE8B, 0xC711D127, 0x6F67DF63, 0x5776B0CF, 0x1F45003B, 0x27546F97, + 0x4A456A42, 0x725405EE, 0x3A67B51A, 0x0276DAB6, 0xAA00D4F2, 0x9211BB5E, 0xDA220BAA, 0xE2336406, + 0x1BA8B557, 0x23B9DAFB, 0x6B8A6A0F, 0x539B05A3, 0xFBED0BE7, 0xC3FC644B, 0x8BCFD4BF, 0xB3DEBB13, + 0xDECFBEC6, 0xE6DED16A, 0xAEED619E, 0x96FC0E32, 0x3E8A0076, 0x069B6FDA, 0x4EA8DF2E, 0x76B9B082, + 0x948AD484, 0xAC9BBB28, 0xE4A80BDC, 0xDCB96470, 0x74CF6A34, 0x4CDE0598, 0x04EDB56C, 0x3CFCDAC0, + 0x51EDDF15, 0x69FCB0B9, 0x21CF004D, 0x19DE6FE1, 0xB1A861A5, 0x89B90E09, 0xC18ABEFD, 0xF99BD151, + 0x37516AAE, 0x0F400502, 0x4773B5F6, 0x7F62DA5A, 0xD714D41E, 0xEF05BBB2, 0xA7360B46, 0x9F2764EA, + 0xF236613F, 0xCA270E93, 0x8214BE67, 0xBA05D1CB, 0x1273DF8F, 0x2A62B023, 0x625100D7, 0x5A406F7B, + 0xB8730B7D, 0x806264D1, 0xC851D425, 0xF040BB89, 0x5836B5CD, 0x6027DA61, 0x28146A95, 0x10050539, + 0x7D1400EC, 0x45056F40, 0x0D36DFB4, 0x3527B018, 0x9D51BE5C, 0xA540D1F0, 0xED736104, 0xD5620EA8, + 0x2CF9DFF9, 0x14E8B055, 0x5CDB00A1, 0x64CA6F0D, 0xCCBC6149, 0xF4AD0EE5, 0xBC9EBE11, 0x848FD1BD, + 0xE99ED468, 0xD18FBBC4, 0x99BC0B30, 0xA1AD649C, 0x09DB6AD8, 0x31CA0574, 0x79F9B580, 0x41E8DA2C, + 0xA3DBBE2A, 0x9BCAD186, 0xD3F96172, 0xEBE80EDE, 0x439E009A, 0x7B8F6F36, 0x33BCDFC2, 0x0BADB06E, + 0x66BCB5BB, 0x5EADDA17, 0x169E6AE3, 0x2E8F054F, 0x86F90B0B, 0xBEE864A7, 0xF6DBD453, 0xCECABBFF, + 0x6EA2D55C, 0x56B3BAF0, 0x1E800A04, 0x269165A8, 0x8EE76BEC, 0xB6F60440, 0xFEC5B4B4, 0xC6D4DB18, + 0xABC5DECD, 0x93D4B161, 0xDBE70195, 0xE3F66E39, 0x4B80607D, 0x73910FD1, 0x3BA2BF25, 0x03B3D089, + 0xE180B48F, 0xD991DB23, 0x91A26BD7, 0xA9B3047B, 0x01C50A3F, 0x39D46593, 0x71E7D567, 0x49F6BACB, + 0x24E7BF1E, 0x1CF6D0B2, 0x54C56046, 0x6CD40FEA, 0xC4A201AE, 0xFCB36E02, 0xB480DEF6, 0x8C91B15A, + 0x750A600B, 0x4D1B0FA7, 0x0528BF53, 0x3D39D0FF, 0x954FDEBB, 0xAD5EB117, 0xE56D01E3, 0xDD7C6E4F, + 0xB06D6B9A, 0x887C0436, 0xC04FB4C2, 0xF85EDB6E, 0x5028D52A, 0x6839BA86, 0x200A0A72, 0x181B65DE, + 0xFA2801D8, 0xC2396E74, 0x8A0ADE80, 0xB21BB12C, 0x1A6DBF68, 0x227CD0C4, 0x6A4F6030, 0x525E0F9C, + 0x3F4F0A49, 0x075E65E5, 0x4F6DD511, 0x777CBABD, 0xDF0AB4F9, 0xE71BDB55, 0xAF286BA1, 0x9739040D, + 0x59F3BFF2, 0x61E2D05E, 0x29D160AA, 0x11C00F06, 0xB9B60142, 0x81A76EEE, 0xC994DE1A, 0xF185B1B6, + 0x9C94B463, 0xA485DBCF, 0xECB66B3B, 0xD4A70497, 0x7CD10AD3, 0x44C0657F, 0x0CF3D58B, 0x34E2BA27, + 0xD6D1DE21, 0xEEC0B18D, 0xA6F30179, 0x9EE26ED5, 0x36946091, 0x0E850F3D, 0x46B6BFC9, 0x7EA7D065, + 0x13B6D5B0, 0x2BA7BA1C, 0x63940AE8, 0x5B856544, 0xF3F36B00, 0xCBE204AC, 0x83D1B458, 0xBBC0DBF4, + 0x425B0AA5, 0x7A4A6509, 0x3279D5FD, 0x0A68BA51, 0xA21EB415, 0x9A0FDBB9, 0xD23C6B4D, 0xEA2D04E1, + 0x873C0134, 0xBF2D6E98, 0xF71EDE6C, 0xCF0FB1C0, 0x6779BF84, 0x5F68D028, 0x175B60DC, 0x2F4A0F70, + 0xCD796B76, 0xF56804DA, 0xBD5BB42E, 0x854ADB82, 0x2D3CD5C6, 0x152DBA6A, 0x5D1E0A9E, 0x650F6532, + 0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3 +}, +{ + 0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, 0xB2F53777, 0x5DC55C6E, 0x697997B4, 0x8649FCAD, + 0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, 0xD2F32F68, 0x3DC34471, 0x097F8FAB, 0xE64FE4B2, + 0xC00C303E, 0x2F3C5B27, 0x1B8090FD, 0xF4B0FBE4, 0x72F90749, 0x9DC96C50, 0xA975A78A, 0x4645CC93, + 0xA00A2821, 0x4F3A4338, 0x7B8688E2, 0x94B6E3FB, 0x12FF1F56, 0xFDCF744F, 0xC973BF95, 0x2643D48C, + 0x85F4168D, 0x6AC47D94, 0x5E78B64E, 0xB148DD57, 0x370121FA, 0xD8314AE3, 0xEC8D8139, 0x03BDEA20, + 0xE5F20E92, 
0x0AC2658B, 0x3E7EAE51, 0xD14EC548, 0x570739E5, 0xB83752FC, 0x8C8B9926, 0x63BBF23F, + 0x45F826B3, 0xAAC84DAA, 0x9E748670, 0x7144ED69, 0xF70D11C4, 0x183D7ADD, 0x2C81B107, 0xC3B1DA1E, + 0x25FE3EAC, 0xCACE55B5, 0xFE729E6F, 0x1142F576, 0x970B09DB, 0x783B62C2, 0x4C87A918, 0xA3B7C201, + 0x0E045BEB, 0xE13430F2, 0xD588FB28, 0x3AB89031, 0xBCF16C9C, 0x53C10785, 0x677DCC5F, 0x884DA746, + 0x6E0243F4, 0x813228ED, 0xB58EE337, 0x5ABE882E, 0xDCF77483, 0x33C71F9A, 0x077BD440, 0xE84BBF59, + 0xCE086BD5, 0x213800CC, 0x1584CB16, 0xFAB4A00F, 0x7CFD5CA2, 0x93CD37BB, 0xA771FC61, 0x48419778, + 0xAE0E73CA, 0x413E18D3, 0x7582D309, 0x9AB2B810, 0x1CFB44BD, 0xF3CB2FA4, 0xC777E47E, 0x28478F67, + 0x8BF04D66, 0x64C0267F, 0x507CEDA5, 0xBF4C86BC, 0x39057A11, 0xD6351108, 0xE289DAD2, 0x0DB9B1CB, + 0xEBF65579, 0x04C63E60, 0x307AF5BA, 0xDF4A9EA3, 0x5903620E, 0xB6330917, 0x828FC2CD, 0x6DBFA9D4, + 0x4BFC7D58, 0xA4CC1641, 0x9070DD9B, 0x7F40B682, 0xF9094A2F, 0x16392136, 0x2285EAEC, 0xCDB581F5, + 0x2BFA6547, 0xC4CA0E5E, 0xF076C584, 0x1F46AE9D, 0x990F5230, 0x763F3929, 0x4283F2F3, 0xADB399EA, + 0x1C08B7D6, 0xF338DCCF, 0xC7841715, 0x28B47C0C, 0xAEFD80A1, 0x41CDEBB8, 0x75712062, 0x9A414B7B, + 0x7C0EAFC9, 0x933EC4D0, 0xA7820F0A, 0x48B26413, 0xCEFB98BE, 0x21CBF3A7, 0x1577387D, 0xFA475364, + 0xDC0487E8, 0x3334ECF1, 0x0788272B, 0xE8B84C32, 0x6EF1B09F, 0x81C1DB86, 0xB57D105C, 0x5A4D7B45, + 0xBC029FF7, 0x5332F4EE, 0x678E3F34, 0x88BE542D, 0x0EF7A880, 0xE1C7C399, 0xD57B0843, 0x3A4B635A, + 0x99FCA15B, 0x76CCCA42, 0x42700198, 0xAD406A81, 0x2B09962C, 0xC439FD35, 0xF08536EF, 0x1FB55DF6, + 0xF9FAB944, 0x16CAD25D, 0x22761987, 0xCD46729E, 0x4B0F8E33, 0xA43FE52A, 0x90832EF0, 0x7FB345E9, + 0x59F09165, 0xB6C0FA7C, 0x827C31A6, 0x6D4C5ABF, 0xEB05A612, 0x0435CD0B, 0x308906D1, 0xDFB96DC8, + 0x39F6897A, 0xD6C6E263, 0xE27A29B9, 0x0D4A42A0, 0x8B03BE0D, 0x6433D514, 0x508F1ECE, 0xBFBF75D7, + 0x120CEC3D, 0xFD3C8724, 0xC9804CFE, 0x26B027E7, 0xA0F9DB4A, 0x4FC9B053, 0x7B757B89, 0x94451090, + 0x720AF422, 0x9D3A9F3B, 0xA98654E1, 0x46B63FF8, 0xC0FFC355, 0x2FCFA84C, 0x1B736396, 0xF443088F, + 0xD200DC03, 0x3D30B71A, 0x098C7CC0, 0xE6BC17D9, 0x60F5EB74, 0x8FC5806D, 0xBB794BB7, 0x544920AE, + 0xB206C41C, 0x5D36AF05, 0x698A64DF, 0x86BA0FC6, 0x00F3F36B, 0xEFC39872, 0xDB7F53A8, 0x344F38B1, + 0x97F8FAB0, 0x78C891A9, 0x4C745A73, 0xA344316A, 0x250DCDC7, 0xCA3DA6DE, 0xFE816D04, 0x11B1061D, + 0xF7FEE2AF, 0x18CE89B6, 0x2C72426C, 0xC3422975, 0x450BD5D8, 0xAA3BBEC1, 0x9E87751B, 0x71B71E02, + 0x57F4CA8E, 0xB8C4A197, 0x8C786A4D, 0x63480154, 0xE501FDF9, 0x0A3196E0, 0x3E8D5D3A, 0xD1BD3623, + 0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C +}, +{ + 0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, 0x1DE5B089, + 0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, 0xEBCD3882, 0x83CE144A, 0x3BCB6112, 0x53C84DDA, + 0x9C5BFAA6, 0xF458D66E, 0x4C5DA336, 0x245E8FFE, 0x39BB3F77, 0x51B813BF, 0xE9BD66E7, 0x81BE4A2F, + 0xD27607F5, 0xBA752B3D, 0x02705E65, 0x6A7372AD, 0x7796C224, 0x1F95EEEC, 0xA7909BB4, 0xCF93B77C, + 0x3D5B83BD, 0x5558AF75, 0xED5DDA2D, 0x855EF6E5, 0x98BB466C, 0xF0B86AA4, 0x48BD1FFC, 0x20BE3334, + 0x73767EEE, 0x1B755226, 0xA370277E, 0xCB730BB6, 0xD696BB3F, 0xBE9597F7, 0x0690E2AF, 0x6E93CE67, + 0xA100791B, 0xC90355D3, 0x7106208B, 0x19050C43, 0x04E0BCCA, 0x6CE39002, 0xD4E6E55A, 0xBCE5C992, + 0xEF2D8448, 0x872EA880, 0x3F2BDDD8, 0x5728F110, 0x4ACD4199, 0x22CE6D51, 0x9ACB1809, 0xF2C834C1, + 0x7AB7077A, 0x12B42BB2, 0xAAB15EEA, 0xC2B27222, 0xDF57C2AB, 0xB754EE63, 0x0F519B3B, 0x6752B7F3, + 0x349AFA29, 0x5C99D6E1, 
0xE49CA3B9, 0x8C9F8F71, 0x917A3FF8, 0xF9791330, 0x417C6668, 0x297F4AA0, + 0xE6ECFDDC, 0x8EEFD114, 0x36EAA44C, 0x5EE98884, 0x430C380D, 0x2B0F14C5, 0x930A619D, 0xFB094D55, + 0xA8C1008F, 0xC0C22C47, 0x78C7591F, 0x10C475D7, 0x0D21C55E, 0x6522E996, 0xDD279CCE, 0xB524B006, + 0x47EC84C7, 0x2FEFA80F, 0x97EADD57, 0xFFE9F19F, 0xE20C4116, 0x8A0F6DDE, 0x320A1886, 0x5A09344E, + 0x09C17994, 0x61C2555C, 0xD9C72004, 0xB1C40CCC, 0xAC21BC45, 0xC422908D, 0x7C27E5D5, 0x1424C91D, + 0xDBB77E61, 0xB3B452A9, 0x0BB127F1, 0x63B20B39, 0x7E57BBB0, 0x16549778, 0xAE51E220, 0xC652CEE8, + 0x959A8332, 0xFD99AFFA, 0x459CDAA2, 0x2D9FF66A, 0x307A46E3, 0x58796A2B, 0xE07C1F73, 0x887F33BB, + 0xF56E0EF4, 0x9D6D223C, 0x25685764, 0x4D6B7BAC, 0x508ECB25, 0x388DE7ED, 0x808892B5, 0xE88BBE7D, + 0xBB43F3A7, 0xD340DF6F, 0x6B45AA37, 0x034686FF, 0x1EA33676, 0x76A01ABE, 0xCEA56FE6, 0xA6A6432E, + 0x6935F452, 0x0136D89A, 0xB933ADC2, 0xD130810A, 0xCCD53183, 0xA4D61D4B, 0x1CD36813, 0x74D044DB, + 0x27180901, 0x4F1B25C9, 0xF71E5091, 0x9F1D7C59, 0x82F8CCD0, 0xEAFBE018, 0x52FE9540, 0x3AFDB988, + 0xC8358D49, 0xA036A181, 0x1833D4D9, 0x7030F811, 0x6DD54898, 0x05D66450, 0xBDD31108, 0xD5D03DC0, + 0x8618701A, 0xEE1B5CD2, 0x561E298A, 0x3E1D0542, 0x23F8B5CB, 0x4BFB9903, 0xF3FEEC5B, 0x9BFDC093, + 0x546E77EF, 0x3C6D5B27, 0x84682E7F, 0xEC6B02B7, 0xF18EB23E, 0x998D9EF6, 0x2188EBAE, 0x498BC766, + 0x1A438ABC, 0x7240A674, 0xCA45D32C, 0xA246FFE4, 0xBFA34F6D, 0xD7A063A5, 0x6FA516FD, 0x07A63A35, + 0x8FD9098E, 0xE7DA2546, 0x5FDF501E, 0x37DC7CD6, 0x2A39CC5F, 0x423AE097, 0xFA3F95CF, 0x923CB907, + 0xC1F4F4DD, 0xA9F7D815, 0x11F2AD4D, 0x79F18185, 0x6414310C, 0x0C171DC4, 0xB412689C, 0xDC114454, + 0x1382F328, 0x7B81DFE0, 0xC384AAB8, 0xAB878670, 0xB66236F9, 0xDE611A31, 0x66646F69, 0x0E6743A1, + 0x5DAF0E7B, 0x35AC22B3, 0x8DA957EB, 0xE5AA7B23, 0xF84FCBAA, 0x904CE762, 0x2849923A, 0x404ABEF2, + 0xB2828A33, 0xDA81A6FB, 0x6284D3A3, 0x0A87FF6B, 0x17624FE2, 0x7F61632A, 0xC7641672, 0xAF673ABA, + 0xFCAF7760, 0x94AC5BA8, 0x2CA92EF0, 0x44AA0238, 0x594FB2B1, 0x314C9E79, 0x8949EB21, 0xE14AC7E9, + 0x2ED97095, 0x46DA5C5D, 0xFEDF2905, 0x96DC05CD, 0x8B39B544, 0xE33A998C, 0x5B3FECD4, 0x333CC01C, + 0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F +}, +{ + 0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, 0x211D826D, 0x6821FF4A, 0xB3657823, 0xFA590504, + 0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, 0x632686B7, 0x2A1AFB90, 0xF15E7CF9, 0xB86201DE, + 0x847609B4, 0xCD4A7493, 0x160EF3FA, 0x5F328EDD, 0xA56B8BD9, 0xEC57F6FE, 0x37137197, 0x7E2F0CB0, + 0xC64D0D6E, 0x8F717049, 0x5435F720, 0x1D098A07, 0xE7508F03, 0xAE6CF224, 0x7528754D, 0x3C14086A, + 0x0D006599, 0x443C18BE, 0x9F789FD7, 0xD644E2F0, 0x2C1DE7F4, 0x65219AD3, 0xBE651DBA, 0xF759609D, + 0x4F3B6143, 0x06071C64, 0xDD439B0D, 0x947FE62A, 0x6E26E32E, 0x271A9E09, 0xFC5E1960, 0xB5626447, + 0x89766C2D, 0xC04A110A, 0x1B0E9663, 0x5232EB44, 0xA86BEE40, 0xE1579367, 0x3A13140E, 0x732F6929, + 0xCB4D68F7, 0x827115D0, 0x593592B9, 0x1009EF9E, 0xEA50EA9A, 0xA36C97BD, 0x782810D4, 0x31146DF3, + 0x1A00CB32, 0x533CB615, 0x8878317C, 0xC1444C5B, 0x3B1D495F, 0x72213478, 0xA965B311, 0xE059CE36, + 0x583BCFE8, 0x1107B2CF, 0xCA4335A6, 0x837F4881, 0x79264D85, 0x301A30A2, 0xEB5EB7CB, 0xA262CAEC, + 0x9E76C286, 0xD74ABFA1, 0x0C0E38C8, 0x453245EF, 0xBF6B40EB, 0xF6573DCC, 0x2D13BAA5, 0x642FC782, + 0xDC4DC65C, 0x9571BB7B, 0x4E353C12, 0x07094135, 0xFD504431, 0xB46C3916, 0x6F28BE7F, 0x2614C358, + 0x1700AEAB, 0x5E3CD38C, 0x857854E5, 0xCC4429C2, 0x361D2CC6, 0x7F2151E1, 0xA465D688, 0xED59ABAF, + 0x553BAA71, 0x1C07D756, 0xC743503F, 
0x8E7F2D18, 0x7426281C, 0x3D1A553B, 0xE65ED252, 0xAF62AF75, + 0x9376A71F, 0xDA4ADA38, 0x010E5D51, 0x48322076, 0xB26B2572, 0xFB575855, 0x2013DF3C, 0x692FA21B, + 0xD14DA3C5, 0x9871DEE2, 0x4335598B, 0x0A0924AC, 0xF05021A8, 0xB96C5C8F, 0x6228DBE6, 0x2B14A6C1, + 0x34019664, 0x7D3DEB43, 0xA6796C2A, 0xEF45110D, 0x151C1409, 0x5C20692E, 0x8764EE47, 0xCE589360, + 0x763A92BE, 0x3F06EF99, 0xE44268F0, 0xAD7E15D7, 0x572710D3, 0x1E1B6DF4, 0xC55FEA9D, 0x8C6397BA, + 0xB0779FD0, 0xF94BE2F7, 0x220F659E, 0x6B3318B9, 0x916A1DBD, 0xD856609A, 0x0312E7F3, 0x4A2E9AD4, + 0xF24C9B0A, 0xBB70E62D, 0x60346144, 0x29081C63, 0xD3511967, 0x9A6D6440, 0x4129E329, 0x08159E0E, + 0x3901F3FD, 0x703D8EDA, 0xAB7909B3, 0xE2457494, 0x181C7190, 0x51200CB7, 0x8A648BDE, 0xC358F6F9, + 0x7B3AF727, 0x32068A00, 0xE9420D69, 0xA07E704E, 0x5A27754A, 0x131B086D, 0xC85F8F04, 0x8163F223, + 0xBD77FA49, 0xF44B876E, 0x2F0F0007, 0x66337D20, 0x9C6A7824, 0xD5560503, 0x0E12826A, 0x472EFF4D, + 0xFF4CFE93, 0xB67083B4, 0x6D3404DD, 0x240879FA, 0xDE517CFE, 0x976D01D9, 0x4C2986B0, 0x0515FB97, + 0x2E015D56, 0x673D2071, 0xBC79A718, 0xF545DA3F, 0x0F1CDF3B, 0x4620A21C, 0x9D642575, 0xD4585852, + 0x6C3A598C, 0x250624AB, 0xFE42A3C2, 0xB77EDEE5, 0x4D27DBE1, 0x041BA6C6, 0xDF5F21AF, 0x96635C88, + 0xAA7754E2, 0xE34B29C5, 0x380FAEAC, 0x7133D38B, 0x8B6AD68F, 0xC256ABA8, 0x19122CC1, 0x502E51E6, + 0xE84C5038, 0xA1702D1F, 0x7A34AA76, 0x3308D751, 0xC951D255, 0x806DAF72, 0x5B29281B, 0x1215553C, + 0x230138CF, 0x6A3D45E8, 0xB179C281, 0xF845BFA6, 0x021CBAA2, 0x4B20C785, 0x906440EC, 0xD9583DCB, + 0x613A3C15, 0x28064132, 0xF342C65B, 0xBA7EBB7C, 0x4027BE78, 0x091BC35F, 0xD25F4436, 0x9B633911, + 0xA777317B, 0xEE4B4C5C, 0x350FCB35, 0x7C33B612, 0x866AB316, 0xCF56CE31, 0x14124958, 0x5D2E347F, + 0xE54C35A1, 0xAC704886, 0x7734CFEF, 0x3E08B2C8, 0xC451B7CC, 0x8D6DCAEB, 0x56294D82, 0x1F1530A5 +}}; + +#define CRC32_UPD(crc, n) \ + (crc32c_tables[(n)][(crc) & 0xFF] ^ \ + crc32c_tables[(n)-1][((crc) >> 8) & 0xFF]) + +static inline uint32_t +crc32c_1byte(uint8_t data, uint32_t init_val) +{ + uint32_t crc; + crc = init_val; + crc ^= data; + + return crc32c_tables[0][crc & 0xff] ^ (crc >> 8); +} + +static inline uint32_t +crc32c_2bytes(uint16_t data, uint32_t init_val) +{ + uint32_t crc; + crc = init_val; + crc ^= data; + + crc = CRC32_UPD(crc, 1) ^ (crc >> 16); + + return crc; +} + +static inline uint32_t +crc32c_1word(uint32_t data, uint32_t init_val) +{ + uint32_t crc, term1, term2; + crc = init_val; + crc ^= data; + + term1 = CRC32_UPD(crc, 3); + term2 = crc >> 16; + crc = term1 ^ CRC32_UPD(term2, 1); + + return crc; +} + +static inline uint32_t +crc32c_2words(uint64_t data, uint32_t init_val) +{ + uint32_t crc, term1, term2; + union { + uint64_t u64; + uint32_t u32[2]; + } d; + d.u64 = data; + + crc = init_val; + crc ^= d.u32[0]; + + term1 = CRC32_UPD(crc, 7); + term2 = crc >> 16; + crc = term1 ^ CRC32_UPD(term2, 5); + term1 = CRC32_UPD(d.u32[1], 3); + term2 = d.u32[1] >> 16; + crc ^= term1 ^ CRC32_UPD(term2, 1); + + return crc; +} + +#if defined(RTE_ARCH_X86) +static inline uint32_t +crc32c_sse42_u8(uint8_t data, uint32_t init_val) +{ + __asm__ volatile( + "crc32b %[data], %[init_val];" + : [init_val] "+r" (init_val) + : [data] "rm" (data)); + return init_val; +} + +static inline uint32_t +crc32c_sse42_u16(uint16_t data, uint32_t init_val) +{ + __asm__ volatile( + "crc32w %[data], %[init_val];" + : [init_val] "+r" (init_val) + : [data] "rm" (data)); + return init_val; +} + +static inline uint32_t +crc32c_sse42_u32(uint32_t data, uint32_t init_val) +{ + __asm__ volatile( + "crc32l %[data], 
%[init_val];" + : [init_val] "+r" (init_val) + : [data] "rm" (data)); + return init_val; +} + +static inline uint32_t +crc32c_sse42_u64_mimic(uint64_t data, uint64_t init_val) +{ + union { + uint32_t u32[2]; + uint64_t u64; + } d; + + d.u64 = data; + init_val = crc32c_sse42_u32(d.u32[0], (uint32_t)init_val); + init_val = crc32c_sse42_u32(d.u32[1], (uint32_t)init_val); + return (uint32_t)init_val; +} +#endif + +#ifdef RTE_ARCH_X86_64 +static inline uint32_t +crc32c_sse42_u64(uint64_t data, uint64_t init_val) +{ + __asm__ volatile( + "crc32q %[data], %[init_val];" + : [init_val] "+r" (init_val) + : [data] "rm" (data)); + return (uint32_t)init_val; +} +#endif + +#define CRC32_SW (1U << 0) +#define CRC32_SSE42 (1U << 1) +#define CRC32_x64 (1U << 2) +#define CRC32_SSE42_x64 (CRC32_x64|CRC32_SSE42) +#define CRC32_ARM64 (1U << 3) + +static uint8_t crc32_alg = CRC32_SW; + +#if defined(RTE_ARCH_ARM64) && defined(RTE_MACHINE_CPUFLAG_CRC32) +#include "rte_crc_arm64.h" +#else + +/** + * Allow or disallow use of SSE4.2 instrinsics for CRC32 hash + * calculation. + * + * @param alg + * An OR of following flags: + * - (CRC32_SW) Don't use SSE4.2 intrinsics + * - (CRC32_SSE42) Use SSE4.2 intrinsics if available + * - (CRC32_SSE42_x64) Use 64-bit SSE4.2 intrinsic if available (default) + * + */ +static inline void +rte_hash_crc_set_alg(uint8_t alg) +{ +#if defined(RTE_ARCH_X86) + if (alg == CRC32_SSE42_x64 && + !rte_cpu_get_flag_enabled(RTE_CPUFLAG_EM64T)) + alg = CRC32_SSE42; +#endif + crc32_alg = alg; +} + +/* Setting the best available algorithm */ +RTE_INIT(rte_hash_crc_init_alg) +{ + rte_hash_crc_set_alg(CRC32_SSE42_x64); +} + +/** + * Use single crc32 instruction to perform a hash on a byte value. + * Fall back to software crc32 implementation in case SSE4.2 is + * not supported + * + * @param data + * Data to perform hash on. + * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. + */ +static inline uint32_t +rte_hash_crc_1byte(uint8_t data, uint32_t init_val) +{ +#if defined RTE_ARCH_X86 + if (likely(crc32_alg & CRC32_SSE42)) + return crc32c_sse42_u8(data, init_val); +#endif + + return crc32c_1byte(data, init_val); +} + +/** + * Use single crc32 instruction to perform a hash on a 2 bytes value. + * Fall back to software crc32 implementation in case SSE4.2 is + * not supported + * + * @param data + * Data to perform hash on. + * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. + */ +static inline uint32_t +rte_hash_crc_2byte(uint16_t data, uint32_t init_val) +{ +#if defined RTE_ARCH_X86 + if (likely(crc32_alg & CRC32_SSE42)) + return crc32c_sse42_u16(data, init_val); +#endif + + return crc32c_2bytes(data, init_val); +} + +/** + * Use single crc32 instruction to perform a hash on a 4 byte value. + * Fall back to software crc32 implementation in case SSE4.2 is + * not supported + * + * @param data + * Data to perform hash on. + * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. + */ +static inline uint32_t +rte_hash_crc_4byte(uint32_t data, uint32_t init_val) +{ +#if defined RTE_ARCH_X86 + if (likely(crc32_alg & CRC32_SSE42)) + return crc32c_sse42_u32(data, init_val); +#endif + + return crc32c_1word(data, init_val); +} + +/** + * Use single crc32 instruction to perform a hash on a 8 byte value. + * Fall back to software crc32 implementation in case SSE4.2 is + * not supported + * + * @param data + * Data to perform hash on. 
+ * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. + */ +static inline uint32_t +rte_hash_crc_8byte(uint64_t data, uint32_t init_val) +{ +#ifdef RTE_ARCH_X86_64 + if (likely(crc32_alg == CRC32_SSE42_x64)) + return crc32c_sse42_u64(data, init_val); +#endif + +#if defined RTE_ARCH_X86 + if (likely(crc32_alg & CRC32_SSE42)) + return crc32c_sse42_u64_mimic(data, init_val); +#endif + + return crc32c_2words(data, init_val); +} + +#endif + +/** + * Calculate CRC32 hash on user-supplied byte array. + * + * @param data + * Data to perform hash on. + * @param data_len + * How many bytes to use to calculate hash value. + * @param init_val + * Value to initialise hash generator. + * @return + * 32bit calculated hash value. + */ +static inline uint32_t +rte_hash_crc(const void *data, uint32_t data_len, uint32_t init_val) +{ + unsigned i; + uintptr_t pd = (uintptr_t) data; + + for (i = 0; i < data_len / 8; i++) { + init_val = rte_hash_crc_8byte(*(const uint64_t *)pd, init_val); + pd += 8; + } + + if (data_len & 0x4) { + init_val = rte_hash_crc_4byte(*(const uint32_t *)pd, init_val); + pd += 4; + } + + if (data_len & 0x2) { + init_val = rte_hash_crc_2byte(*(const uint16_t *)pd, init_val); + pd += 2; + } + + if (data_len & 0x1) + init_val = rte_hash_crc_1byte(*(const uint8_t *)pd, init_val); + + return init_val; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_HASH_CRC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_hash/rte_hash_version.map b/src/spdk/dpdk/lib/librte_hash/rte_hash_version.map new file mode 100644 index 00000000..e216ac8e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_hash_version.map @@ -0,0 +1,55 @@ +DPDK_2.0 { + global: + + rte_fbk_hash_create; + rte_fbk_hash_find_existing; + rte_fbk_hash_free; + rte_hash_add_key; + rte_hash_add_key_with_hash; + rte_hash_create; + rte_hash_del_key; + rte_hash_del_key_with_hash; + rte_hash_find_existing; + rte_hash_free; + rte_hash_hash; + rte_hash_lookup; + rte_hash_lookup_bulk; + rte_hash_lookup_with_hash; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_hash_add_key_data; + rte_hash_add_key_with_hash_data; + rte_hash_iterate; + rte_hash_lookup_bulk_data; + rte_hash_lookup_data; + rte_hash_lookup_with_hash_data; + rte_hash_reset; + +} DPDK_2.0; + +DPDK_2.2 { + global: + + rte_hash_set_cmp_func; + +} DPDK_2.1; + +DPDK_16.07 { + global: + + rte_hash_get_key_with_position; + +} DPDK_2.2; + + +DPDK_18.08 { + global: + + rte_hash_count; + +} DPDK_16.07; diff --git a/src/spdk/dpdk/lib/librte_hash/rte_jhash.h b/src/spdk/dpdk/lib/librte_hash/rte_jhash.h new file mode 100644 index 00000000..42c45685 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_jhash.h @@ -0,0 +1,414 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_JHASH_H +#define _RTE_JHASH_H + +/** + * @file + * + * jhash functions. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include +#include +#include + +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. + * + * These are functions for producing 32-bit hashes for hash table lookup. + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() + * are externally useful functions. Routines to test the hash are included + * if SELF_TEST is defined. You can use this free for any purpose. It's in + * the public domain. It has no warranty. + * + * $FreeBSD$ + */ + +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k)))) + +/** @internal Internal function. NOTE: Arguments are modified. */ +#define __rte_jhash_mix(a, b, c) do { \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c, 16); c += b; \ + b -= a; b ^= rot(a, 19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} while (0) + +#define __rte_jhash_final(a, b, c) do { \ + c ^= b; c -= rot(b, 14); \ + a ^= c; a -= rot(c, 11); \ + b ^= a; b -= rot(a, 25); \ + c ^= b; c -= rot(b, 16); \ + a ^= c; a -= rot(c, 4); \ + b ^= a; b -= rot(a, 14); \ + c ^= b; c -= rot(b, 24); \ +} while (0) + +/** The golden ratio: an arbitrary value. */ +#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +#define BIT_SHIFT(x, y, k) (((x) >> (k)) | ((uint64_t)(y) << (32-(k)))) +#else +#define BIT_SHIFT(x, y, k) (((uint64_t)(x) << (k)) | ((y) >> (32-(k)))) +#endif + +#define LOWER8b_MASK rte_le_to_cpu_32(0xff) +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff) +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff) + +static inline void +__rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, + uint32_t *pb, unsigned check_align) +{ + uint32_t a, b, c; + + /* Set up the internal state */ + a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + *pc; + c += *pb; + + /* + * Check key alignment. 
For x86 architecture, first case is always optimal + * If check_align is not set, first case will be used + */ +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686) || defined(RTE_ARCH_X86_X32) + const uint32_t *k = (const uint32_t *)key; + const uint32_t s = 0; +#else + const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3); + const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT; +#endif + if (!check_align || s == 0) { + while (length > 12) { + a += k[0]; + b += k[1]; + c += k[2]; + + __rte_jhash_mix(a, b, c); + + k += 3; + length -= 12; + } + + switch (length) { + case 12: + c += k[2]; b += k[1]; a += k[0]; break; + case 11: + c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break; + case 10: + c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break; + case 9: + c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break; + case 8: + b += k[1]; a += k[0]; break; + case 7: + b += k[1] & LOWER24b_MASK; a += k[0]; break; + case 6: + b += k[1] & LOWER16b_MASK; a += k[0]; break; + case 5: + b += k[1] & LOWER8b_MASK; a += k[0]; break; + case 4: + a += k[0]; break; + case 3: + a += k[0] & LOWER24b_MASK; break; + case 2: + a += k[0] & LOWER16b_MASK; break; + case 1: + a += k[0] & LOWER8b_MASK; break; + /* zero length strings require no mixing */ + case 0: + *pc = c; + *pb = b; + return; + }; + } else { + /* all but the last block: affect some 32 bits of (a, b, c) */ + while (length > 12) { + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s); + __rte_jhash_mix(a, b, c); + + k += 3; + length -= 12; + } + + /* last block: affect all 32 bits of (c) */ + switch (length) { + case 12: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s); + break; + case 11: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER24b_MASK; + break; + case 10: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER16b_MASK; + break; + case 9: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + c += BIT_SHIFT(k[2], k[3], s) & LOWER8b_MASK; + break; + case 8: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s); + break; + case 7: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER24b_MASK; + break; + case 6: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER16b_MASK; + break; + case 5: + a += BIT_SHIFT(k[0], k[1], s); + b += BIT_SHIFT(k[1], k[2], s) & LOWER8b_MASK; + break; + case 4: + a += BIT_SHIFT(k[0], k[1], s); + break; + case 3: + a += BIT_SHIFT(k[0], k[1], s) & LOWER24b_MASK; + break; + case 2: + a += BIT_SHIFT(k[0], k[1], s) & LOWER16b_MASK; + break; + case 1: + a += BIT_SHIFT(k[0], k[1], s) & LOWER8b_MASK; + break; + /* zero length strings require no mixing */ + case 0: + *pc = c; + *pb = b; + return; + } + } + + __rte_jhash_final(a, b, c); + + *pc = c; + *pb = b; +} + +/** + * Same as rte_jhash, but takes two seeds and return two uint32_ts. + * pc and pb must be non-null, and *pc and *pb must both be initialized + * with seeds. If you pass in (*pb)=0, the output (*pc) will be + * the same as the return value from rte_jhash. + * + * @param key + * Key to calculate hash of. + * @param length + * Length of key in bytes. + * @param pc + * IN: seed OUT: primary hash value. + * @param pb + * IN: second seed OUT: secondary hash value. 
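+ *
+ * Illustrative sketch (the key contents and seeds are arbitrary assumptions
+ * of this example):
+ *
+ * @code
+ *     uint8_t key[16] = { 0 };
+ *     uint32_t h1 = 0xdeadbeef;   // seed in, primary hash out
+ *     uint32_t h2 = 0;            // second seed in, secondary hash out
+ *     rte_jhash_2hashes(key, sizeof(key), &h1, &h2);
+ *     // with the second seed set to 0, h1 equals
+ *     // rte_jhash(key, sizeof(key), 0xdeadbeef)
+ * @endcode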
+ */ +static inline void +rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, uint32_t *pb) +{ + __rte_jhash_2hashes(key, length, pc, pb, 1); +} + +/** + * Same as rte_jhash_32b, but takes two seeds and return two uint32_ts. + * pc and pb must be non-null, and *pc and *pb must both be initialized + * with seeds. If you pass in (*pb)=0, the output (*pc) will be + * the same as the return value from rte_jhash_32b. + * + * @param k + * Key to calculate hash of. + * @param length + * Length of key in units of 4 bytes. + * @param pc + * IN: seed OUT: primary hash value. + * @param pb + * IN: second seed OUT: secondary hash value. + */ +static inline void +rte_jhash_32b_2hashes(const uint32_t *k, uint32_t length, uint32_t *pc, uint32_t *pb) +{ + __rte_jhash_2hashes((const void *) k, (length << 2), pc, pb, 0); +} + +/** + * The most generic version, hashes an arbitrary sequence + * of bytes. No alignment or length assumptions are made about + * the input key. For keys not aligned to four byte boundaries + * or a multiple of four bytes in length, the memory region + * just after may be read (but not used in the computation). + * This may cross a page boundary. + * + * @param key + * Key to calculate hash of. + * @param length + * Length of key in bytes. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash(const void *key, uint32_t length, uint32_t initval) +{ + uint32_t initval2 = 0; + + rte_jhash_2hashes(key, length, &initval, &initval2); + + return initval; +} + +/** + * A special optimized version that handles 1 or more of uint32_ts. + * The length parameter here is the number of uint32_ts in the key. + * + * @param k + * Key to calculate hash of. + * @param length + * Length of key in units of 4 bytes. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_32b(const uint32_t *k, uint32_t length, uint32_t initval) +{ + uint32_t initval2 = 0; + + rte_jhash_32b_2hashes(k, length, &initval, &initval2); + + return initval; +} + +static inline uint32_t +__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval) +{ + a += RTE_JHASH_GOLDEN_RATIO + initval; + b += RTE_JHASH_GOLDEN_RATIO + initval; + c += RTE_JHASH_GOLDEN_RATIO + initval; + + __rte_jhash_final(a, b, c); + + return c; +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 3 words. + * + * @param a + * First word to calculate hash of. + * @param b + * Second word to calculate hash of. + * @param c + * Third word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval) +{ + return __rte_jhash_3words(a + 12, b + 12, c + 12, initval); +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 2 words. + * + * @param a + * First word to calculate hash of. + * @param b + * Second word to calculate hash of. + * @param initval + * Initialising value of hash. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval) +{ + return __rte_jhash_3words(a + 8, b + 8, 8, initval); +} + +/** + * A special ultra-optimized versions that knows it is hashing exactly + * 1 word. + * + * @param a + * Word to calculate hash of. + * @param initval + * Initialising value of hash. 
+ * @return + * Calculated hash value. + */ +static inline uint32_t +rte_jhash_1word(uint32_t a, uint32_t initval) +{ + return __rte_jhash_3words(a + 4, 4, 4, initval); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_JHASH_H */ diff --git a/src/spdk/dpdk/lib/librte_hash/rte_thash.h b/src/spdk/dpdk/lib/librte_hash/rte_thash.h new file mode 100644 index 00000000..a6ddb7bf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_hash/rte_thash.h @@ -0,0 +1,257 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2015 Vladimir Medvedkin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_THASH_H +#define _RTE_THASH_H + +/** + * @file + * + * toeplitz hash functions. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Software implementation of the Toeplitz hash function used by RSS. 
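+ *
+ * A minimal usage sketch (illustrative only: the tuple values and the
+ * 40-byte RSS key are placeholders; rte_softrss(), struct rte_ipv4_tuple
+ * and RTE_THASH_V4_L4_LEN are defined further down in this file):
+ *
+ * @code
+ *     static const uint8_t rss_key[40] = { 0x6d, 0x5a };  // substitute a real key
+ *     struct rte_ipv4_tuple tuple;
+ *     tuple.src_addr = 0x0a000001;   // 10.0.0.1, CPU byte order
+ *     tuple.dst_addr = 0x0a000002;   // 10.0.0.2
+ *     tuple.sport = 1024;
+ *     tuple.dport = 80;
+ *     uint32_t hash = rte_softrss((uint32_t *)&tuple,
+ *             RTE_THASH_V4_L4_LEN, rss_key);
+ *     (void)hash;
+ * @endcode
+ *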
+ * Can be used either for packet distribution on single queue NIC + * or for simulating of RSS computation on specific NIC (for example + * after GRE header decapsulating) + */ + +#include +#include +#include +#include +#include + +#if defined(RTE_ARCH_X86) || defined(RTE_MACHINE_CPUFLAG_NEON) +#include +#endif + +#ifdef RTE_ARCH_X86 +/* Byte swap mask used for converting IPv6 address + * 4-byte chunks to CPU byte order + */ +static const __m128i rte_thash_ipv6_bswap_mask = { + 0x0405060700010203ULL, 0x0C0D0E0F08090A0BULL}; +#endif + +/** + * length in dwords of input tuple to + * calculate hash of ipv4 header only + */ +#define RTE_THASH_V4_L3_LEN ((sizeof(struct rte_ipv4_tuple) - \ + sizeof(((struct rte_ipv4_tuple *)0)->sctp_tag)) / 4) + +/** + * length in dwords of input tuple to + * calculate hash of ipv4 header + + * transport header + */ +#define RTE_THASH_V4_L4_LEN ((sizeof(struct rte_ipv4_tuple)) / 4) + +/** + * length in dwords of input tuple to + * calculate hash of ipv6 header only + */ +#define RTE_THASH_V6_L3_LEN ((sizeof(struct rte_ipv6_tuple) - \ + sizeof(((struct rte_ipv6_tuple *)0)->sctp_tag)) / 4) + +/** + * length in dwords of input tuple to + * calculate hash of ipv6 header + + * transport header + */ +#define RTE_THASH_V6_L4_LEN ((sizeof(struct rte_ipv6_tuple)) / 4) + +/** + * IPv4 tuple + * addresses and ports/sctp_tag have to be CPU byte order + */ +struct rte_ipv4_tuple { + uint32_t src_addr; + uint32_t dst_addr; + RTE_STD_C11 + union { + struct { + uint16_t dport; + uint16_t sport; + }; + uint32_t sctp_tag; + }; +}; + +/** + * IPv6 tuple + * Addresses have to be filled by rte_thash_load_v6_addr() + * ports/sctp_tag have to be CPU byte order + */ +struct rte_ipv6_tuple { + uint8_t src_addr[16]; + uint8_t dst_addr[16]; + RTE_STD_C11 + union { + struct { + uint16_t dport; + uint16_t sport; + }; + uint32_t sctp_tag; + }; +}; + +union rte_thash_tuple { + struct rte_ipv4_tuple v4; + struct rte_ipv6_tuple v6; +#ifdef RTE_ARCH_X86 +} __attribute__((aligned(XMM_SIZE))); +#else +}; +#endif + +/** + * Prepare special converted key to use with rte_softrss_be() + * @param orig + * pointer to original RSS key + * @param targ + * pointer to target RSS key + * @param len + * RSS key length + */ +static inline void +rte_convert_rss_key(const uint32_t *orig, uint32_t *targ, int len) +{ + int i; + + for (i = 0; i < (len >> 2); i++) + targ[i] = rte_be_to_cpu_32(orig[i]); +} + +/** + * Prepare and load IPv6 addresses (src and dst) + * into target tuple + * @param orig + * Pointer to ipv6 header of the original packet + * @param targ + * Pointer to rte_ipv6_tuple structure + */ +static inline void +rte_thash_load_v6_addrs(const struct ipv6_hdr *orig, union rte_thash_tuple *targ) +{ +#ifdef RTE_ARCH_X86 + __m128i ipv6 = _mm_loadu_si128((const __m128i *)orig->src_addr); + *(__m128i *)targ->v6.src_addr = + _mm_shuffle_epi8(ipv6, rte_thash_ipv6_bswap_mask); + ipv6 = _mm_loadu_si128((const __m128i *)orig->dst_addr); + *(__m128i *)targ->v6.dst_addr = + _mm_shuffle_epi8(ipv6, rte_thash_ipv6_bswap_mask); +#elif defined(RTE_MACHINE_CPUFLAG_NEON) + uint8x16_t ipv6 = vld1q_u8((uint8_t const *)orig->src_addr); + vst1q_u8((uint8_t *)targ->v6.src_addr, vrev32q_u8(ipv6)); + ipv6 = vld1q_u8((uint8_t const *)orig->dst_addr); + vst1q_u8((uint8_t *)targ->v6.dst_addr, vrev32q_u8(ipv6)); +#else + int i; + for (i = 0; i < 4; i++) { + *((uint32_t *)targ->v6.src_addr + i) = + rte_be_to_cpu_32(*((const uint32_t *)orig->src_addr + i)); + *((uint32_t *)targ->v6.dst_addr + i) = + rte_be_to_cpu_32(*((const 
uint32_t *)orig->dst_addr + i)); + } +#endif +} + +/** + * Generic implementation. Can be used with original rss_key + * @param input_tuple + * Pointer to input tuple + * @param input_len + * Length of input_tuple in 4-bytes chunks + * @param rss_key + * Pointer to RSS hash key. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_softrss(uint32_t *input_tuple, uint32_t input_len, + const uint8_t *rss_key) +{ + uint32_t i, j, map, ret = 0; + + for (j = 0; j < input_len; j++) { + for (map = input_tuple[j]; map; map &= (map - 1)) { + i = rte_bsf32(map); + ret ^= rte_cpu_to_be_32(((const uint32_t *)rss_key)[j]) << (31 - i) | + (uint32_t)((uint64_t)(rte_cpu_to_be_32(((const uint32_t *)rss_key)[j + 1])) >> + (i + 1)); + } + } + return ret; +} + +/** + * Optimized implementation. + * If you want the calculated hash value matches NIC RSS value + * you have to use special converted key with rte_convert_rss_key() fn. + * @param input_tuple + * Pointer to input tuple + * @param input_len + * Length of input_tuple in 4-bytes chunks + * @param *rss_key + * Pointer to RSS hash key. + * @return + * Calculated hash value. + */ +static inline uint32_t +rte_softrss_be(uint32_t *input_tuple, uint32_t input_len, + const uint8_t *rss_key) +{ + uint32_t i, j, map, ret = 0; + + for (j = 0; j < input_len; j++) { + for (map = input_tuple[j]; map; map &= (map - 1)) { + i = rte_bsf32(map); + ret ^= ((const uint32_t *)rss_key)[j] << (31 - i) | + (uint32_t)((uint64_t)(((const uint32_t *)rss_key)[j + 1]) >> (i + 1)); + } + } + return ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_THASH_H */ diff --git a/src/spdk/dpdk/lib/librte_ip_frag/Makefile b/src/spdk/dpdk/lib/librte_ip_frag/Makefile new file mode 100644 index 00000000..4c3dc4d3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/Makefile @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_ip_frag.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev +LDLIBS += -lrte_hash + +EXPORT_MAP := rte_ip_frag_version.map + +LIBABIVER := 1 + +#source files +SRCS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += rte_ipv4_fragmentation.c +SRCS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += rte_ipv6_fragmentation.c +SRCS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += rte_ipv4_reassembly.c +SRCS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += rte_ipv6_reassembly.c +SRCS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += rte_ip_frag_common.c +SRCS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += ip_frag_internal.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_IP_FRAG)-include += rte_ip_frag.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_ip_frag/ip_frag_common.h b/src/spdk/dpdk/lib/librte_ip_frag/ip_frag_common.h new file mode 100644 index 00000000..197acf8d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/ip_frag_common.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _IP_FRAG_COMMON_H_ +#define _IP_FRAG_COMMON_H_ + +#include "rte_ip_frag.h" + +/* logging macros. */ +#ifdef RTE_LIBRTE_IP_FRAG_DEBUG +#define IP_FRAG_LOG(lvl, fmt, args...) RTE_LOG(lvl, USER1, fmt, ##args) +#else +#define IP_FRAG_LOG(lvl, fmt, args...) 
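A rough usage sketch of the two soft-RSS flavours defined in rte_thash.h above (not taken from the patch itself): fill an rte_ipv4_tuple in CPU byte order and hash it with rte_softrss() against the original NIC key, or convert the key once with rte_convert_rss_key() and use the cheaper rte_softrss_be() on the fast path. The 40-byte key below is a placeholder; a real application would use the key actually programmed into the NIC.

#include <stdint.h>
#include <rte_thash.h>

static uint8_t rss_key[40];          /* key as programmed into the NIC */
static uint8_t rss_key_be[40];       /* converted copy for rte_softrss_be() */

static uint32_t
softrss_of_tcp4(uint32_t sip, uint32_t dip, uint16_t sport, uint16_t dport)
{
        union rte_thash_tuple tuple;

        /* All fields must be in CPU byte order. */
        tuple.v4.src_addr = sip;
        tuple.v4.dst_addr = dip;
        tuple.v4.sport = sport;
        tuple.v4.dport = dport;

        return rte_softrss((uint32_t *)&tuple, RTE_THASH_V4_L4_LEN, rss_key);
}

static void
prepare_be_key(void)
{
        /* One-time conversion; afterwards rte_softrss_be(..., rss_key_be)
         * yields the same value as rte_softrss(..., rss_key). */
        rte_convert_rss_key((const uint32_t *)rss_key,
                        (uint32_t *)rss_key_be, sizeof(rss_key));
}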
do {} while(0) +#endif /* IP_FRAG_DEBUG */ + +#define IPV4_KEYLEN 1 +#define IPV6_KEYLEN 4 + +/* helper macros */ +#define IP_FRAG_MBUF2DR(dr, mb) ((dr)->row[(dr)->cnt++] = (mb)) + +#define IPv6_KEY_BYTES(key) \ + (key)[0], (key)[1], (key)[2], (key)[3] +#define IPv6_KEY_BYTES_FMT \ + "%08" PRIx64 "%08" PRIx64 "%08" PRIx64 "%08" PRIx64 + +/* internal functions declarations */ +struct rte_mbuf * ip_frag_process(struct ip_frag_pkt *fp, + struct rte_ip_frag_death_row *dr, struct rte_mbuf *mb, + uint16_t ofs, uint16_t len, uint16_t more_frags); + +struct ip_frag_pkt * ip_frag_find(struct rte_ip_frag_tbl *tbl, + struct rte_ip_frag_death_row *dr, + const struct ip_frag_key *key, uint64_t tms); + +struct ip_frag_pkt * ip_frag_lookup(struct rte_ip_frag_tbl *tbl, + const struct ip_frag_key *key, uint64_t tms, + struct ip_frag_pkt **free, struct ip_frag_pkt **stale); + +/* these functions need to be declared here as ip_frag_process relies on them */ +struct rte_mbuf *ipv4_frag_reassemble(struct ip_frag_pkt *fp); +struct rte_mbuf *ipv6_frag_reassemble(struct ip_frag_pkt *fp); + + + +/* + * misc frag key functions + */ + +/* check if key is empty */ +static inline int +ip_frag_key_is_empty(const struct ip_frag_key * key) +{ + uint32_t i; + for (i = 0; i < RTE_MIN(key->key_len, RTE_DIM(key->src_dst)); i++) + if (key->src_dst[i] != 0) + return 0; + return 1; +} + +/* empty the key */ +static inline void +ip_frag_key_invalidate(struct ip_frag_key * key) +{ + uint32_t i; + for (i = 0; i < key->key_len; i++) + key->src_dst[i] = 0; +} + +/* compare two keys */ +static inline int +ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2) +{ + uint32_t i, val; + val = k1->id ^ k2->id; + for (i = 0; i < k1->key_len; i++) + val |= k1->src_dst[i] ^ k2->src_dst[i]; + return val; +} + +/* + * misc fragment functions + */ + +/* put fragment on death row */ +static inline void +ip_frag_free(struct ip_frag_pkt *fp, struct rte_ip_frag_death_row *dr) +{ + uint32_t i, k; + + k = dr->cnt; + for (i = 0; i != fp->last_idx; i++) { + if (fp->frags[i].mb != NULL) { + dr->row[k++] = fp->frags[i].mb; + fp->frags[i].mb = NULL; + } + } + + fp->last_idx = 0; + dr->cnt = k; +} + +/* delete fragment's mbufs immediately instead of using death row */ +static inline void +ip_frag_free_immediate(struct ip_frag_pkt *fp) +{ + uint32_t i; + + for (i = 0; i < fp->last_idx; i++) { + if (fp->frags[i].mb != NULL) { + IP_FRAG_LOG(DEBUG, "%s:%d\n" + "mbuf: %p, tms: %" PRIu64", key: <%" PRIx64 ", %#x>\n", + __func__, __LINE__, fp->frags[i].mb, fp->start, + fp->key.src_dst[0], fp->key.id); + rte_pktmbuf_free(fp->frags[i].mb); + fp->frags[i].mb = NULL; + } + } + + fp->last_idx = 0; +} + +/* if key is empty, mark key as in use */ +static inline void +ip_frag_inuse(struct rte_ip_frag_tbl *tbl, const struct ip_frag_pkt *fp) +{ + if (ip_frag_key_is_empty(&fp->key)) { + TAILQ_REMOVE(&tbl->lru, fp, lru); + tbl->use_entries--; + } +} + +/* reset the fragment */ +static inline void +ip_frag_reset(struct ip_frag_pkt *fp, uint64_t tms) +{ + static const struct ip_frag zero_frag = { + .ofs = 0, + .len = 0, + .mb = NULL, + }; + + fp->start = tms; + fp->total_size = UINT32_MAX; + fp->frag_size = 0; + fp->last_idx = IP_MIN_FRAG_NUM; + fp->frags[IP_LAST_FRAG_IDX] = zero_frag; + fp->frags[IP_FIRST_FRAG_IDX] = zero_frag; +} + +#endif /* _IP_FRAG_COMMON_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ip_frag/ip_frag_internal.c b/src/spdk/dpdk/lib/librte_ip_frag/ip_frag_internal.c new file mode 100644 index 00000000..2560c771 --- /dev/null +++ 
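The key helpers in ip_frag_common.h are deliberately branch-light: ip_frag_key_cmp() ORs together the XOR of every word and returns 0 only on an exact match, and an all-zero src_dst marks a free slot for ip_frag_key_is_empty()/ip_frag_key_invalidate(). A small sketch of that contract, with made-up values; it uses the internal header, so it would only build inside the library tree:

#include <assert.h>
#include <string.h>
#include "ip_frag_common.h"

static void
key_contract_demo(void)
{
        struct ip_frag_key a, b;

        memset(&a, 0, sizeof(a));
        a.key_len = IPV4_KEYLEN;                /* one 64-bit word: src + dst */
        a.src_dst[0] = 0x0a0000010a000002ULL;   /* arbitrary example addresses */
        a.id = 0x1234;
        b = a;

        assert(ip_frag_key_cmp(&a, &b) == 0);   /* equal keys -> 0 */
        b.id ^= 1;
        assert(ip_frag_key_cmp(&a, &b) != 0);   /* any difference -> nonzero */

        ip_frag_key_invalidate(&a);             /* zero the src/dst words */
        assert(ip_frag_key_is_empty(&a));       /* slot now reads as free */
}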
b/src/spdk/dpdk/lib/librte_ip_frag/ip_frag_internal.c @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include + +#include +#include + +#include "ip_frag_common.h" + +#define PRIME_VALUE 0xeaad8405 + +#define IP_FRAG_TBL_POS(tbl, sig) \ + ((tbl)->pkt + ((sig) & (tbl)->entry_mask)) + +#ifdef RTE_LIBRTE_IP_FRAG_TBL_STAT +#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) ((s)->f += (v)) +#else +#define IP_FRAG_TBL_STAT_UPDATE(s, f, v) do {} while (0) +#endif /* IP_FRAG_TBL_STAT */ + +/* local frag table helper functions */ +static inline void +ip_frag_tbl_del(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr, + struct ip_frag_pkt *fp) +{ + ip_frag_free(fp, dr); + ip_frag_key_invalidate(&fp->key); + TAILQ_REMOVE(&tbl->lru, fp, lru); + tbl->use_entries--; + IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, del_num, 1); +} + +static inline void +ip_frag_tbl_add(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *fp, + const struct ip_frag_key *key, uint64_t tms) +{ + fp->key = key[0]; + ip_frag_reset(fp, tms); + TAILQ_INSERT_TAIL(&tbl->lru, fp, lru); + tbl->use_entries++; + IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, add_num, 1); +} + +static inline void +ip_frag_tbl_reuse(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr, + struct ip_frag_pkt *fp, uint64_t tms) +{ + ip_frag_free(fp, dr); + ip_frag_reset(fp, tms); + TAILQ_REMOVE(&tbl->lru, fp, lru); + TAILQ_INSERT_TAIL(&tbl->lru, fp, lru); + IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, reuse_num, 1); +} + + +static inline void +ipv4_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2) +{ + uint32_t v; + const uint32_t *p; + + p = (const uint32_t *)&key->src_dst; + +#ifdef RTE_ARCH_X86 + v = rte_hash_crc_4byte(p[0], PRIME_VALUE); + v = rte_hash_crc_4byte(p[1], v); + v = rte_hash_crc_4byte(key->id, v); +#else + + v = rte_jhash_3words(p[0], p[1], key->id, PRIME_VALUE); +#endif /* RTE_ARCH_X86 */ + + *v1 = v; + *v2 = (v << 7) + (v >> 14); +} + +static inline void +ipv6_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2) +{ + uint32_t v; + const uint32_t *p; + + p = (const uint32_t *) &key->src_dst; + +#ifdef RTE_ARCH_X86 + v = rte_hash_crc_4byte(p[0], PRIME_VALUE); + v = rte_hash_crc_4byte(p[1], v); + v = rte_hash_crc_4byte(p[2], v); + v = rte_hash_crc_4byte(p[3], v); + v = rte_hash_crc_4byte(p[4], v); + v = rte_hash_crc_4byte(p[5], v); + v = rte_hash_crc_4byte(p[6], v); + v = rte_hash_crc_4byte(p[7], v); + v = rte_hash_crc_4byte(key->id, v); +#else + + v = rte_jhash_3words(p[0], p[1], p[2], PRIME_VALUE); + v = rte_jhash_3words(p[3], p[4], p[5], v); + v = rte_jhash_3words(p[6], p[7], key->id, v); +#endif /* RTE_ARCH_X86 */ + + *v1 = v; + *v2 = (v << 7) + (v >> 14); +} + +struct rte_mbuf * +ip_frag_process(struct ip_frag_pkt *fp, struct rte_ip_frag_death_row *dr, + struct rte_mbuf *mb, uint16_t ofs, uint16_t len, uint16_t more_frags) +{ + uint32_t idx; + + fp->frag_size += len; + + /* this is the first fragment. */ + if (ofs == 0) { + idx = (fp->frags[IP_FIRST_FRAG_IDX].mb == NULL) ? + IP_FIRST_FRAG_IDX : UINT32_MAX; + + /* this is the last fragment. */ + } else if (more_frags == 0) { + fp->total_size = ofs + len; + idx = (fp->frags[IP_LAST_FRAG_IDX].mb == NULL) ? + IP_LAST_FRAG_IDX : UINT32_MAX; + + /* this is the intermediate fragment. */ + } else if ((idx = fp->last_idx) < + sizeof (fp->frags) / sizeof (fp->frags[0])) { + fp->last_idx++; + } + + /* + * erroneous packet: either exceed max allowed number of fragments, + * or duplicate first/last fragment encountered. 
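One way to picture the double-hash scheme used by ipv4_frag_hash()/ipv6_frag_hash() and IP_FRAG_TBL_POS() (illustrative only, mirroring the arithmetic rather than calling the internal functions): a single CRC/jhash value v yields two signatures, and each signature selects one bucket of bucket_entries slots, so every key has two candidate buckets.

#include <stdint.h>

/* Mirror of the table placement arithmetic, for illustration only. */
static void
two_bucket_positions(uint32_t v, uint32_t entry_mask,
                uint32_t *pos1, uint32_t *pos2)
{
        uint32_t sig1 = v;
        uint32_t sig2 = (v << 7) + (v >> 14);

        /* entry_mask has the low bucket bits cleared, so the result is
         * the index of the first slot in each candidate bucket. */
        *pos1 = sig1 & entry_mask;
        *pos2 = sig2 & entry_mask;
}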
+ */ + if (idx >= sizeof (fp->frags) / sizeof (fp->frags[0])) { + + /* report an error. */ + if (fp->key.key_len == IPV4_KEYLEN) + IP_FRAG_LOG(DEBUG, "%s:%d invalid fragmented packet:\n" + "ipv4_frag_pkt: %p, key: <%" PRIx64 ", %#x>, " + "total_size: %u, frag_size: %u, last_idx: %u\n" + "first fragment: ofs: %u, len: %u\n" + "last fragment: ofs: %u, len: %u\n\n", + __func__, __LINE__, + fp, fp->key.src_dst[0], fp->key.id, + fp->total_size, fp->frag_size, fp->last_idx, + fp->frags[IP_FIRST_FRAG_IDX].ofs, + fp->frags[IP_FIRST_FRAG_IDX].len, + fp->frags[IP_LAST_FRAG_IDX].ofs, + fp->frags[IP_LAST_FRAG_IDX].len); + else + IP_FRAG_LOG(DEBUG, "%s:%d invalid fragmented packet:\n" + "ipv6_frag_pkt: %p, key: <" IPv6_KEY_BYTES_FMT ", %#x>, " + "total_size: %u, frag_size: %u, last_idx: %u\n" + "first fragment: ofs: %u, len: %u\n" + "last fragment: ofs: %u, len: %u\n\n", + __func__, __LINE__, + fp, IPv6_KEY_BYTES(fp->key.src_dst), fp->key.id, + fp->total_size, fp->frag_size, fp->last_idx, + fp->frags[IP_FIRST_FRAG_IDX].ofs, + fp->frags[IP_FIRST_FRAG_IDX].len, + fp->frags[IP_LAST_FRAG_IDX].ofs, + fp->frags[IP_LAST_FRAG_IDX].len); + + /* free all fragments, invalidate the entry. */ + ip_frag_free(fp, dr); + ip_frag_key_invalidate(&fp->key); + IP_FRAG_MBUF2DR(dr, mb); + + return NULL; + } + + fp->frags[idx].ofs = ofs; + fp->frags[idx].len = len; + fp->frags[idx].mb = mb; + + mb = NULL; + + /* not all fragments are collected yet. */ + if (likely (fp->frag_size < fp->total_size)) { + return mb; + + /* if we collected all fragments, then try to reassemble. */ + } else if (fp->frag_size == fp->total_size && + fp->frags[IP_FIRST_FRAG_IDX].mb != NULL) { + if (fp->key.key_len == IPV4_KEYLEN) + mb = ipv4_frag_reassemble(fp); + else + mb = ipv6_frag_reassemble(fp); + } + + /* errorenous set of fragments. */ + if (mb == NULL) { + + /* report an error. */ + if (fp->key.key_len == IPV4_KEYLEN) + IP_FRAG_LOG(DEBUG, "%s:%d invalid fragmented packet:\n" + "ipv4_frag_pkt: %p, key: <%" PRIx64 ", %#x>, " + "total_size: %u, frag_size: %u, last_idx: %u\n" + "first fragment: ofs: %u, len: %u\n" + "last fragment: ofs: %u, len: %u\n\n", + __func__, __LINE__, + fp, fp->key.src_dst[0], fp->key.id, + fp->total_size, fp->frag_size, fp->last_idx, + fp->frags[IP_FIRST_FRAG_IDX].ofs, + fp->frags[IP_FIRST_FRAG_IDX].len, + fp->frags[IP_LAST_FRAG_IDX].ofs, + fp->frags[IP_LAST_FRAG_IDX].len); + else + IP_FRAG_LOG(DEBUG, "%s:%d invalid fragmented packet:\n" + "ipv6_frag_pkt: %p, key: <" IPv6_KEY_BYTES_FMT ", %#x>, " + "total_size: %u, frag_size: %u, last_idx: %u\n" + "first fragment: ofs: %u, len: %u\n" + "last fragment: ofs: %u, len: %u\n\n", + __func__, __LINE__, + fp, IPv6_KEY_BYTES(fp->key.src_dst), fp->key.id, + fp->total_size, fp->frag_size, fp->last_idx, + fp->frags[IP_FIRST_FRAG_IDX].ofs, + fp->frags[IP_FIRST_FRAG_IDX].len, + fp->frags[IP_LAST_FRAG_IDX].ofs, + fp->frags[IP_LAST_FRAG_IDX].len); + + /* free associated resources. */ + ip_frag_free(fp, dr); + } + + /* we are done with that entry, invalidate it. */ + ip_frag_key_invalidate(&fp->key); + return mb; +} + + +/* + * Find an entry in the table for the corresponding fragment. + * If such entry is not present, then allocate a new one. + * If the entry is stale, then free and reuse it. + */ +struct ip_frag_pkt * +ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr, + const struct ip_frag_key *key, uint64_t tms) +{ + struct ip_frag_pkt *pkt, *free, *stale, *lru; + uint64_t max_cycles; + + /* + * Actually the two line below are totally redundant. 
+ * they are here, just to make gcc 4.6 happy. + */ + free = NULL; + stale = NULL; + max_cycles = tbl->max_cycles; + + IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, find_num, 1); + + if ((pkt = ip_frag_lookup(tbl, key, tms, &free, &stale)) == NULL) { + + /*timed-out entry, free and invalidate it*/ + if (stale != NULL) { + ip_frag_tbl_del(tbl, dr, stale); + free = stale; + + /* + * we found a free entry, check if we can use it. + * If we run out of free entries in the table, then + * check if we have a timed out entry to delete. + */ + } else if (free != NULL && + tbl->max_entries <= tbl->use_entries) { + lru = TAILQ_FIRST(&tbl->lru); + if (max_cycles + lru->start < tms) { + ip_frag_tbl_del(tbl, dr, lru); + } else { + free = NULL; + IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, + fail_nospace, 1); + } + } + + /* found a free entry to reuse. */ + if (free != NULL) { + ip_frag_tbl_add(tbl, free, key, tms); + pkt = free; + } + + /* + * we found the flow, but it is already timed out, + * so free associated resources, reposition it in the LRU list, + * and reuse it. + */ + } else if (max_cycles + pkt->start < tms) { + ip_frag_tbl_reuse(tbl, dr, pkt, tms); + } + + IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, fail_total, (pkt == NULL)); + + tbl->last = pkt; + return pkt; +} + +struct ip_frag_pkt * +ip_frag_lookup(struct rte_ip_frag_tbl *tbl, + const struct ip_frag_key *key, uint64_t tms, + struct ip_frag_pkt **free, struct ip_frag_pkt **stale) +{ + struct ip_frag_pkt *p1, *p2; + struct ip_frag_pkt *empty, *old; + uint64_t max_cycles; + uint32_t i, assoc, sig1, sig2; + + empty = NULL; + old = NULL; + + max_cycles = tbl->max_cycles; + assoc = tbl->bucket_entries; + + if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0) + return tbl->last; + + /* different hashing methods for IPv4 and IPv6 */ + if (key->key_len == IPV4_KEYLEN) + ipv4_frag_hash(key, &sig1, &sig2); + else + ipv6_frag_hash(key, &sig1, &sig2); + + p1 = IP_FRAG_TBL_POS(tbl, sig1); + p2 = IP_FRAG_TBL_POS(tbl, sig2); + + for (i = 0; i != assoc; i++) { + if (p1->key.key_len == IPV4_KEYLEN) + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "tbl: %p, max_entries: %u, use_entries: %u\n" + "ipv4_frag_pkt line0: %p, index: %u from %u\n" + "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n", + __func__, __LINE__, + tbl, tbl->max_entries, tbl->use_entries, + p1, i, assoc, + p1[i].key.src_dst[0], p1[i].key.id, p1[i].start); + else + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "tbl: %p, max_entries: %u, use_entries: %u\n" + "ipv6_frag_pkt line0: %p, index: %u from %u\n" + "key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n", + __func__, __LINE__, + tbl, tbl->max_entries, tbl->use_entries, + p1, i, assoc, + IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start); + + if (ip_frag_key_cmp(key, &p1[i].key) == 0) + return p1 + i; + else if (ip_frag_key_is_empty(&p1[i].key)) + empty = (empty == NULL) ? (p1 + i) : empty; + else if (max_cycles + p1[i].start < tms) + old = (old == NULL) ? 
(p1 + i) : old; + + if (p2->key.key_len == IPV4_KEYLEN) + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "tbl: %p, max_entries: %u, use_entries: %u\n" + "ipv4_frag_pkt line1: %p, index: %u from %u\n" + "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n", + __func__, __LINE__, + tbl, tbl->max_entries, tbl->use_entries, + p2, i, assoc, + p2[i].key.src_dst[0], p2[i].key.id, p2[i].start); + else + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "tbl: %p, max_entries: %u, use_entries: %u\n" + "ipv6_frag_pkt line1: %p, index: %u from %u\n" + "key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n", + __func__, __LINE__, + tbl, tbl->max_entries, tbl->use_entries, + p2, i, assoc, + IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start); + + if (ip_frag_key_cmp(key, &p2[i].key) == 0) + return p2 + i; + else if (ip_frag_key_is_empty(&p2[i].key)) + empty = (empty == NULL) ?( p2 + i) : empty; + else if (max_cycles + p2[i].start < tms) + old = (old == NULL) ? (p2 + i) : old; + } + + *free = empty; + *stale = old; + return NULL; +} diff --git a/src/spdk/dpdk/lib/librte_ip_frag/meson.build b/src/spdk/dpdk/lib/librte_ip_frag/meson.build new file mode 100644 index 00000000..c5b9a459 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/meson.build @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_ipv4_fragmentation.c', + 'rte_ipv6_fragmentation.c', + 'rte_ipv4_reassembly.c', + 'rte_ipv6_reassembly.c', + 'rte_ip_frag_common.c', + 'ip_frag_internal.c') +headers = files('rte_ip_frag.h') +deps += ['ethdev', 'hash'] diff --git a/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag.h b/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag.h new file mode 100644 index 00000000..b3f3f78d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag.h @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_IP_FRAG_H_ +#define _RTE_IP_FRAG_H_ + +/** + * @file + * RTE IP Fragmentation and Reassembly + * + * Implementation of IP packet fragmentation and reassembly. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include +#include +#include +#include +#include + +struct rte_mbuf; + +enum { + IP_LAST_FRAG_IDX, /**< index of last fragment */ + IP_FIRST_FRAG_IDX, /**< index of first fragment */ + IP_MIN_FRAG_NUM, /**< minimum number of fragments */ + IP_MAX_FRAG_NUM = RTE_LIBRTE_IP_FRAG_MAX_FRAG, + /**< maximum number of fragments per packet */ +}; + +/** @internal fragmented mbuf */ +struct ip_frag { + uint16_t ofs; /**< offset into the packet */ + uint16_t len; /**< length of fragment */ + struct rte_mbuf *mb; /**< fragment mbuf */ +}; + +/** @internal to uniquely identify fragmented datagram. */ +struct ip_frag_key { + uint64_t src_dst[4]; /**< src address, first 8 bytes used for IPv4 */ + uint32_t id; /**< dst address */ + uint32_t key_len; /**< src/dst key length */ +}; + +/** + * @internal Fragmented packet to reassemble. + * First two entries in the frags[] array are for the last and first fragments. 
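The lookup and eviction logic above treats an entry as stale when max_cycles + entry->start < now, with all three values in TSC cycles. A hedged sketch of one common way a caller might derive max_cycles from a wall-clock TTL before creating the table (the 2000 ms figure is an arbitrary example, not a recommendation):

#include <stdint.h>
#include <rte_cycles.h>

/* Convert a reassembly TTL in milliseconds into the cycle count that
 * the fragmentation table and its stale-entry test expect. */
static uint64_t
frag_ttl_cycles(uint64_t ttl_ms)
{
        return (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S * ttl_ms;
}

/* Example: uint64_t max_cycles = frag_ttl_cycles(2000);
 * fragments older than ~2 seconds get evicted or reused. */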
+ */ +struct ip_frag_pkt { + TAILQ_ENTRY(ip_frag_pkt) lru; /**< LRU list */ + struct ip_frag_key key; /**< fragmentation key */ + uint64_t start; /**< creation timestamp */ + uint32_t total_size; /**< expected reassembled size */ + uint32_t frag_size; /**< size of fragments received */ + uint32_t last_idx; /**< index of next entry to fill */ + struct ip_frag frags[IP_MAX_FRAG_NUM]; /**< fragments */ +} __rte_cache_aligned; + +#define IP_FRAG_DEATH_ROW_LEN 32 /**< death row size (in packets) */ + +/** mbuf death row (packets to be freed) */ +struct rte_ip_frag_death_row { + uint32_t cnt; /**< number of mbufs currently on death row */ + struct rte_mbuf *row[IP_FRAG_DEATH_ROW_LEN * (IP_MAX_FRAG_NUM + 1)]; + /**< mbufs to be freed */ +}; + +TAILQ_HEAD(ip_pkt_list, ip_frag_pkt); /**< @internal fragments tailq */ + +/** fragmentation table statistics */ +struct ip_frag_tbl_stat { + uint64_t find_num; /**< total # of find/insert attempts. */ + uint64_t add_num; /**< # of add ops. */ + uint64_t del_num; /**< # of del ops. */ + uint64_t reuse_num; /**< # of reuse (del/add) ops. */ + uint64_t fail_total; /**< total # of add failures. */ + uint64_t fail_nospace; /**< # of 'no space' add failures. */ +} __rte_cache_aligned; + +/** fragmentation table */ +struct rte_ip_frag_tbl { + uint64_t max_cycles; /**< ttl for table entries. */ + uint32_t entry_mask; /**< hash value mask. */ + uint32_t max_entries; /**< max entries allowed. */ + uint32_t use_entries; /**< entries in use. */ + uint32_t bucket_entries; /**< hash associativity. */ + uint32_t nb_entries; /**< total size of the table. */ + uint32_t nb_buckets; /**< num of associativity lines. */ + struct ip_frag_pkt *last; /**< last used entry. */ + struct ip_pkt_list lru; /**< LRU list for table entries. */ + struct ip_frag_tbl_stat stat; /**< statistics counters. */ + __extension__ struct ip_frag_pkt pkt[0]; /**< hash table. */ +}; + +/** IPv6 fragment extension header */ +#define RTE_IPV6_EHDR_MF_SHIFT 0 +#define RTE_IPV6_EHDR_MF_MASK 1 +#define RTE_IPV6_EHDR_FO_SHIFT 3 +#define RTE_IPV6_EHDR_FO_MASK (~((1 << RTE_IPV6_EHDR_FO_SHIFT) - 1)) + +#define RTE_IPV6_FRAG_USED_MASK \ + (RTE_IPV6_EHDR_MF_MASK | RTE_IPV6_EHDR_FO_MASK) + +#define RTE_IPV6_GET_MF(x) ((x) & RTE_IPV6_EHDR_MF_MASK) +#define RTE_IPV6_GET_FO(x) ((x) >> RTE_IPV6_EHDR_FO_SHIFT) + +#define RTE_IPV6_SET_FRAG_DATA(fo, mf) \ + (((fo) & RTE_IPV6_EHDR_FO_MASK) | ((mf) & RTE_IPV6_EHDR_MF_MASK)) + +struct ipv6_extension_fragment { + uint8_t next_header; /**< Next header type */ + uint8_t reserved; /**< Reserved */ + uint16_t frag_data; /**< All fragmentation data */ + uint32_t id; /**< Packet ID */ +} __attribute__((__packed__)); + + + +/** + * Create a new IP fragmentation table. + * + * @param bucket_num + * Number of buckets in the hash table. + * @param bucket_entries + * Number of entries per bucket (e.g. hash associativity). + * Should be power of two. + * @param max_entries + * Maximum number of entries that could be stored in the table. + * The value should be less or equal then bucket_num * bucket_entries. + * @param max_cycles + * Maximum TTL in cycles for each fragmented packet. + * @param socket_id + * The *socket_id* argument is the socket identifier in the case of + * NUMA. The value can be *SOCKET_ID_ANY* if there is no NUMA constraints. + * @return + * The pointer to the new allocated fragmentation table, on success. NULL on error. 
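A small worked example of the IPv6 frag_data packing macros above (values chosen arbitrarily): a byte offset of 1480 with "more fragments" set packs into a single 16-bit host-order field, from which the offset in 8-byte units and the MF bit can be recovered.

#include <stdint.h>
#include <assert.h>
#include <rte_ip_frag.h>

static void
frag_data_demo(void)
{
        /* Byte offset 1480 (must be 8-byte aligned), more-fragments = 1. */
        uint16_t fd = RTE_IPV6_SET_FRAG_DATA(1480, 1);

        assert(RTE_IPV6_GET_MF(fd) == 1);
        assert(RTE_IPV6_GET_FO(fd) == 1480 / 8); /* offset in 8-byte units */

        /* On the wire the field is big-endian; the fragmentation code
         * byte-swaps it when building the extension header. */
}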
+ */ +struct rte_ip_frag_tbl * rte_ip_frag_table_create(uint32_t bucket_num, + uint32_t bucket_entries, uint32_t max_entries, + uint64_t max_cycles, int socket_id); + +/** + * Free allocated IP fragmentation table. + * + * @param tbl + * Fragmentation table to free. + */ +void +rte_ip_frag_table_destroy(struct rte_ip_frag_tbl *tbl); + +/** + * This function implements the fragmentation of IPv6 packets. + * + * @param pkt_in + * The input packet. + * @param pkts_out + * Array storing the output fragments. + * @param nb_pkts_out + * Number of fragments. + * @param mtu_size + * Size in bytes of the Maximum Transfer Unit (MTU) for the outgoing IPv6 + * datagrams. This value includes the size of the IPv6 header. + * @param pool_direct + * MBUF pool used for allocating direct buffers for the output fragments. + * @param pool_indirect + * MBUF pool used for allocating indirect buffers for the output fragments. + * @return + * Upon successful completion - number of output fragments placed + * in the pkts_out array. + * Otherwise - (-1) * errno. + */ +int32_t +rte_ipv6_fragment_packet(struct rte_mbuf *pkt_in, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out, + uint16_t mtu_size, + struct rte_mempool *pool_direct, + struct rte_mempool *pool_indirect); + +/** + * This function implements reassembly of fragmented IPv6 packets. + * Incoming mbuf should have its l2_len/l3_len fields setup correctly. + * + * @param tbl + * Table where to lookup/add the fragmented packet. + * @param dr + * Death row to free buffers to + * @param mb + * Incoming mbuf with IPv6 fragment. + * @param tms + * Fragment arrival timestamp. + * @param ip_hdr + * Pointer to the IPv6 header. + * @param frag_hdr + * Pointer to the IPv6 fragment extension header. + * @return + * Pointer to mbuf for reassembled packet, or NULL if: + * - an error occurred. + * - not all fragments of the packet are collected yet. + */ +struct rte_mbuf *rte_ipv6_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl, + struct rte_ip_frag_death_row *dr, + struct rte_mbuf *mb, uint64_t tms, struct ipv6_hdr *ip_hdr, + struct ipv6_extension_fragment *frag_hdr); + +/** + * Return a pointer to the packet's fragment header, if found. + * It only looks at the extension header that's right after the fixed IPv6 + * header, and doesn't follow the whole chain of extension headers. + * + * @param hdr + * Pointer to the IPv6 header. + * @return + * Pointer to the IPv6 fragment extension header, or NULL if it's not + * present. + */ +static inline struct ipv6_extension_fragment * +rte_ipv6_frag_get_ipv6_fragment_header(struct ipv6_hdr *hdr) +{ + if (hdr->proto == IPPROTO_FRAGMENT) { + return (struct ipv6_extension_fragment *) ++hdr; + } + else + return NULL; +} + +/** + * IPv4 fragmentation. + * + * This function implements the fragmentation of IPv4 packets. + * + * @param pkt_in + * The input packet. + * @param pkts_out + * Array storing the output fragments. + * @param nb_pkts_out + * Number of fragments. + * @param mtu_size + * Size in bytes of the Maximum Transfer Unit (MTU) for the outgoing IPv4 + * datagrams. This value includes the size of the IPv4 header. + * @param pool_direct + * MBUF pool used for allocating direct buffers for the output fragments. + * @param pool_indirect + * MBUF pool used for allocating indirect buffers for the output fragments. + * @return + * Upon successful completion - number of output fragments placed + * in the pkts_out array. + * Otherwise - (-1) * errno. 
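A minimal setup/teardown sketch for the table API above; the bucket counts and the roughly 2-second TTL are illustrative placeholders, not recommendations.

#include <stdio.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_ip_frag.h>

static struct rte_ip_frag_tbl *
frag_tbl_setup(void)
{
        uint32_t bucket_num = 4096;
        uint32_t bucket_entries = 16;               /* must be a power of two */
        uint32_t max_entries = bucket_num * bucket_entries;
        uint64_t ttl_cycles = 2 * rte_get_tsc_hz(); /* ~2 seconds in cycles */

        return rte_ip_frag_table_create(bucket_num, bucket_entries,
                        max_entries, ttl_cycles, rte_socket_id());
}

static void
frag_tbl_teardown(struct rte_ip_frag_tbl *tbl)
{
        if (tbl != NULL) {
                rte_ip_frag_table_statistics_dump(stdout, tbl);
                rte_ip_frag_table_destroy(tbl);
        }
}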
+ */ +int32_t rte_ipv4_fragment_packet(struct rte_mbuf *pkt_in, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out, uint16_t mtu_size, + struct rte_mempool *pool_direct, + struct rte_mempool *pool_indirect); + +/** + * This function implements reassembly of fragmented IPv4 packets. + * Incoming mbufs should have its l2_len/l3_len fields setup correclty. + * + * @param tbl + * Table where to lookup/add the fragmented packet. + * @param dr + * Death row to free buffers to + * @param mb + * Incoming mbuf with IPv4 fragment. + * @param tms + * Fragment arrival timestamp. + * @param ip_hdr + * Pointer to the IPV4 header inside the fragment. + * @return + * Pointer to mbuf for reassembled packet, or NULL if: + * - an error occurred. + * - not all fragments of the packet are collected yet. + */ +struct rte_mbuf * rte_ipv4_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl, + struct rte_ip_frag_death_row *dr, + struct rte_mbuf *mb, uint64_t tms, struct ipv4_hdr *ip_hdr); + +/** + * Check if the IPv4 packet is fragmented + * + * @param hdr + * IPv4 header of the packet + * @return + * 1 if fragmented, 0 if not fragmented + */ +static inline int +rte_ipv4_frag_pkt_is_fragmented(const struct ipv4_hdr * hdr) { + uint16_t flag_offset, ip_flag, ip_ofs; + + flag_offset = rte_be_to_cpu_16(hdr->fragment_offset); + ip_ofs = (uint16_t)(flag_offset & IPV4_HDR_OFFSET_MASK); + ip_flag = (uint16_t)(flag_offset & IPV4_HDR_MF_FLAG); + + return ip_flag != 0 || ip_ofs != 0; +} + +/** + * Free mbufs on a given death row. + * + * @param dr + * Death row to free mbufs in. + * @param prefetch + * How many buffers to prefetch before freeing. + */ +void rte_ip_frag_free_death_row(struct rte_ip_frag_death_row *dr, + uint32_t prefetch); + + +/** + * Dump fragmentation table statistics to file. + * + * @param f + * File to dump statistics to + * @param tbl + * Fragmentation table to dump statistics from + */ +void +rte_ip_frag_table_statistics_dump(FILE * f, const struct rte_ip_frag_tbl *tbl); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IP_FRAG_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag_common.c b/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag_common.c new file mode 100644 index 00000000..659a1795 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag_common.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include + +#include "ip_frag_common.h" + +#define IP_FRAG_HASH_FNUM 2 + +/* free mbufs from death row */ +void +rte_ip_frag_free_death_row(struct rte_ip_frag_death_row *dr, + uint32_t prefetch) +{ + uint32_t i, k, n; + + k = RTE_MIN(prefetch, dr->cnt); + n = dr->cnt; + + for (i = 0; i != k; i++) + rte_prefetch0(dr->row[i]); + + for (i = 0; i != n - k; i++) { + rte_prefetch0(dr->row[i + k]); + rte_pktmbuf_free(dr->row[i]); + } + + for (; i != n; i++) + rte_pktmbuf_free(dr->row[i]); + + dr->cnt = 0; +} + +/* create fragmentation table */ +struct rte_ip_frag_tbl * +rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries, + uint32_t max_entries, uint64_t max_cycles, int socket_id) +{ + struct rte_ip_frag_tbl *tbl; + size_t sz; + uint64_t nb_entries; + + nb_entries = rte_align32pow2(bucket_num); + nb_entries *= bucket_entries; + nb_entries *= IP_FRAG_HASH_FNUM; + + /* check input parameters. 
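On the transmit side, a hedged sketch of driving rte_ipv4_fragment_packet(); the MTU, fragment bound, and pool handles are placeholders. It assumes the input mbuf's data already starts at the IPv4 header (L2 header stripped), and since the fragments are indirect mbufs holding their own references, the caller drops its reference to the original packet afterwards.

#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_ip_frag.h>

#define FRAG_MTU  1500           /* placeholder MTU, includes IPv4 header */
#define MAX_FRAGS 4              /* placeholder upper bound on fragments  */

/* m must already point at the IPv4 header. Returns the number of
 * fragments written to frags[], or a negative errno. */
static int
tx_fragment_ipv4(struct rte_mbuf *m, struct rte_mbuf *frags[MAX_FRAGS],
                struct rte_mempool *direct_pool,
                struct rte_mempool *indirect_pool)
{
        int32_t n = rte_ipv4_fragment_packet(m, frags, MAX_FRAGS, FRAG_MTU,
                        direct_pool, indirect_pool);
        if (n < 0)
                return (int)n;   /* -ENOTSUP (DF set), -EINVAL, -ENOMEM */

        /* The fragments keep their own references to the original data;
         * drop our reference to the input packet. */
        rte_pktmbuf_free(m);
        return (int)n;
}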
*/ + if (rte_is_power_of_2(bucket_entries) == 0 || + nb_entries > UINT32_MAX || nb_entries == 0 || + nb_entries < max_entries) { + RTE_LOG(ERR, USER1, "%s: invalid input parameter\n", __func__); + return NULL; + } + + sz = sizeof (*tbl) + nb_entries * sizeof (tbl->pkt[0]); + if ((tbl = rte_zmalloc_socket(__func__, sz, RTE_CACHE_LINE_SIZE, + socket_id)) == NULL) { + RTE_LOG(ERR, USER1, + "%s: allocation of %zu bytes at socket %d failed do\n", + __func__, sz, socket_id); + return NULL; + } + + RTE_LOG(INFO, USER1, "%s: allocated of %zu bytes at socket %d\n", + __func__, sz, socket_id); + + tbl->max_cycles = max_cycles; + tbl->max_entries = max_entries; + tbl->nb_entries = (uint32_t)nb_entries; + tbl->nb_buckets = bucket_num; + tbl->bucket_entries = bucket_entries; + tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries - 1); + + TAILQ_INIT(&(tbl->lru)); + return tbl; +} + +/* delete fragmentation table */ +void +rte_ip_frag_table_destroy(struct rte_ip_frag_tbl *tbl) +{ + struct ip_frag_pkt *fp; + + TAILQ_FOREACH(fp, &tbl->lru, lru) { + ip_frag_free_immediate(fp); + } + + rte_free(tbl); +} + +/* dump frag table statistics to file */ +void +rte_ip_frag_table_statistics_dump(FILE *f, const struct rte_ip_frag_tbl *tbl) +{ + uint64_t fail_total, fail_nospace; + + fail_total = tbl->stat.fail_total; + fail_nospace = tbl->stat.fail_nospace; + + fprintf(f, "max entries:\t%u;\n" + "entries in use:\t%u;\n" + "finds/inserts:\t%" PRIu64 ";\n" + "entries added:\t%" PRIu64 ";\n" + "entries deleted by timeout:\t%" PRIu64 ";\n" + "entries reused by timeout:\t%" PRIu64 ";\n" + "total add failures:\t%" PRIu64 ";\n" + "add no-space failures:\t%" PRIu64 ";\n" + "add hash-collisions failures:\t%" PRIu64 ";\n", + tbl->max_entries, + tbl->use_entries, + tbl->stat.find_num, + tbl->stat.add_num, + tbl->stat.del_num, + tbl->stat.reuse_num, + fail_total, + fail_nospace, + fail_total - fail_nospace); +} diff --git a/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag_version.map b/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag_version.map new file mode 100644 index 00000000..d1acf07c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/rte_ip_frag_version.map @@ -0,0 +1,20 @@ +DPDK_2.0 { + global: + + rte_ip_frag_free_death_row; + rte_ip_frag_table_create; + rte_ip_frag_table_statistics_dump; + rte_ipv4_frag_reassemble_packet; + rte_ipv4_fragment_packet; + rte_ipv6_frag_reassemble_packet; + rte_ipv6_fragment_packet; + + local: *; +}; + +DPDK_17.08 { + global: + + rte_ip_frag_table_destroy; + +} DPDK_2.0; diff --git a/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv4_fragmentation.c b/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv4_fragmentation.c new file mode 100644 index 00000000..a96fb03e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv4_fragmentation.c @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include + +#include "ip_frag_common.h" + +/* Fragment Offset */ +#define IPV4_HDR_DF_SHIFT 14 +#define IPV4_HDR_MF_SHIFT 13 +#define IPV4_HDR_FO_SHIFT 3 + +#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT) +#define IPV4_HDR_MF_MASK (1 << IPV4_HDR_MF_SHIFT) + +#define IPV4_HDR_FO_ALIGN (1 << IPV4_HDR_FO_SHIFT) + +static inline void __fill_ipv4hdr_frag(struct ipv4_hdr *dst, + const struct ipv4_hdr *src, uint16_t len, uint16_t fofs, + uint16_t dofs, uint32_t mf) +{ + rte_memcpy(dst, src, sizeof(*dst)); + fofs = (uint16_t)(fofs + (dofs >> IPV4_HDR_FO_SHIFT)); + fofs = (uint16_t)(fofs | mf << IPV4_HDR_MF_SHIFT); + 
dst->fragment_offset = rte_cpu_to_be_16(fofs); + dst->total_length = rte_cpu_to_be_16(len); + dst->hdr_checksum = 0; +} + +static inline void __free_fragments(struct rte_mbuf *mb[], uint32_t num) +{ + uint32_t i; + for (i = 0; i != num; i++) + rte_pktmbuf_free(mb[i]); +} + +/** + * IPv4 fragmentation. + * + * This function implements the fragmentation of IPv4 packets. + * + * @param pkt_in + * The input packet. + * @param pkts_out + * Array storing the output fragments. + * @param mtu_size + * Size in bytes of the Maximum Transfer Unit (MTU) for the outgoing IPv4 + * datagrams. This value includes the size of the IPv4 header. + * @param pool_direct + * MBUF pool used for allocating direct buffers for the output fragments. + * @param pool_indirect + * MBUF pool used for allocating indirect buffers for the output fragments. + * @return + * Upon successful completion - number of output fragments placed + * in the pkts_out array. + * Otherwise - (-1) * . + */ +int32_t +rte_ipv4_fragment_packet(struct rte_mbuf *pkt_in, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out, + uint16_t mtu_size, + struct rte_mempool *pool_direct, + struct rte_mempool *pool_indirect) +{ + struct rte_mbuf *in_seg = NULL; + struct ipv4_hdr *in_hdr; + uint32_t out_pkt_pos, in_seg_data_pos; + uint32_t more_in_segs; + uint16_t fragment_offset, flag_offset, frag_size; + uint16_t frag_bytes_remaining; + + /* + * Ensure the IP payload length of all fragments is aligned to a + * multiple of 8 bytes as per RFC791 section 2.3. + */ + frag_size = RTE_ALIGN_FLOOR((mtu_size - sizeof(struct ipv4_hdr)), + IPV4_HDR_FO_ALIGN); + + in_hdr = rte_pktmbuf_mtod(pkt_in, struct ipv4_hdr *); + flag_offset = rte_cpu_to_be_16(in_hdr->fragment_offset); + + /* If Don't Fragment flag is set */ + if (unlikely ((flag_offset & IPV4_HDR_DF_MASK) != 0)) + return -ENOTSUP; + + /* Check that pkts_out is big enough to hold all fragments */ + if (unlikely(frag_size * nb_pkts_out < + (uint16_t)(pkt_in->pkt_len - sizeof (struct ipv4_hdr)))) + return -EINVAL; + + in_seg = pkt_in; + in_seg_data_pos = sizeof(struct ipv4_hdr); + out_pkt_pos = 0; + fragment_offset = 0; + + more_in_segs = 1; + while (likely(more_in_segs)) { + struct rte_mbuf *out_pkt = NULL, *out_seg_prev = NULL; + uint32_t more_out_segs; + struct ipv4_hdr *out_hdr; + + /* Allocate direct buffer */ + out_pkt = rte_pktmbuf_alloc(pool_direct); + if (unlikely(out_pkt == NULL)) { + __free_fragments(pkts_out, out_pkt_pos); + return -ENOMEM; + } + + /* Reserve space for the IP header that will be built later */ + out_pkt->data_len = sizeof(struct ipv4_hdr); + out_pkt->pkt_len = sizeof(struct ipv4_hdr); + frag_bytes_remaining = frag_size; + + out_seg_prev = out_pkt; + more_out_segs = 1; + while (likely(more_out_segs && more_in_segs)) { + struct rte_mbuf *out_seg = NULL; + uint32_t len; + + /* Allocate indirect buffer */ + out_seg = rte_pktmbuf_alloc(pool_indirect); + if (unlikely(out_seg == NULL)) { + rte_pktmbuf_free(out_pkt); + __free_fragments(pkts_out, out_pkt_pos); + return -ENOMEM; + } + out_seg_prev->next = out_seg; + out_seg_prev = out_seg; + + /* Prepare indirect buffer */ + rte_pktmbuf_attach(out_seg, in_seg); + len = frag_bytes_remaining; + if (len > (in_seg->data_len - in_seg_data_pos)) { + len = in_seg->data_len - in_seg_data_pos; + } + out_seg->data_off = in_seg->data_off + in_seg_data_pos; + out_seg->data_len = (uint16_t)len; + out_pkt->pkt_len = (uint16_t)(len + + out_pkt->pkt_len); + out_pkt->nb_segs += 1; + in_seg_data_pos += len; + frag_bytes_remaining -= len; + + /* Current output 
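A quick sanity check of the sizing logic in rte_ipv4_fragment_packet() (numbers are only an example): with an MTU of 1500 the per-fragment payload is floor((1500 - 20) / 8) * 8 = 1480 bytes, so a 4000-byte payload needs 3 output slots, and the frag_size * nb_pkts_out test rejects calls that pass fewer. The helper below just restates that arithmetic; the 8-byte alignment constant is re-defined locally since the library keeps it private.

#include <stdint.h>
#include <rte_common.h>
#include <rte_ip.h>

#define FRAG_ALIGN 8   /* RFC 791: fragment offsets are in 8-byte units */

/* Estimate how many pkts_out slots rte_ipv4_fragment_packet() needs for
 * a given L3 payload size (IPv4 header excluded). Illustrative only. */
static uint16_t
ipv4_frags_needed(uint32_t payload_len, uint16_t mtu_size)
{
        uint32_t frag_size = RTE_ALIGN_FLOOR(
                        mtu_size - sizeof(struct ipv4_hdr), FRAG_ALIGN);

        /* e.g. MTU 1500 -> frag_size 1480; 4000-byte payload -> 3 slots */
        return (uint16_t)((payload_len + frag_size - 1) / frag_size);
}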
packet (i.e. fragment) done ? */ + if (unlikely(frag_bytes_remaining == 0)) + more_out_segs = 0; + + /* Current input segment done ? */ + if (unlikely(in_seg_data_pos == in_seg->data_len)) { + in_seg = in_seg->next; + in_seg_data_pos = 0; + + if (unlikely(in_seg == NULL)) + more_in_segs = 0; + } + } + + /* Build the IP header */ + + out_hdr = rte_pktmbuf_mtod(out_pkt, struct ipv4_hdr *); + + __fill_ipv4hdr_frag(out_hdr, in_hdr, + (uint16_t)out_pkt->pkt_len, + flag_offset, fragment_offset, more_in_segs); + + fragment_offset = (uint16_t)(fragment_offset + + out_pkt->pkt_len - sizeof(struct ipv4_hdr)); + + out_pkt->ol_flags |= PKT_TX_IP_CKSUM; + out_pkt->l3_len = sizeof(struct ipv4_hdr); + + /* Write the fragment to the output list */ + pkts_out[out_pkt_pos] = out_pkt; + out_pkt_pos ++; + } + + return out_pkt_pos; +} diff --git a/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv4_reassembly.c b/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv4_reassembly.c new file mode 100644 index 00000000..4956b99e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv4_reassembly.c @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include + +#include + +#include "ip_frag_common.h" + +/* + * Reassemble fragments into one packet. + */ +struct rte_mbuf * +ipv4_frag_reassemble(struct ip_frag_pkt *fp) +{ + struct ipv4_hdr *ip_hdr; + struct rte_mbuf *m, *prev; + uint32_t i, n, ofs, first_len; + uint32_t curr_idx = 0; + + first_len = fp->frags[IP_FIRST_FRAG_IDX].len; + n = fp->last_idx - 1; + + /*start from the last fragment. */ + m = fp->frags[IP_LAST_FRAG_IDX].mb; + ofs = fp->frags[IP_LAST_FRAG_IDX].ofs; + curr_idx = IP_LAST_FRAG_IDX; + + while (ofs != first_len) { + + prev = m; + + for (i = n; i != IP_FIRST_FRAG_IDX && ofs != first_len; i--) { + + /* previous fragment found. */ + if(fp->frags[i].ofs + fp->frags[i].len == ofs) { + + /* adjust start of the last fragment data. */ + rte_pktmbuf_adj(m, (uint16_t)(m->l2_len + m->l3_len)); + rte_pktmbuf_chain(fp->frags[i].mb, m); + + /* this mbuf should not be accessed directly */ + fp->frags[curr_idx].mb = NULL; + curr_idx = i; + + /* update our last fragment and offset. */ + m = fp->frags[i].mb; + ofs = fp->frags[i].ofs; + } + } + + /* error - hole in the packet. */ + if (m == prev) { + return NULL; + } + } + + /* chain with the first fragment. */ + rte_pktmbuf_adj(m, (uint16_t)(m->l2_len + m->l3_len)); + rte_pktmbuf_chain(fp->frags[IP_FIRST_FRAG_IDX].mb, m); + fp->frags[curr_idx].mb = NULL; + m = fp->frags[IP_FIRST_FRAG_IDX].mb; + fp->frags[IP_FIRST_FRAG_IDX].mb = NULL; + + /* update mbuf fields for reassembled packet. */ + m->ol_flags |= PKT_TX_IP_CKSUM; + + /* update ipv4 header for the reassembled packet */ + ip_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len); + + ip_hdr->total_length = rte_cpu_to_be_16((uint16_t)(fp->total_size + + m->l3_len)); + ip_hdr->fragment_offset = (uint16_t)(ip_hdr->fragment_offset & + rte_cpu_to_be_16(IPV4_HDR_DF_FLAG)); + ip_hdr->hdr_checksum = 0; + + return m; +} + +/* + * Process new mbuf with fragment of IPV4 packet. + * Incoming mbuf should have it's l2_len/l3_len fields setuped correclty. + * @param tbl + * Table where to lookup/add the fragmented packet. + * @param mb + * Incoming mbuf with IPV4 fragment. + * @param tms + * Fragment arrival timestamp. + * @param ip_hdr + * Pointer to the IPV4 header inside the fragment. + * @return + * Pointer to mbuf for reassembled packet, or NULL if: + * - an error occurred. 
+ * - not all fragments of the packet are collected yet. + */ +struct rte_mbuf * +rte_ipv4_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl, + struct rte_ip_frag_death_row *dr, struct rte_mbuf *mb, uint64_t tms, + struct ipv4_hdr *ip_hdr) +{ + struct ip_frag_pkt *fp; + struct ip_frag_key key; + const unaligned_uint64_t *psd; + uint16_t ip_len; + uint16_t flag_offset, ip_ofs, ip_flag; + + flag_offset = rte_be_to_cpu_16(ip_hdr->fragment_offset); + ip_ofs = (uint16_t)(flag_offset & IPV4_HDR_OFFSET_MASK); + ip_flag = (uint16_t)(flag_offset & IPV4_HDR_MF_FLAG); + + psd = (unaligned_uint64_t *)&ip_hdr->src_addr; + /* use first 8 bytes only */ + key.src_dst[0] = psd[0]; + key.id = ip_hdr->packet_id; + key.key_len = IPV4_KEYLEN; + + ip_ofs *= IPV4_HDR_OFFSET_UNITS; + ip_len = (uint16_t)(rte_be_to_cpu_16(ip_hdr->total_length) - + mb->l3_len); + + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "mbuf: %p, tms: %" PRIu64 + ", key: <%" PRIx64 ", %#x>, ofs: %u, len: %u, flags: %#x\n" + "tbl: %p, max_cycles: %" PRIu64 ", entry_mask: %#x, " + "max_entries: %u, use_entries: %u\n\n", + __func__, __LINE__, + mb, tms, key.src_dst[0], key.id, ip_ofs, ip_len, ip_flag, + tbl, tbl->max_cycles, tbl->entry_mask, tbl->max_entries, + tbl->use_entries); + + /* try to find/add entry into the fragment's table. */ + if ((fp = ip_frag_find(tbl, dr, &key, tms)) == NULL) { + IP_FRAG_MBUF2DR(dr, mb); + return NULL; + } + + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "tbl: %p, max_entries: %u, use_entries: %u\n" + "ipv4_frag_pkt: %p, key: <%" PRIx64 ", %#x>, start: %" PRIu64 + ", total_size: %u, frag_size: %u, last_idx: %u\n\n", + __func__, __LINE__, + tbl, tbl->max_entries, tbl->use_entries, + fp, fp->key.src_dst[0], fp->key.id, fp->start, + fp->total_size, fp->frag_size, fp->last_idx); + + + /* process the fragmented packet. */ + mb = ip_frag_process(fp, dr, mb, ip_ofs, ip_len, ip_flag); + ip_frag_inuse(tbl, fp); + + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "mbuf: %p\n" + "tbl: %p, max_entries: %u, use_entries: %u\n" + "ipv4_frag_pkt: %p, key: <%" PRIx64 ", %#x>, start: %" PRIu64 + ", total_size: %u, frag_size: %u, last_idx: %u\n\n", + __func__, __LINE__, mb, + tbl, tbl->max_entries, tbl->use_entries, + fp, fp->key.src_dst[0], fp->key.id, fp->start, + fp->total_size, fp->frag_size, fp->last_idx); + + return mb; +} diff --git a/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv6_fragmentation.c b/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv6_fragmentation.c new file mode 100644 index 00000000..62a7e4e8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv6_fragmentation.c @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include + +#include "ip_frag_common.h" + +/** + * @file + * RTE IPv6 Fragmentation + * + * Implementation of IPv6 fragmentation. + * + */ + +static inline void +__fill_ipv6hdr_frag(struct ipv6_hdr *dst, + const struct ipv6_hdr *src, uint16_t len, uint16_t fofs, + uint32_t mf) +{ + struct ipv6_extension_fragment *fh; + + rte_memcpy(dst, src, sizeof(*dst)); + dst->payload_len = rte_cpu_to_be_16(len); + dst->proto = IPPROTO_FRAGMENT; + + fh = (struct ipv6_extension_fragment *) ++dst; + fh->next_header = src->proto; + fh->reserved = 0; + fh->frag_data = rte_cpu_to_be_16(RTE_IPV6_SET_FRAG_DATA(fofs, mf)); + fh->id = 0; +} + +static inline void +__free_fragments(struct rte_mbuf *mb[], uint32_t num) +{ + uint32_t i; + for (i = 0; i < num; i++) + rte_pktmbuf_free(mb[i]); +} + +/** + * IPv6 fragmentation. + * + * This function implements the fragmentation of IPv6 packets. 
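Putting the IPv4 receive pieces together, a minimal per-packet sketch (assuming l2_len/l3_len are already set on the mbuf and tbl/dr were created elsewhere): non-fragments pass through untouched, and NULL means the datagram is still incomplete or was dropped, exactly as documented above. Flushing the death row here is done per packet for brevity; doing it once per burst is equally valid.

#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_cycles.h>
#include <rte_ip_frag.h>

/* Returns the reassembled packet, the original packet if it was not
 * fragmented, or NULL while fragments are still outstanding. */
static struct rte_mbuf *
rx_reassemble_ipv4(struct rte_ip_frag_tbl *tbl,
                struct rte_ip_frag_death_row *dr, struct rte_mbuf *m)
{
        struct ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(m,
                        struct ipv4_hdr *, m->l2_len);

        if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr))
                return m;

        m = rte_ipv4_frag_reassemble_packet(tbl, dr, m, rte_rdtsc(), ip_hdr);

        /* Free whatever the table decided to drop (duplicates, errors,
         * evicted flows); 3 is an arbitrary small prefetch depth. */
        rte_ip_frag_free_death_row(dr, 3);

        return m;
}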
+ * + * @param pkt_in + * The input packet. + * @param pkts_out + * Array storing the output fragments. + * @param mtu_size + * Size in bytes of the Maximum Transfer Unit (MTU) for the outgoing IPv6 + * datagrams. This value includes the size of the IPv6 header. + * @param pool_direct + * MBUF pool used for allocating direct buffers for the output fragments. + * @param pool_indirect + * MBUF pool used for allocating indirect buffers for the output fragments. + * @return + * Upon successful completion - number of output fragments placed + * in the pkts_out array. + * Otherwise - (-1) * . + */ +int32_t +rte_ipv6_fragment_packet(struct rte_mbuf *pkt_in, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out, + uint16_t mtu_size, + struct rte_mempool *pool_direct, + struct rte_mempool *pool_indirect) +{ + struct rte_mbuf *in_seg = NULL; + struct ipv6_hdr *in_hdr; + uint32_t out_pkt_pos, in_seg_data_pos; + uint32_t more_in_segs; + uint16_t fragment_offset, frag_size; + + frag_size = (uint16_t)(mtu_size - sizeof(struct ipv6_hdr)); + + /* Fragment size should be a multiple of 8. */ + RTE_ASSERT((frag_size & ~RTE_IPV6_EHDR_FO_MASK) == 0); + + /* Check that pkts_out is big enough to hold all fragments */ + if (unlikely (frag_size * nb_pkts_out < + (uint16_t)(pkt_in->pkt_len - sizeof (struct ipv6_hdr)))) + return -EINVAL; + + in_hdr = rte_pktmbuf_mtod(pkt_in, struct ipv6_hdr *); + + in_seg = pkt_in; + in_seg_data_pos = sizeof(struct ipv6_hdr); + out_pkt_pos = 0; + fragment_offset = 0; + + more_in_segs = 1; + while (likely(more_in_segs)) { + struct rte_mbuf *out_pkt = NULL, *out_seg_prev = NULL; + uint32_t more_out_segs; + struct ipv6_hdr *out_hdr; + + /* Allocate direct buffer */ + out_pkt = rte_pktmbuf_alloc(pool_direct); + if (unlikely(out_pkt == NULL)) { + __free_fragments(pkts_out, out_pkt_pos); + return -ENOMEM; + } + + /* Reserve space for the IP header that will be built later */ + out_pkt->data_len = sizeof(struct ipv6_hdr) + sizeof(struct ipv6_extension_fragment); + out_pkt->pkt_len = sizeof(struct ipv6_hdr) + sizeof(struct ipv6_extension_fragment); + + out_seg_prev = out_pkt; + more_out_segs = 1; + while (likely(more_out_segs && more_in_segs)) { + struct rte_mbuf *out_seg = NULL; + uint32_t len; + + /* Allocate indirect buffer */ + out_seg = rte_pktmbuf_alloc(pool_indirect); + if (unlikely(out_seg == NULL)) { + rte_pktmbuf_free(out_pkt); + __free_fragments(pkts_out, out_pkt_pos); + return -ENOMEM; + } + out_seg_prev->next = out_seg; + out_seg_prev = out_seg; + + /* Prepare indirect buffer */ + rte_pktmbuf_attach(out_seg, in_seg); + len = mtu_size - out_pkt->pkt_len; + if (len > (in_seg->data_len - in_seg_data_pos)) { + len = in_seg->data_len - in_seg_data_pos; + } + out_seg->data_off = in_seg->data_off + in_seg_data_pos; + out_seg->data_len = (uint16_t)len; + out_pkt->pkt_len = (uint16_t)(len + + out_pkt->pkt_len); + out_pkt->nb_segs += 1; + in_seg_data_pos += len; + + /* Current output packet (i.e. fragment) done ? */ + if (unlikely(out_pkt->pkt_len >= mtu_size)) { + more_out_segs = 0; + } + + /* Current input segment done ? 
*/ + if (unlikely(in_seg_data_pos == in_seg->data_len)) { + in_seg = in_seg->next; + in_seg_data_pos = 0; + + if (unlikely(in_seg == NULL)) { + more_in_segs = 0; + } + } + } + + /* Build the IP header */ + + out_hdr = rte_pktmbuf_mtod(out_pkt, struct ipv6_hdr *); + + __fill_ipv6hdr_frag(out_hdr, in_hdr, + (uint16_t) out_pkt->pkt_len - sizeof(struct ipv6_hdr), + fragment_offset, more_in_segs); + + fragment_offset = (uint16_t)(fragment_offset + + out_pkt->pkt_len - sizeof(struct ipv6_hdr) + - sizeof(struct ipv6_extension_fragment)); + + /* Write the fragment to the output list */ + pkts_out[out_pkt_pos] = out_pkt; + out_pkt_pos ++; + } + + return out_pkt_pos; +} diff --git a/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv6_reassembly.c b/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv6_reassembly.c new file mode 100644 index 00000000..db249fe6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ip_frag/rte_ipv6_reassembly.c @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include + +#include + +#include "ip_frag_common.h" + +/** + * @file + * IPv6 reassemble + * + * Implementation of IPv6 reassembly. + * + */ + +static inline void +ip_frag_memmove(char *dst, char *src, int len) +{ + int i; + + /* go backwards to make sure we don't overwrite anything important */ + for (i = len - 1; i >= 0; i--) + dst[i] = src[i]; +} + +/* + * Reassemble fragments into one packet. + */ +struct rte_mbuf * +ipv6_frag_reassemble(struct ip_frag_pkt *fp) +{ + struct ipv6_hdr *ip_hdr; + struct ipv6_extension_fragment *frag_hdr; + struct rte_mbuf *m, *prev; + uint32_t i, n, ofs, first_len; + uint32_t last_len, move_len, payload_len; + uint32_t curr_idx = 0; + + first_len = fp->frags[IP_FIRST_FRAG_IDX].len; + n = fp->last_idx - 1; + + /*start from the last fragment. */ + m = fp->frags[IP_LAST_FRAG_IDX].mb; + ofs = fp->frags[IP_LAST_FRAG_IDX].ofs; + last_len = fp->frags[IP_LAST_FRAG_IDX].len; + curr_idx = IP_LAST_FRAG_IDX; + + payload_len = ofs + last_len; + + while (ofs != first_len) { + + prev = m; + + for (i = n; i != IP_FIRST_FRAG_IDX && ofs != first_len; i--) { + + /* previous fragment found. */ + if (fp->frags[i].ofs + fp->frags[i].len == ofs) { + + /* adjust start of the last fragment data. */ + rte_pktmbuf_adj(m, (uint16_t)(m->l2_len + m->l3_len)); + rte_pktmbuf_chain(fp->frags[i].mb, m); + + /* this mbuf should not be accessed directly */ + fp->frags[curr_idx].mb = NULL; + curr_idx = i; + + /* update our last fragment and offset. */ + m = fp->frags[i].mb; + ofs = fp->frags[i].ofs; + } + } + + /* error - hole in the packet. */ + if (m == prev) { + return NULL; + } + } + + /* chain with the first fragment. */ + rte_pktmbuf_adj(m, (uint16_t)(m->l2_len + m->l3_len)); + rte_pktmbuf_chain(fp->frags[IP_FIRST_FRAG_IDX].mb, m); + fp->frags[curr_idx].mb = NULL; + m = fp->frags[IP_FIRST_FRAG_IDX].mb; + fp->frags[IP_FIRST_FRAG_IDX].mb = NULL; + + /* update mbuf fields for reassembled packet. */ + m->ol_flags |= PKT_TX_IP_CKSUM; + + /* update ipv6 header for the reassembled datagram */ + ip_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, m->l2_len); + + ip_hdr->payload_len = rte_cpu_to_be_16(payload_len); + + /* + * remove fragmentation header. note that per RFC2460, we need to update + * the last non-fragmentable header with the "next header" field to contain + * type of the first fragmentable header, but we currently don't support + * other headers, so we assume there are no other headers and thus update + * the main IPv6 header instead. 
+ */ + move_len = m->l2_len + m->l3_len - sizeof(*frag_hdr); + frag_hdr = (struct ipv6_extension_fragment *) (ip_hdr + 1); + ip_hdr->proto = frag_hdr->next_header; + + ip_frag_memmove(rte_pktmbuf_mtod_offset(m, char *, sizeof(*frag_hdr)), + rte_pktmbuf_mtod(m, char*), move_len); + + rte_pktmbuf_adj(m, sizeof(*frag_hdr)); + + return m; +} + +/* + * Process new mbuf with fragment of IPV6 datagram. + * Incoming mbuf should have its l2_len/l3_len fields setup correctly. + * @param tbl + * Table where to lookup/add the fragmented packet. + * @param mb + * Incoming mbuf with IPV6 fragment. + * @param tms + * Fragment arrival timestamp. + * @param ip_hdr + * Pointer to the IPV6 header. + * @param frag_hdr + * Pointer to the IPV6 fragment extension header. + * @return + * Pointer to mbuf for reassembled packet, or NULL if: + * - an error occurred. + * - not all fragments of the packet are collected yet. + */ +#define MORE_FRAGS(x) (((x) & 0x100) >> 8) +#define FRAG_OFFSET(x) (rte_cpu_to_be_16(x) >> 3) +struct rte_mbuf * +rte_ipv6_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl, + struct rte_ip_frag_death_row *dr, struct rte_mbuf *mb, uint64_t tms, + struct ipv6_hdr *ip_hdr, struct ipv6_extension_fragment *frag_hdr) +{ + struct ip_frag_pkt *fp; + struct ip_frag_key key; + uint16_t ip_len, ip_ofs; + + rte_memcpy(&key.src_dst[0], ip_hdr->src_addr, 16); + rte_memcpy(&key.src_dst[2], ip_hdr->dst_addr, 16); + + key.id = frag_hdr->id; + key.key_len = IPV6_KEYLEN; + + ip_ofs = FRAG_OFFSET(frag_hdr->frag_data) * 8; + + /* + * as per RFC2460, payload length contains all extension headers as well. + * since we don't support anything but frag headers, this is what we remove + * from the payload len. + */ + ip_len = rte_be_to_cpu_16(ip_hdr->payload_len) - sizeof(*frag_hdr); + + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "mbuf: %p, tms: %" PRIu64 + ", key: <" IPv6_KEY_BYTES_FMT ", %#x>, ofs: %u, len: %u, flags: %#x\n" + "tbl: %p, max_cycles: %" PRIu64 ", entry_mask: %#x, " + "max_entries: %u, use_entries: %u\n\n", + __func__, __LINE__, + mb, tms, IPv6_KEY_BYTES(key.src_dst), key.id, ip_ofs, ip_len, + RTE_IPV6_GET_MF(frag_hdr->frag_data), + tbl, tbl->max_cycles, tbl->entry_mask, tbl->max_entries, + tbl->use_entries); + + /* try to find/add entry into the fragment's table. */ + fp = ip_frag_find(tbl, dr, &key, tms); + if (fp == NULL) { + IP_FRAG_MBUF2DR(dr, mb); + return NULL; + } + + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "tbl: %p, max_entries: %u, use_entries: %u\n" + "ipv6_frag_pkt: %p, key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 + ", total_size: %u, frag_size: %u, last_idx: %u\n\n", + __func__, __LINE__, + tbl, tbl->max_entries, tbl->use_entries, + fp, IPv6_KEY_BYTES(fp->key.src_dst), fp->key.id, fp->start, + fp->total_size, fp->frag_size, fp->last_idx); + + + /* process the fragmented packet. 
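The IPv6 receive path is analogous; a hedged sketch using the rte_ip_frag.h helper to locate the fragment extension header. It assumes l2_len is set, l3_len covers the fixed IPv6 header plus the fragment extension header (so the library can trim both), and the caller flushes the death row as in the IPv4 sketch.

#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_cycles.h>
#include <rte_ip_frag.h>

static struct rte_mbuf *
rx_reassemble_ipv6(struct rte_ip_frag_tbl *tbl,
                struct rte_ip_frag_death_row *dr, struct rte_mbuf *m)
{
        struct ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(m,
                        struct ipv6_hdr *, m->l2_len);
        struct ipv6_extension_fragment *frag_hdr =
                rte_ipv6_frag_get_ipv6_fragment_header(ip_hdr);

        /* No fragment header right after the fixed header: pass through. */
        if (frag_hdr == NULL)
                return m;

        return rte_ipv6_frag_reassemble_packet(tbl, dr, m, rte_rdtsc(),
                        ip_hdr, frag_hdr);
}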
*/ + mb = ip_frag_process(fp, dr, mb, ip_ofs, ip_len, + MORE_FRAGS(frag_hdr->frag_data)); + ip_frag_inuse(tbl, fp); + + IP_FRAG_LOG(DEBUG, "%s:%d:\n" + "mbuf: %p\n" + "tbl: %p, max_entries: %u, use_entries: %u\n" + "ipv6_frag_pkt: %p, key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 + ", total_size: %u, frag_size: %u, last_idx: %u\n\n", + __func__, __LINE__, mb, + tbl, tbl->max_entries, tbl->use_entries, + fp, IPv6_KEY_BYTES(fp->key.src_dst), fp->key.id, fp->start, + fp->total_size, fp->frag_size, fp->last_idx); + + return mb; +} diff --git a/src/spdk/dpdk/lib/librte_jobstats/Makefile b/src/spdk/dpdk/lib/librte_jobstats/Makefile new file mode 100644 index 00000000..35e788f7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_jobstats/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2015 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_jobstats.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_jobstats_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_JOBSTATS) := rte_jobstats.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_JOBSTATS)-include := rte_jobstats.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_jobstats/meson.build b/src/spdk/dpdk/lib/librte_jobstats/meson.build new file mode 100644 index 00000000..391e4f80 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_jobstats/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_jobstats.c') +headers = files('rte_jobstats.h') diff --git a/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.c b/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.c new file mode 100644 index 00000000..b64bc53b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.c @@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Intel Corporation + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "rte_jobstats.h" + +#define ADD_TIME_MIN_MAX(obj, type, value) do { \ + typeof(value) tmp = (value); \ + (obj)->type ## _time += tmp; \ + if (tmp < (obj)->min_ ## type ## _time) \ + (obj)->min_ ## type ## _time = tmp; \ + if (tmp > (obj)->max_ ## type ## _time) \ + (obj)->max_ ## type ## _time = tmp; \ +} while (0) + +#define RESET_TIME_MIN_MAX(obj, type) do { \ + (obj)->type ## _time = 0; \ + (obj)->min_ ## type ## _time = UINT64_MAX; \ + (obj)->max_ ## type ## _time = 0; \ +} while (0) + +static inline uint64_t +get_time(void) +{ + rte_rmb(); + return rte_get_timer_cycles(); +} + +/* Those are steps used to adjust job period. + * Experiments show that for forwarding apps the up step must be less than down + * step to achieve optimal performance. + */ +#define JOB_UPDATE_STEP_UP 1 +#define JOB_UPDATE_STEP_DOWN 4 + +/* + * Default update function that implements simple period adjustment. + */ +static void +default_update_function(struct rte_jobstats *job, int64_t result) +{ + int64_t err = job->target - result; + + /* Job is happy. 
Nothing to do */ + if (err == 0) + return; + + if (err > 0) { + if (job->period + JOB_UPDATE_STEP_UP < job->max_period) + job->period += JOB_UPDATE_STEP_UP; + } else { + if (job->min_period + JOB_UPDATE_STEP_DOWN < job->period) + job->period -= JOB_UPDATE_STEP_DOWN; + } +} + +int +rte_jobstats_context_init(struct rte_jobstats_context *ctx) +{ + if (ctx == NULL) + return -EINVAL; + + /* Init only needed parameters. Zero out everything else. */ + memset(ctx, 0, sizeof(struct rte_jobstats_context)); + + rte_jobstats_context_reset(ctx); + + return 0; +} + +void +rte_jobstats_context_start(struct rte_jobstats_context *ctx) +{ + uint64_t now; + + ctx->loop_executed_jobs = 0; + + now = get_time(); + ADD_TIME_MIN_MAX(ctx, management, now - ctx->state_time); + ctx->state_time = now; +} + +void +rte_jobstats_context_finish(struct rte_jobstats_context *ctx) +{ + uint64_t now; + + if (likely(ctx->loop_executed_jobs)) + ctx->loop_cnt++; + + now = get_time(); + ADD_TIME_MIN_MAX(ctx, management, now - ctx->state_time); + ctx->state_time = now; +} + +void +rte_jobstats_context_reset(struct rte_jobstats_context *ctx) +{ + RESET_TIME_MIN_MAX(ctx, exec); + RESET_TIME_MIN_MAX(ctx, management); + ctx->start_time = get_time(); + ctx->state_time = ctx->start_time; + ctx->job_exec_cnt = 0; + ctx->loop_cnt = 0; +} + +void +rte_jobstats_set_target(struct rte_jobstats *job, int64_t target) +{ + job->target = target; +} + +int +rte_jobstats_start(struct rte_jobstats_context *ctx, struct rte_jobstats *job) +{ + uint64_t now; + + /* Some sanity check. */ + if (unlikely(ctx == NULL || job == NULL || job->context != NULL)) + return -EINVAL; + + /* Link job with context object. */ + job->context = ctx; + + now = get_time(); + ADD_TIME_MIN_MAX(ctx, management, now - ctx->state_time); + ctx->state_time = now; + + return 0; +} + +int +rte_jobstats_abort(struct rte_jobstats *job) +{ + struct rte_jobstats_context *ctx; + uint64_t now, exec_time; + + /* Some sanity check. */ + if (unlikely(job == NULL || job->context == NULL)) + return -EINVAL; + + ctx = job->context; + now = get_time(); + exec_time = now - ctx->state_time; + ADD_TIME_MIN_MAX(ctx, management, exec_time); + ctx->state_time = now; + job->context = NULL; + + return 0; +} + +int +rte_jobstats_finish(struct rte_jobstats *job, int64_t job_value) +{ + struct rte_jobstats_context *ctx; + uint64_t now, exec_time; + int need_update; + + /* Some sanity check. */ + if (unlikely(job == NULL || job->context == NULL)) + return -EINVAL; + + need_update = job->target != job_value; + /* Adjust period only if job is unhappy of its current period. */ + if (need_update) + (*job->update_period_cb)(job, job_value); + + ctx = job->context; + + /* Update execution time is considered as runtime so get time after it is + * executed. 
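+ * This matches the documented contract of rte_job_update_period_cb_t:
+ * time spent in the period-update callback counts towards the job's
+ * runtime.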
*/ + now = get_time(); + exec_time = now - ctx->state_time; + ADD_TIME_MIN_MAX(job, exec, exec_time); + ADD_TIME_MIN_MAX(ctx, exec, exec_time); + + ctx->state_time = now; + + ctx->loop_executed_jobs++; + ctx->job_exec_cnt++; + + job->exec_cnt++; + job->context = NULL; + + return need_update; +} + +void +rte_jobstats_set_period(struct rte_jobstats *job, uint64_t period, + uint8_t saturate) +{ + if (saturate != 0) { + if (period < job->min_period) + period = job->min_period; + else if (period > job->max_period) + period = job->max_period; + } + + job->period = period; +} + +void +rte_jobstats_set_min(struct rte_jobstats *job, uint64_t period) +{ + job->min_period = period; + if (job->period < period) + job->period = period; +} + +void +rte_jobstats_set_max(struct rte_jobstats *job, uint64_t period) +{ + job->max_period = period; + if (job->period > period) + job->period = period; +} + +int +rte_jobstats_init(struct rte_jobstats *job, const char *name, + uint64_t min_period, uint64_t max_period, uint64_t initial_period, + int64_t target) +{ + if (job == NULL) + return -EINVAL; + + job->period = initial_period; + job->min_period = min_period; + job->max_period = max_period; + job->target = target; + job->update_period_cb = &default_update_function; + rte_jobstats_reset(job); + snprintf(job->name, RTE_DIM(job->name), "%s", name == NULL ? "" : name); + job->context = NULL; + + return 0; +} + +void +rte_jobstats_set_update_period_function(struct rte_jobstats *job, + rte_job_update_period_cb_t update_period_cb) +{ + if (update_period_cb == NULL) + update_period_cb = default_update_function; + + job->update_period_cb = update_period_cb; +} + +void +rte_jobstats_reset(struct rte_jobstats *job) +{ + RESET_TIME_MIN_MAX(job, exec); + job->exec_cnt = 0; +} diff --git a/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.h b/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.h new file mode 100644 index 00000000..023d9dd1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats.h @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 Intel Corporation + */ + +#ifndef JOBSTATS_H_ +#define JOBSTATS_H_ + +#include + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_JOBSTATS_NAMESIZE 32 + +/* Forward declarations. */ +struct rte_jobstats_context; +struct rte_jobstats; + +/** + * This function should calculate new period and set it using + * rte_jobstats_set_period() function. Time spent in this function will be + * added to job's runtime. + * + * @param job + * The job data structure handler. + * @param job_result + * Result of calling job callback. + */ +typedef void (*rte_job_update_period_cb_t)(struct rte_jobstats *job, + int64_t job_result); + +struct rte_jobstats { + uint64_t period; + /**< Estimated period of execution. */ + + uint64_t min_period; + /**< Minimum period. */ + + uint64_t max_period; + /**< Maximum period. */ + + int64_t target; + /**< Desired value for this job. */ + + rte_job_update_period_cb_t update_period_cb; + /**< Period update callback. */ + + uint64_t exec_time; + /**< Total time (sum) that this job was executing. */ + + uint64_t min_exec_time; + /**< Minimum execute time. */ + + uint64_t max_exec_time; + /**< Maximum execute time. */ + + uint64_t exec_cnt; + /**< Execute count. */ + + char name[RTE_JOBSTATS_NAMESIZE]; + /**< Name of this job */ + + struct rte_jobstats_context *context; + /**< Job stats context object that is executing this job. 
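+ * NULL whenever the job is not linked to a context, i.e. after
+ * rte_jobstats_init(), rte_jobstats_abort() or rte_jobstats_finish().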
*/ +} __rte_cache_aligned; + +struct rte_jobstats_context { + /** Variable holding time at different points: + * -# loop start time if loop was started but no job executed yet. + * -# job start time if job is currently executing. + * -# job finish time if job finished its execution. + * -# loop finish time if loop finished its execution. */ + uint64_t state_time; + + uint64_t loop_executed_jobs; + /**< Count of executed jobs in this loop. */ + + /* Statistics start. */ + + uint64_t exec_time; + /**< Total time taken to execute jobs, not including management time. */ + + uint64_t min_exec_time; + /**< Minimum loop execute time. */ + + uint64_t max_exec_time; + /**< Maximum loop execute time. */ + + /** + * Sum of time that is not the execute time (ex: from job finish to next + * job start). + * + * This time might be considered as overhead of library + job scheduling. + */ + uint64_t management_time; + + uint64_t min_management_time; + /**< Minimum management time */ + + uint64_t max_management_time; + /**< Maximum management time */ + + uint64_t start_time; + /**< Time since last reset stats. */ + + uint64_t job_exec_cnt; + /**< Total count of executed jobs. */ + + uint64_t loop_cnt; + /**< Total count of executed loops with at least one executed job. */ +} __rte_cache_aligned; + +/** + * Initialize given context object with default values. + * + * @param ctx + * Job stats context object to initialize. + * + * @return + * 0 on success + * -EINVAL if *ctx* is NULL + */ +int +rte_jobstats_context_init(struct rte_jobstats_context *ctx); + +/** + * Mark that new set of jobs start executing. + * + * @param ctx + * Job stats context object. + */ +void +rte_jobstats_context_start(struct rte_jobstats_context *ctx); + +/** + * Mark that there is no more jobs ready to execute in this turn. Calculate + * stats for this loop turn. + * + * @param ctx + * Job stats context. + */ +void +rte_jobstats_context_finish(struct rte_jobstats_context *ctx); + +/** + * Function resets job context statistics. + * + * @param ctx + * Job stats context which statistics will be reset. + */ +void +rte_jobstats_context_reset(struct rte_jobstats_context *ctx); + +/** + * Initialize given job stats object. + * + * @param job + * Job object. + * @param name + * Optional job name. + * @param min_period + * Minimum period that this job can accept. + * @param max_period + * Maximum period that this job can accept. + * @param initial_period + * Initial period. It will be checked against *min_period* and *max_period*. + * @param target + * Target value that this job try to achieve. + * + * @return + * 0 on success + * -EINVAL if *job* is NULL + */ +int +rte_jobstats_init(struct rte_jobstats *job, const char *name, + uint64_t min_period, uint64_t max_period, uint64_t initial_period, + int64_t target); + +/** + * Set job desired target value. Difference between target and job value + * value must be used to properly adjust job execute period value. + * + * @param job + * The job object. + * @param target + * New target. + */ +void +rte_jobstats_set_target(struct rte_jobstats *job, int64_t target); + +/** + * Mark that *job* is starting of its execution in context of *ctx* object. + * + * @param ctx + * Job stats context. + * @param job + * Job object. 
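+ *
+ * A minimal usage sketch (illustrative only; assumes *ctx* and *job* were
+ * already set up with rte_jobstats_context_init() and rte_jobstats_init(),
+ * and do_work() stands for the application-defined job body):
+ *
+ *   rte_jobstats_context_start(&ctx);
+ *   rte_jobstats_start(&ctx, &job);
+ *   work_done = do_work();
+ *   rte_jobstats_finish(&job, work_done);
+ *   rte_jobstats_context_finish(&ctx);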
+ * @return + * 0 on success + * -EINVAL if *ctx* or *job* is NULL or *job* is executing in another context + * context already, + */ +int +rte_jobstats_start(struct rte_jobstats_context *ctx, struct rte_jobstats *job); + +/** + * Mark that *job* finished its execution, but time of this work will be skipped + * and added to management time. + * + * @param job + * Job object. + * + * @return + * 0 on success + * -EINVAL if job is NULL or job was not started (it have no context). + */ +int +rte_jobstats_abort(struct rte_jobstats *job); + +/** + * Mark that *job* finished its execution. Context in which it was executing + * will receive stat update. After this function call *job* object is ready to + * be executed in other context. + * + * @param job + * Job object. + * @param job_value + * Job value. Job should pass in this parameter a value that it try to optimize + * for example the number of packets it processed. + * + * @return + * 0 if job's period was not updated (job target equals *job_value*) + * 1 if job's period was updated + * -EINVAL if job is NULL or job was not started (it have no context). + */ +int +rte_jobstats_finish(struct rte_jobstats *job, int64_t job_value); + +/** + * Set execute period of given job. + * + * @param job + * The job object. + * @param period + * New period value. + * @param saturate + * If zero, skip period saturation to min, max range. + */ +void +rte_jobstats_set_period(struct rte_jobstats *job, uint64_t period, + uint8_t saturate); +/** + * Set minimum execute period of given job. Current period will be checked + * against new minimum value. + * + * @param job + * The job object. + * @param period + * New minimum period value. + */ +void +rte_jobstats_set_min(struct rte_jobstats *job, uint64_t period); +/** + * Set maximum execute period of given job. Current period will be checked + * against new maximum value. + * + * @param job + * The job object. + * @param period + * New maximum period value. + */ +void +rte_jobstats_set_max(struct rte_jobstats *job, uint64_t period); + +/** + * Set update period callback that is invoked after job finish. + * + * If application wants to do more sophisticated calculations than default + * it can provide this handler. + * + * @param job + * Job object. + * @param update_period_cb + * Callback to set. If NULL restore default update function. + */ +void +rte_jobstats_set_update_period_function(struct rte_jobstats *job, + rte_job_update_period_cb_t update_period_cb); + +/** + * Function resets job statistics. + * + * @param job + * Job which statistics will be reset. 
+ */ +void +rte_jobstats_reset(struct rte_jobstats *job); + +#ifdef __cplusplus +} +#endif + +#endif /* JOBSTATS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats_version.map b/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats_version.map new file mode 100644 index 00000000..f8944143 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_jobstats/rte_jobstats_version.map @@ -0,0 +1,26 @@ +DPDK_2.0 { + global: + + rte_jobstats_context_finish; + rte_jobstats_context_init; + rte_jobstats_context_reset; + rte_jobstats_context_start; + rte_jobstats_finish; + rte_jobstats_init; + rte_jobstats_reset; + rte_jobstats_set_max; + rte_jobstats_set_min; + rte_jobstats_set_period; + rte_jobstats_set_target; + rte_jobstats_set_update_period_function; + rte_jobstats_start; + + local: *; +}; + +DPDK_16.04 { + global: + + rte_jobstats_abort; + +} DPDK_2.0; diff --git a/src/spdk/dpdk/lib/librte_kni/Makefile b/src/spdk/dpdk/lib/librte_kni/Makefile new file mode 100644 index 00000000..cbd6599e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kni/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_kni.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev + +EXPORT_MAP := rte_kni_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_KNI) := rte_kni.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_KNI)-include := rte_kni.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_kni/meson.build b/src/spdk/dpdk/lib/librte_kni/meson.build new file mode 100644 index 00000000..a738a033 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kni/meson.build @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +if host_machine.system() != 'linux' or cc.sizeof('void *') == 4 + build = false +endif +version = 2 +sources = files('rte_kni.c') +headers = files('rte_kni.h') +deps += ['ethdev', 'pci'] diff --git a/src/spdk/dpdk/lib/librte_kni/rte_kni.c b/src/spdk/dpdk/lib/librte_kni/rte_kni.c new file mode 100644 index 00000000..65f6a2b0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kni/rte_kni.c @@ -0,0 +1,801 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef RTE_EXEC_ENV_LINUXAPP +#error "KNI is not supported" +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "rte_kni_fifo.h" + +#define MAX_MBUF_BURST_NUM 32 + +/* Maximum number of ring entries */ +#define KNI_FIFO_COUNT_MAX 1024 +#define KNI_FIFO_SIZE (KNI_FIFO_COUNT_MAX * sizeof(void *) + \ + sizeof(struct rte_kni_fifo)) + +#define KNI_REQUEST_MBUF_NUM_MAX 32 + +#define KNI_MEM_CHECK(cond) do { if (cond) goto kni_fail; } while (0) + +/** + * KNI context + */ +struct rte_kni { + char name[RTE_KNI_NAMESIZE]; /**< KNI interface name */ + uint16_t group_id; /**< Group ID of KNI devices */ + uint32_t slot_id; /**< KNI pool slot ID */ + struct rte_mempool *pktmbuf_pool; /**< pkt mbuf mempool */ + unsigned mbuf_size; /**< mbuf size */ + + struct rte_kni_fifo *tx_q; /**< TX queue */ + struct rte_kni_fifo *rx_q; /**< RX queue */ + struct rte_kni_fifo *alloc_q; /**< Allocated mbufs queue */ + struct rte_kni_fifo *free_q; /**< To be freed mbufs queue */ + + /* For request & response */ + struct rte_kni_fifo *req_q; /**< Request queue */ + struct 
rte_kni_fifo *resp_q; /**< Response queue */ + void * sync_addr; /**< Req/Resp Mem address */ + + struct rte_kni_ops ops; /**< operations for request */ + uint8_t in_use : 1; /**< kni in use */ +}; + +enum kni_ops_status { + KNI_REQ_NO_REGISTER = 0, + KNI_REQ_REGISTERED, +}; + +/** + * KNI memzone pool slot + */ +struct rte_kni_memzone_slot { + uint32_t id; + uint8_t in_use : 1; /**< slot in use */ + + /* Memzones */ + const struct rte_memzone *m_ctx; /**< KNI ctx */ + const struct rte_memzone *m_tx_q; /**< TX queue */ + const struct rte_memzone *m_rx_q; /**< RX queue */ + const struct rte_memzone *m_alloc_q; /**< Allocated mbufs queue */ + const struct rte_memzone *m_free_q; /**< To be freed mbufs queue */ + const struct rte_memzone *m_req_q; /**< Request queue */ + const struct rte_memzone *m_resp_q; /**< Response queue */ + const struct rte_memzone *m_sync_addr; + + /* Free linked list */ + struct rte_kni_memzone_slot *next; /**< Next slot link.list */ +}; + +/** + * KNI memzone pool + */ +struct rte_kni_memzone_pool { + uint8_t initialized : 1; /**< Global KNI pool init flag */ + + uint32_t max_ifaces; /**< Max. num of KNI ifaces */ + struct rte_kni_memzone_slot *slots; /**< Pool slots */ + rte_spinlock_t mutex; /**< alloc/release mutex */ + + /* Free memzone slots linked-list */ + struct rte_kni_memzone_slot *free; /**< First empty slot */ + struct rte_kni_memzone_slot *free_tail; /**< Last empty slot */ +}; + + +static void kni_free_mbufs(struct rte_kni *kni); +static void kni_allocate_mbufs(struct rte_kni *kni); + +static volatile int kni_fd = -1; +static struct rte_kni_memzone_pool kni_memzone_pool = { + .initialized = 0, +}; + +static const struct rte_memzone * +kni_memzone_reserve(const char *name, size_t len, int socket_id, + unsigned flags) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz == NULL) + mz = rte_memzone_reserve(name, len, socket_id, flags); + + return mz; +} + +/* Pool mgmt */ +static struct rte_kni_memzone_slot* +kni_memzone_pool_alloc(void) +{ + struct rte_kni_memzone_slot *slot; + + rte_spinlock_lock(&kni_memzone_pool.mutex); + + if (!kni_memzone_pool.free) { + rte_spinlock_unlock(&kni_memzone_pool.mutex); + return NULL; + } + + slot = kni_memzone_pool.free; + kni_memzone_pool.free = slot->next; + slot->in_use = 1; + + if (!kni_memzone_pool.free) + kni_memzone_pool.free_tail = NULL; + + rte_spinlock_unlock(&kni_memzone_pool.mutex); + + return slot; +} + +static void +kni_memzone_pool_release(struct rte_kni_memzone_slot *slot) +{ + rte_spinlock_lock(&kni_memzone_pool.mutex); + + if (kni_memzone_pool.free) + kni_memzone_pool.free_tail->next = slot; + else + kni_memzone_pool.free = slot; + + kni_memzone_pool.free_tail = slot; + slot->next = NULL; + slot->in_use = 0; + + rte_spinlock_unlock(&kni_memzone_pool.mutex); +} + + +/* Shall be called before any allocation happens */ +void +rte_kni_init(unsigned int max_kni_ifaces) +{ + uint32_t i; + struct rte_kni_memzone_slot *it; + const struct rte_memzone *mz; +#define OBJNAMSIZ 32 + char obj_name[OBJNAMSIZ]; + char mz_name[RTE_MEMZONE_NAMESIZE]; + + /* Immediately return if KNI is already initialized */ + if (kni_memzone_pool.initialized) { + RTE_LOG(WARNING, KNI, "Double call to rte_kni_init()"); + return; + } + + if (max_kni_ifaces == 0) { + RTE_LOG(ERR, KNI, "Invalid number of max_kni_ifaces %d\n", + max_kni_ifaces); + RTE_LOG(ERR, KNI, "Unable to initialize KNI\n"); + return; + } + + /* Check FD and open */ + if (kni_fd < 0) { + kni_fd = open("/dev/" KNI_DEVICE, O_RDWR); + if (kni_fd < 0) { + 
RTE_LOG(ERR, KNI, + "Can not open /dev/%s\n", KNI_DEVICE); + return; + } + } + + /* Allocate slot objects */ + kni_memzone_pool.slots = (struct rte_kni_memzone_slot *) + rte_malloc(NULL, + sizeof(struct rte_kni_memzone_slot) * + max_kni_ifaces, + 0); + KNI_MEM_CHECK(kni_memzone_pool.slots == NULL); + + /* Initialize general pool variables */ + kni_memzone_pool.initialized = 1; + kni_memzone_pool.max_ifaces = max_kni_ifaces; + kni_memzone_pool.free = &kni_memzone_pool.slots[0]; + rte_spinlock_init(&kni_memzone_pool.mutex); + + /* Pre-allocate all memzones of all the slots; panic on error */ + for (i = 0; i < max_kni_ifaces; i++) { + + /* Recover current slot */ + it = &kni_memzone_pool.slots[i]; + it->id = i; + + /* Allocate KNI context */ + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, "KNI_INFO_%d", i); + mz = kni_memzone_reserve(mz_name, sizeof(struct rte_kni), + SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(mz == NULL); + it->m_ctx = mz; + + /* TX RING */ + snprintf(obj_name, OBJNAMSIZ, "kni_tx_%d", i); + mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, + SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(mz == NULL); + it->m_tx_q = mz; + + /* RX RING */ + snprintf(obj_name, OBJNAMSIZ, "kni_rx_%d", i); + mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, + SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(mz == NULL); + it->m_rx_q = mz; + + /* ALLOC RING */ + snprintf(obj_name, OBJNAMSIZ, "kni_alloc_%d", i); + mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, + SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(mz == NULL); + it->m_alloc_q = mz; + + /* FREE RING */ + snprintf(obj_name, OBJNAMSIZ, "kni_free_%d", i); + mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, + SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(mz == NULL); + it->m_free_q = mz; + + /* Request RING */ + snprintf(obj_name, OBJNAMSIZ, "kni_req_%d", i); + mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, + SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(mz == NULL); + it->m_req_q = mz; + + /* Response RING */ + snprintf(obj_name, OBJNAMSIZ, "kni_resp_%d", i); + mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, + SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(mz == NULL); + it->m_resp_q = mz; + + /* Req/Resp sync mem area */ + snprintf(obj_name, OBJNAMSIZ, "kni_sync_%d", i); + mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE, + SOCKET_ID_ANY, 0); + KNI_MEM_CHECK(mz == NULL); + it->m_sync_addr = mz; + + if ((i+1) == max_kni_ifaces) { + it->next = NULL; + kni_memzone_pool.free_tail = it; + } else + it->next = &kni_memzone_pool.slots[i+1]; + } + + return; + +kni_fail: + RTE_LOG(ERR, KNI, "Unable to allocate memory for max_kni_ifaces:%d." + "Increase the amount of hugepages memory\n", max_kni_ifaces); +} + + +struct rte_kni * +rte_kni_alloc(struct rte_mempool *pktmbuf_pool, + const struct rte_kni_conf *conf, + struct rte_kni_ops *ops) +{ + int ret; + struct rte_kni_device_info dev_info; + struct rte_kni *ctx; + char intf_name[RTE_KNI_NAMESIZE]; + const struct rte_memzone *mz; + struct rte_kni_memzone_slot *slot = NULL; + + if (!pktmbuf_pool || !conf || !conf->name[0]) + return NULL; + + /* Check if KNI subsystem has been initialized */ + if (kni_memzone_pool.initialized != 1) { + RTE_LOG(ERR, KNI, "KNI subsystem has not been initialized. 
Invoke rte_kni_init() first\n"); + return NULL; + } + + /* Get an available slot from the pool */ + slot = kni_memzone_pool_alloc(); + if (!slot) { + RTE_LOG(ERR, KNI, "Cannot allocate more KNI interfaces; increase the number of max_kni_ifaces(current %d) or release unused ones.\n", + kni_memzone_pool.max_ifaces); + return NULL; + } + + /* Recover ctx */ + ctx = slot->m_ctx->addr; + snprintf(intf_name, RTE_KNI_NAMESIZE, "%s", conf->name); + + if (ctx->in_use) { + RTE_LOG(ERR, KNI, "KNI %s is in use\n", ctx->name); + return NULL; + } + memset(ctx, 0, sizeof(struct rte_kni)); + if (ops) + memcpy(&ctx->ops, ops, sizeof(struct rte_kni_ops)); + else + ctx->ops.port_id = UINT16_MAX; + + memset(&dev_info, 0, sizeof(dev_info)); + dev_info.bus = conf->addr.bus; + dev_info.devid = conf->addr.devid; + dev_info.function = conf->addr.function; + dev_info.vendor_id = conf->id.vendor_id; + dev_info.device_id = conf->id.device_id; + dev_info.core_id = conf->core_id; + dev_info.force_bind = conf->force_bind; + dev_info.group_id = conf->group_id; + dev_info.mbuf_size = conf->mbuf_size; + dev_info.mtu = conf->mtu; + + memcpy(dev_info.mac_addr, conf->mac_addr, ETHER_ADDR_LEN); + + snprintf(ctx->name, RTE_KNI_NAMESIZE, "%s", intf_name); + snprintf(dev_info.name, RTE_KNI_NAMESIZE, "%s", intf_name); + + RTE_LOG(INFO, KNI, "pci: %02x:%02x:%02x \t %02x:%02x\n", + dev_info.bus, dev_info.devid, dev_info.function, + dev_info.vendor_id, dev_info.device_id); + /* TX RING */ + mz = slot->m_tx_q; + ctx->tx_q = mz->addr; + kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX); + dev_info.tx_phys = mz->phys_addr; + + /* RX RING */ + mz = slot->m_rx_q; + ctx->rx_q = mz->addr; + kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX); + dev_info.rx_phys = mz->phys_addr; + + /* ALLOC RING */ + mz = slot->m_alloc_q; + ctx->alloc_q = mz->addr; + kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX); + dev_info.alloc_phys = mz->phys_addr; + + /* FREE RING */ + mz = slot->m_free_q; + ctx->free_q = mz->addr; + kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX); + dev_info.free_phys = mz->phys_addr; + + /* Request RING */ + mz = slot->m_req_q; + ctx->req_q = mz->addr; + kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX); + dev_info.req_phys = mz->phys_addr; + + /* Response RING */ + mz = slot->m_resp_q; + ctx->resp_q = mz->addr; + kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX); + dev_info.resp_phys = mz->phys_addr; + + /* Req/Resp sync mem area */ + mz = slot->m_sync_addr; + ctx->sync_addr = mz->addr; + dev_info.sync_va = mz->addr; + dev_info.sync_phys = mz->phys_addr; + + ctx->pktmbuf_pool = pktmbuf_pool; + ctx->group_id = conf->group_id; + ctx->slot_id = slot->id; + ctx->mbuf_size = conf->mbuf_size; + + ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info); + KNI_MEM_CHECK(ret < 0); + + ctx->in_use = 1; + + /* Allocate mbufs and then put them into alloc_q */ + kni_allocate_mbufs(ctx); + + return ctx; + +kni_fail: + if (slot) + kni_memzone_pool_release(&kni_memzone_pool.slots[slot->id]); + + return NULL; +} + +static void +kni_free_fifo(struct rte_kni_fifo *fifo) +{ + int ret; + struct rte_mbuf *pkt; + + do { + ret = kni_fifo_get(fifo, (void **)&pkt, 1); + if (ret) + rte_pktmbuf_free(pkt); + } while (ret); +} + +static void * +va2pa(struct rte_mbuf *m) +{ + return (void *)((unsigned long)m - + ((unsigned long)m->buf_addr - + (unsigned long)m->buf_iova)); +} + +static void +obj_free(struct rte_mempool *mp __rte_unused, void *opaque, void *obj, + unsigned obj_idx __rte_unused) +{ + struct rte_mbuf *m = obj; + void *mbuf_phys = opaque; + + if (va2pa(m) == mbuf_phys) + 
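+ /* The fifo being drained stores physical addresses, so the
+  * owning mbuf is identified by comparing translated addresses. */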
rte_pktmbuf_free(m); +} + +static void +kni_free_fifo_phy(struct rte_mempool *mp, struct rte_kni_fifo *fifo) +{ + void *mbuf_phys; + int ret; + + do { + ret = kni_fifo_get(fifo, &mbuf_phys, 1); + if (ret) + rte_mempool_obj_iter(mp, obj_free, mbuf_phys); + } while (ret); +} + +int +rte_kni_release(struct rte_kni *kni) +{ + struct rte_kni_device_info dev_info; + uint32_t slot_id; + uint32_t retry = 5; + + if (!kni || !kni->in_use) + return -1; + + snprintf(dev_info.name, sizeof(dev_info.name), "%s", kni->name); + if (ioctl(kni_fd, RTE_KNI_IOCTL_RELEASE, &dev_info) < 0) { + RTE_LOG(ERR, KNI, "Fail to release kni device\n"); + return -1; + } + + /* mbufs in all fifo should be released, except request/response */ + + /* wait until all rxq packets processed by kernel */ + while (kni_fifo_count(kni->rx_q) && retry--) + usleep(1000); + + if (kni_fifo_count(kni->rx_q)) + RTE_LOG(ERR, KNI, "Fail to free all Rx-q items\n"); + + kni_free_fifo_phy(kni->pktmbuf_pool, kni->alloc_q); + kni_free_fifo(kni->tx_q); + kni_free_fifo(kni->free_q); + + slot_id = kni->slot_id; + + /* Memset the KNI struct */ + memset(kni, 0, sizeof(struct rte_kni)); + + /* Release memzone */ + if (slot_id > kni_memzone_pool.max_ifaces) { + RTE_LOG(ERR, KNI, "KNI pool: corrupted slot ID: %d, max: %d\n", + slot_id, kni_memzone_pool.max_ifaces); + return -1; + } + kni_memzone_pool_release(&kni_memzone_pool.slots[slot_id]); + + return 0; +} + +/* default callback for request of configuring device mac address */ +static int +kni_config_mac_address(uint16_t port_id, uint8_t mac_addr[]) +{ + int ret = 0; + + if (!rte_eth_dev_is_valid_port(port_id)) { + RTE_LOG(ERR, KNI, "Invalid port id %d\n", port_id); + return -EINVAL; + } + + RTE_LOG(INFO, KNI, "Configure mac address of %d", port_id); + + ret = rte_eth_dev_default_mac_addr_set(port_id, + (struct ether_addr *)mac_addr); + if (ret < 0) + RTE_LOG(ERR, KNI, "Failed to config mac_addr for port %d\n", + port_id); + + return ret; +} + +/* default callback for request of configuring promiscuous mode */ +static int +kni_config_promiscusity(uint16_t port_id, uint8_t to_on) +{ + if (!rte_eth_dev_is_valid_port(port_id)) { + RTE_LOG(ERR, KNI, "Invalid port id %d\n", port_id); + return -EINVAL; + } + + RTE_LOG(INFO, KNI, "Configure promiscuous mode of %d to %d\n", + port_id, to_on); + + if (to_on) + rte_eth_promiscuous_enable(port_id); + else + rte_eth_promiscuous_disable(port_id); + + return 0; +} + +int +rte_kni_handle_request(struct rte_kni *kni) +{ + unsigned ret; + struct rte_kni_request *req; + + if (kni == NULL) + return -1; + + /* Get request mbuf */ + ret = kni_fifo_get(kni->req_q, (void **)&req, 1); + if (ret != 1) + return 0; /* It is OK of can not getting the request mbuf */ + + if (req != kni->sync_addr) { + RTE_LOG(ERR, KNI, "Wrong req pointer %p\n", req); + return -1; + } + + /* Analyze the request and call the relevant actions for it */ + switch (req->req_id) { + case RTE_KNI_REQ_CHANGE_MTU: /* Change MTU */ + if (kni->ops.change_mtu) + req->result = kni->ops.change_mtu(kni->ops.port_id, + req->new_mtu); + break; + case RTE_KNI_REQ_CFG_NETWORK_IF: /* Set network interface up/down */ + if (kni->ops.config_network_if) + req->result = kni->ops.config_network_if(\ + kni->ops.port_id, req->if_up); + break; + case RTE_KNI_REQ_CHANGE_MAC_ADDR: /* Change MAC Address */ + if (kni->ops.config_mac_address) + req->result = kni->ops.config_mac_address( + kni->ops.port_id, req->mac_addr); + else if (kni->ops.port_id != UINT16_MAX) + req->result = kni_config_mac_address( + kni->ops.port_id, 
req->mac_addr); + break; + case RTE_KNI_REQ_CHANGE_PROMISC: /* Change PROMISCUOUS MODE */ + if (kni->ops.config_promiscusity) + req->result = kni->ops.config_promiscusity( + kni->ops.port_id, req->promiscusity); + else if (kni->ops.port_id != UINT16_MAX) + req->result = kni_config_promiscusity( + kni->ops.port_id, req->promiscusity); + break; + default: + RTE_LOG(ERR, KNI, "Unknown request id %u\n", req->req_id); + req->result = -EINVAL; + break; + } + + /* Construct response mbuf and put it back to resp_q */ + ret = kni_fifo_put(kni->resp_q, (void **)&req, 1); + if (ret != 1) { + RTE_LOG(ERR, KNI, "Fail to put the muf back to resp_q\n"); + return -1; /* It is an error of can't putting the mbuf back */ + } + + return 0; +} + +unsigned +rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num) +{ + void *phy_mbufs[num]; + unsigned int ret; + unsigned int i; + + for (i = 0; i < num; i++) + phy_mbufs[i] = va2pa(mbufs[i]); + + ret = kni_fifo_put(kni->rx_q, phy_mbufs, num); + + /* Get mbufs from free_q and then free them */ + kni_free_mbufs(kni); + + return ret; +} + +unsigned +rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num) +{ + unsigned ret = kni_fifo_get(kni->tx_q, (void **)mbufs, num); + + /* If buffers removed, allocate mbufs and then put them into alloc_q */ + if (ret) + kni_allocate_mbufs(kni); + + return ret; +} + +static void +kni_free_mbufs(struct rte_kni *kni) +{ + int i, ret; + struct rte_mbuf *pkts[MAX_MBUF_BURST_NUM]; + + ret = kni_fifo_get(kni->free_q, (void **)pkts, MAX_MBUF_BURST_NUM); + if (likely(ret > 0)) { + for (i = 0; i < ret; i++) + rte_pktmbuf_free(pkts[i]); + } +} + +static void +kni_allocate_mbufs(struct rte_kni *kni) +{ + int i, ret; + struct rte_mbuf *pkts[MAX_MBUF_BURST_NUM]; + void *phys[MAX_MBUF_BURST_NUM]; + int allocq_free; + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pool) != + offsetof(struct rte_kni_mbuf, pool)); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_addr) != + offsetof(struct rte_kni_mbuf, buf_addr)); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, next) != + offsetof(struct rte_kni_mbuf, next)); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) != + offsetof(struct rte_kni_mbuf, data_off)); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_kni_mbuf, data_len)); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_kni_mbuf, pkt_len)); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != + offsetof(struct rte_kni_mbuf, ol_flags)); + + /* Check if pktmbuf pool has been configured */ + if (kni->pktmbuf_pool == NULL) { + RTE_LOG(ERR, KNI, "No valid mempool for allocating mbufs\n"); + return; + } + + allocq_free = (kni->alloc_q->read - kni->alloc_q->write - 1) \ + & (MAX_MBUF_BURST_NUM - 1); + for (i = 0; i < allocq_free; i++) { + pkts[i] = rte_pktmbuf_alloc(kni->pktmbuf_pool); + if (unlikely(pkts[i] == NULL)) { + /* Out of memory */ + RTE_LOG(ERR, KNI, "Out of memory\n"); + break; + } + phys[i] = va2pa(pkts[i]); + } + + /* No pkt mbuf allocated */ + if (i <= 0) + return; + + ret = kni_fifo_put(kni->alloc_q, phys, i); + + /* Check if any mbufs not put into alloc_q, and then free them */ + if (ret >= 0 && ret < i && ret < MAX_MBUF_BURST_NUM) { + int j; + + for (j = ret; j < i; j++) + rte_pktmbuf_free(pkts[j]); + } +} + +struct rte_kni * +rte_kni_get(const char *name) +{ + uint32_t i; + struct rte_kni_memzone_slot *it; + struct rte_kni *kni; + + if (name == NULL || name[0] == '\0') + return NULL; + + /* Note: could be improved perf-wise if 
necessary */ + for (i = 0; i < kni_memzone_pool.max_ifaces; i++) { + it = &kni_memzone_pool.slots[i]; + if (it->in_use == 0) + continue; + kni = it->m_ctx->addr; + if (strncmp(kni->name, name, RTE_KNI_NAMESIZE) == 0) + return kni; + } + + return NULL; +} + +const char * +rte_kni_get_name(const struct rte_kni *kni) +{ + return kni->name; +} + +static enum kni_ops_status +kni_check_request_register(struct rte_kni_ops *ops) +{ + /* check if KNI request ops has been registered*/ + if( NULL == ops ) + return KNI_REQ_NO_REGISTER; + + if ((ops->change_mtu == NULL) + && (ops->config_network_if == NULL) + && (ops->config_mac_address == NULL) + && (ops->config_promiscusity == NULL)) + return KNI_REQ_NO_REGISTER; + + return KNI_REQ_REGISTERED; +} + +int +rte_kni_register_handlers(struct rte_kni *kni,struct rte_kni_ops *ops) +{ + enum kni_ops_status req_status; + + if (NULL == ops) { + RTE_LOG(ERR, KNI, "Invalid KNI request operation.\n"); + return -1; + } + + if (NULL == kni) { + RTE_LOG(ERR, KNI, "Invalid kni info.\n"); + return -1; + } + + req_status = kni_check_request_register(&kni->ops); + if ( KNI_REQ_REGISTERED == req_status) { + RTE_LOG(ERR, KNI, "The KNI request operation has already registered.\n"); + return -1; + } + + memcpy(&kni->ops, ops, sizeof(struct rte_kni_ops)); + return 0; +} + +int +rte_kni_unregister_handlers(struct rte_kni *kni) +{ + if (NULL == kni) { + RTE_LOG(ERR, KNI, "Invalid kni info.\n"); + return -1; + } + + memset(&kni->ops, 0, sizeof(struct rte_kni_ops)); + + return 0; +} +void +rte_kni_close(void) +{ + if (kni_fd < 0) + return; + + close(kni_fd); + kni_fd = -1; +} diff --git a/src/spdk/dpdk/lib/librte_kni/rte_kni.h b/src/spdk/dpdk/lib/librte_kni/rte_kni.h new file mode 100644 index 00000000..99055e2c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kni/rte_kni.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_KNI_H_ +#define _RTE_KNI_H_ + +/** + * @file + * RTE KNI + * + * The KNI library provides the ability to create and destroy kernel NIC + * interfaces that may be used by the RTE application to receive/transmit + * packets from/to Linux kernel net interfaces. + * + * This library provides two APIs to burst receive packets from KNI interfaces, + * and burst transmit packets to KNI interfaces. + */ + +#include +#include +#include +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct rte_kni; +struct rte_mbuf; + +/** + * Structure which has the function pointers for KNI interface. + */ +struct rte_kni_ops { + uint16_t port_id; /* Port ID */ + + /* Pointer to function of changing MTU */ + int (*change_mtu)(uint16_t port_id, unsigned int new_mtu); + + /* Pointer to function of configuring network interface */ + int (*config_network_if)(uint16_t port_id, uint8_t if_up); + + /* Pointer to function of configuring mac address */ + int (*config_mac_address)(uint16_t port_id, uint8_t mac_addr[]); + + /* Pointer to function of configuring promiscuous mode */ + int (*config_promiscusity)(uint16_t port_id, uint8_t to_on); +}; + +/** + * Structure for configuring KNI device. + */ +struct rte_kni_conf { + /* + * KNI name which will be used in relevant network device. + * Let the name as short as possible, as it will be part of + * memzone name. 
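+ * The buffer is RTE_KNI_NAMESIZE bytes including the terminating NUL;
+ * rte_kni_alloc() truncates longer names via snprintf().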
+ */ + char name[RTE_KNI_NAMESIZE]; + uint32_t core_id; /* Core ID to bind kernel thread on */ + uint16_t group_id; /* Group ID */ + unsigned mbuf_size; /* mbuf size */ + struct rte_pci_addr addr; + struct rte_pci_id id; + + __extension__ + uint8_t force_bind : 1; /* Flag to bind kernel thread */ + char mac_addr[ETHER_ADDR_LEN]; /* MAC address assigned to KNI */ + uint16_t mtu; +}; + +/** + * Initialize and preallocate KNI subsystem + * + * This function is to be executed on the MASTER lcore only, after EAL + * initialization and before any KNI interface is attempted to be + * allocated + * + * @param max_kni_ifaces + * The maximum number of KNI interfaces that can coexist concurrently + */ +void rte_kni_init(unsigned int max_kni_ifaces); + + +/** + * Allocate KNI interface according to the port id, mbuf size, mbuf pool, + * configurations and callbacks for kernel requests.The KNI interface created + * in the kernel space is the net interface the traditional Linux application + * talking to. + * + * The rte_kni_alloc shall not be called before rte_kni_init() has been + * called. rte_kni_alloc is thread safe. + * + * The mempool should have capacity of more than "2 x KNI_FIFO_COUNT_MAX" + * elements for each KNI interface allocated. + * + * @param pktmbuf_pool + * The mempool for allocating mbufs for packets. + * @param conf + * The pointer to the configurations of the KNI device. + * @param ops + * The pointer to the callbacks for the KNI kernel requests. + * + * @return + * - The pointer to the context of a KNI interface. + * - NULL indicate error. + */ +struct rte_kni *rte_kni_alloc(struct rte_mempool *pktmbuf_pool, + const struct rte_kni_conf *conf, struct rte_kni_ops *ops); + +/** + * Release KNI interface according to the context. It will also release the + * paired KNI interface in kernel space. All processing on the specific KNI + * context need to be stopped before calling this interface. + * + * rte_kni_release is thread safe. + * + * @param kni + * The pointer to the context of an existent KNI interface. + * + * @return + * - 0 indicates success. + * - negative value indicates failure. + */ +int rte_kni_release(struct rte_kni *kni); + +/** + * It is used to handle the request mbufs sent from kernel space. + * Then analyzes it and calls the specific actions for the specific requests. + * Finally constructs the response mbuf and puts it back to the resp_q. + * + * @param kni + * The pointer to the context of an existent KNI interface. + * + * @return + * - 0 + * - negative value indicates failure. + */ +int rte_kni_handle_request(struct rte_kni *kni); + +/** + * Retrieve a burst of packets from a KNI interface. The retrieved packets are + * stored in rte_mbuf structures whose pointers are supplied in the array of + * mbufs, and the maximum number is indicated by num. It handles allocating + * the mbufs for KNI interface alloc queue. + * + * @param kni + * The KNI interface context. + * @param mbufs + * The array to store the pointers of mbufs. + * @param num + * The maximum number per burst. + * + * @return + * The actual number of packets retrieved. + */ +unsigned rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, + unsigned num); + +/** + * Send a burst of packets to a KNI interface. The packets to be sent out are + * stored in rte_mbuf structures whose pointers are supplied in the array of + * mbufs, and the maximum number is indicated by num. It handles the freeing of + * the mbufs in the free queue of KNI interface. + * + * @param kni + * The KNI interface context. 
+ * @param mbufs + * The array to store the pointers of mbufs. + * @param num + * The maximum number per burst. + * + * @return + * The actual number of packets sent. + */ +unsigned rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, + unsigned num); + +/** + * Get the KNI context of its name. + * + * @param name + * pointer to the KNI device name. + * + * @return + * On success: Pointer to KNI interface. + * On failure: NULL. + */ +struct rte_kni *rte_kni_get(const char *name); + +/** + * Get the name given to a KNI device + * + * @param kni + * The KNI instance to query + * @return + * The pointer to the KNI name + */ +const char *rte_kni_get_name(const struct rte_kni *kni); + +/** + * Register KNI request handling for a specified port,and it can + * be called by master process or slave process. + * + * @param kni + * pointer to struct rte_kni. + * @param ops + * pointer to struct rte_kni_ops. + * + * @return + * On success: 0 + * On failure: -1 + */ +int rte_kni_register_handlers(struct rte_kni *kni, struct rte_kni_ops *ops); + +/** + * Unregister KNI request handling for a specified port. + * + * @param kni + * pointer to struct rte_kni. + * + * @return + * On success: 0 + * On failure: -1 + */ +int rte_kni_unregister_handlers(struct rte_kni *kni); + +/** + * Close KNI device. + */ +void rte_kni_close(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_KNI_H_ */ diff --git a/src/spdk/dpdk/lib/librte_kni/rte_kni_fifo.h b/src/spdk/dpdk/lib/librte_kni/rte_kni_fifo.h new file mode 100644 index 00000000..ac26a8c0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kni/rte_kni_fifo.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + + + +/** + * Initializes the kni fifo structure + */ +static void +kni_fifo_init(struct rte_kni_fifo *fifo, unsigned size) +{ + /* Ensure size is power of 2 */ + if (size & (size - 1)) + rte_panic("KNI fifo size must be power of 2\n"); + + fifo->write = 0; + fifo->read = 0; + fifo->len = size; + fifo->elem_size = sizeof(void *); +} + +/** + * Adds num elements into the fifo. Return the number actually written + */ +static inline unsigned +kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num) +{ + unsigned i = 0; + unsigned fifo_write = fifo->write; + unsigned fifo_read = fifo->read; + unsigned new_write = fifo_write; + + for (i = 0; i < num; i++) { + new_write = (new_write + 1) & (fifo->len - 1); + + if (new_write == fifo_read) + break; + fifo->buffer[fifo_write] = data[i]; + fifo_write = new_write; + } + fifo->write = fifo_write; + return i; +} + +/** + * Get up to num elements from the fifo. 
Return the number actually read + */ +static inline unsigned +kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num) +{ + unsigned i = 0; + unsigned new_read = fifo->read; + unsigned fifo_write = fifo->write; + for (i = 0; i < num; i++) { + if (new_read == fifo_write) + break; + + data[i] = fifo->buffer[new_read]; + new_read = (new_read + 1) & (fifo->len - 1); + } + fifo->read = new_read; + return i; +} + +/** + * Get the num of elements in the fifo + */ +static inline uint32_t +kni_fifo_count(struct rte_kni_fifo *fifo) +{ + return (fifo->len + fifo->write - fifo->read) & (fifo->len - 1); +} diff --git a/src/spdk/dpdk/lib/librte_kni/rte_kni_version.map b/src/spdk/dpdk/lib/librte_kni/rte_kni_version.map new file mode 100644 index 00000000..acd515eb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kni/rte_kni_version.map @@ -0,0 +1,17 @@ +DPDK_2.0 { + global: + + rte_kni_alloc; + rte_kni_close; + rte_kni_get; + rte_kni_get_name; + rte_kni_handle_request; + rte_kni_init; + rte_kni_register_handlers; + rte_kni_release; + rte_kni_rx_burst; + rte_kni_tx_burst; + rte_kni_unregister_handlers; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_kvargs/Makefile b/src/spdk/dpdk/lib/librte_kvargs/Makefile new file mode 100644 index 00000000..87593954 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kvargs/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2014 6WIND S.A. + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_kvargs.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include + +EXPORT_MAP := rte_kvargs_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_KVARGS) := rte_kvargs.c + +# install includes +INCS := rte_kvargs.h +SYMLINK-$(CONFIG_RTE_LIBRTE_KVARGS)-include := $(INCS) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_kvargs/meson.build b/src/spdk/dpdk/lib/librte_kvargs/meson.build new file mode 100644 index 00000000..acd3e543 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kvargs/meson.build @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +includes = [global_inc] +includes += include_directories('../librte_eal/common/include') + +version = 1 +sources = files('rte_kvargs.c') +headers = files('rte_kvargs.h') + +deps += 'compat' diff --git a/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.c b/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.c new file mode 100644 index 00000000..a28f7694 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.c @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2013 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#include +#include + +#include + +#include "rte_kvargs.h" + +/* + * Receive a string with a list of arguments following the pattern + * key=value,key=value,... and insert them into the list. + * strtok() is used so the params string will be copied to be modified. 
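+ * For example, "foo=1,foo=2,bar=x" produces three pairs in order:
+ * ("foo", "1"), ("foo", "2") and ("bar", "x").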
+ */ +static int +rte_kvargs_tokenize(struct rte_kvargs *kvlist, const char *params) +{ + unsigned i; + char *str; + char *ctx1 = NULL; + char *ctx2 = NULL; + + /* Copy the const char *params to a modifiable string + * to pass to rte_strsplit + */ + kvlist->str = strdup(params); + if (kvlist->str == NULL) + return -1; + + /* browse each key/value pair and add it in kvlist */ + str = kvlist->str; + while ((str = strtok_r(str, RTE_KVARGS_PAIRS_DELIM, &ctx1)) != NULL) { + + i = kvlist->count; + if (i >= RTE_KVARGS_MAX) + return -1; + + kvlist->pairs[i].key = strtok_r(str, RTE_KVARGS_KV_DELIM, &ctx2); + kvlist->pairs[i].value = strtok_r(NULL, RTE_KVARGS_KV_DELIM, &ctx2); + if (kvlist->pairs[i].key == NULL || + kvlist->pairs[i].value == NULL) + return -1; + + kvlist->count++; + str = NULL; + } + + return 0; +} + +/* + * Determine whether a key is valid or not by looking + * into a list of valid keys. + */ +static int +is_valid_key(const char * const valid[], const char *key_match) +{ + const char * const *valid_ptr; + + for (valid_ptr = valid; *valid_ptr != NULL; valid_ptr++) { + if (strcmp(key_match, *valid_ptr) == 0) + return 1; + } + return 0; +} + +/* + * Determine whether all keys are valid or not by looking + * into a list of valid keys. + */ +static int +check_for_valid_keys(struct rte_kvargs *kvlist, + const char * const valid[]) +{ + unsigned i, ret; + struct rte_kvargs_pair *pair; + + for (i = 0; i < kvlist->count; i++) { + pair = &kvlist->pairs[i]; + ret = is_valid_key(valid, pair->key); + if (!ret) + return -1; + } + return 0; +} + +/* + * Return the number of times a given arg_name exists in the key/value list. + * E.g. given a list = { rx = 0, rx = 1, tx = 2 } the number of args for + * arg "rx" will be 2. + */ +unsigned +rte_kvargs_count(const struct rte_kvargs *kvlist, const char *key_match) +{ + const struct rte_kvargs_pair *pair; + unsigned i, ret; + + ret = 0; + for (i = 0; i < kvlist->count; i++) { + pair = &kvlist->pairs[i]; + if (key_match == NULL || strcmp(pair->key, key_match) == 0) + ret++; + } + + return ret; +} + +/* + * For each matching key, call the given handler function. + */ +int +rte_kvargs_process(const struct rte_kvargs *kvlist, + const char *key_match, + arg_handler_t handler, + void *opaque_arg) +{ + const struct rte_kvargs_pair *pair; + unsigned i; + + for (i = 0; i < kvlist->count; i++) { + pair = &kvlist->pairs[i]; + if (key_match == NULL || strcmp(pair->key, key_match) == 0) { + if ((*handler)(pair->key, pair->value, opaque_arg) < 0) + return -1; + } + } + return 0; +} + +/* free the rte_kvargs structure */ +void +rte_kvargs_free(struct rte_kvargs *kvlist) +{ + if (!kvlist) + return; + + free(kvlist->str); + free(kvlist); +} + +/* + * Parse the arguments "key=value,key=value,..." string and return + * an allocated structure that contains a key/value list. Also + * check if only valid keys were used. 
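+ * Illustrative call sequence (the key name "foo" is only an example):
+ *   static const char * const valid[] = { "foo", NULL };
+ *   struct rte_kvargs *kv = rte_kvargs_parse("foo=1,foo=2", valid);
+ *   ... rte_kvargs_count() / rte_kvargs_process() ...
+ *   rte_kvargs_free(kv);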
+ */ +struct rte_kvargs * +rte_kvargs_parse(const char *args, const char * const valid_keys[]) +{ + struct rte_kvargs *kvlist; + + kvlist = malloc(sizeof(*kvlist)); + if (kvlist == NULL) + return NULL; + memset(kvlist, 0, sizeof(*kvlist)); + + if (rte_kvargs_tokenize(kvlist, args) < 0) { + rte_kvargs_free(kvlist); + return NULL; + } + + if (valid_keys != NULL && check_for_valid_keys(kvlist, valid_keys) < 0) { + rte_kvargs_free(kvlist); + return NULL; + } + + return kvlist; +} + +__rte_experimental +struct rte_kvargs * +rte_kvargs_parse_delim(const char *args, const char * const valid_keys[], + const char *valid_ends) +{ + struct rte_kvargs *kvlist = NULL; + char *copy; + size_t len; + + if (valid_ends == NULL) + return rte_kvargs_parse(args, valid_keys); + + copy = strdup(args); + if (copy == NULL) + return NULL; + + len = strcspn(copy, valid_ends); + copy[len] = '\0'; + + kvlist = rte_kvargs_parse(copy, valid_keys); + + free(copy); + return kvlist; +} + +__rte_experimental +int +rte_kvargs_strcmp(const char *key __rte_unused, + const char *value, void *opaque) +{ + const char *str = opaque; + + return -abs(strcmp(str, value)); +} diff --git a/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.h b/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.h new file mode 100644 index 00000000..fc041956 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2013 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#ifndef _RTE_KVARGS_H_ +#define _RTE_KVARGS_H_ + +/** + * @file + * RTE Argument parsing + * + * This module can be used to parse arguments whose format is + * key1=value1,key2=value2,key3=value3,... + * + * The same key can appear several times with the same or a different + * value. Indeed, the arguments are stored as a list of key/values + * associations and not as a dictionary. + * + * This file provides some helpers that are especially used by virtual + * ethernet devices at initialization for arguments parsing. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** Maximum number of key/value associations */ +#define RTE_KVARGS_MAX 32 + +/** separator character used between each pair */ +#define RTE_KVARGS_PAIRS_DELIM "," + +/** separator character used between key and value */ +#define RTE_KVARGS_KV_DELIM "=" + +/** Type of callback function used by rte_kvargs_process() */ +typedef int (*arg_handler_t)(const char *key, const char *value, void *opaque); + +/** A key/value association */ +struct rte_kvargs_pair { + char *key; /**< the name (key) of the association */ + char *value; /**< the value associated to that key */ +}; + +/** Store a list of key/value associations */ +struct rte_kvargs { + char *str; /**< copy of the argument string */ + unsigned count; /**< number of entries in the list */ + struct rte_kvargs_pair pairs[RTE_KVARGS_MAX]; /**< list of key/values */ +}; + +/** + * Allocate a rte_kvargs and store key/value associations from a string + * + * The function allocates and fills a rte_kvargs structure from a given + * string whose format is key1=value1,key2=value2,... + * + * The structure can be freed with rte_kvargs_free(). + * + * @param args + * The input string containing the key/value associations + * @param valid_keys + * A list of valid keys (table of const char *, the last must be NULL). 
+ * This argument is ignored if NULL + * + * @return + * - A pointer to an allocated rte_kvargs structure on success + * - NULL on error + */ +struct rte_kvargs *rte_kvargs_parse(const char *args, + const char *const valid_keys[]); + +/** + * Allocate a rte_kvargs and store key/value associations from a string. + * This version will consider any byte from valid_ends as a possible + * terminating character, and will not parse beyond any of their occurrence. + * + * The function allocates and fills an rte_kvargs structure from a given + * string whose format is key1=value1,key2=value2,... + * + * The structure can be freed with rte_kvargs_free(). + * + * @param args + * The input string containing the key/value associations + * + * @param valid_keys + * A list of valid keys (table of const char *, the last must be NULL). + * This argument is ignored if NULL + * + * @param valid_ends + * Acceptable terminating characters. + * If NULL, the behavior is the same as ``rte_kvargs_parse``. + * + * @return + * - A pointer to an allocated rte_kvargs structure on success + * - NULL on error + */ +__rte_experimental +struct rte_kvargs *rte_kvargs_parse_delim(const char *args, + const char *const valid_keys[], + const char *valid_ends); + +/** + * Free a rte_kvargs structure + * + * Free a rte_kvargs structure previously allocated with + * rte_kvargs_parse(). + * + * @param kvlist + * The rte_kvargs structure + */ +void rte_kvargs_free(struct rte_kvargs *kvlist); + +/** + * Call a handler function for each key/value matching the key + * + * For each key/value association that matches the given key, calls the + * handler function with the for a given arg_name passing the value on the + * dictionary for that key and a given extra argument. If *kvlist* is NULL + * function does nothing. + * + * @param kvlist + * The rte_kvargs structure + * @param key_match + * The key on which the handler should be called, or NULL to process handler + * on all associations + * @param handler + * The function to call for each matching key + * @param opaque_arg + * A pointer passed unchanged to the handler + * + * @return + * - 0 on success + * - Negative on error + */ +int rte_kvargs_process(const struct rte_kvargs *kvlist, + const char *key_match, arg_handler_t handler, void *opaque_arg); + +/** + * Count the number of associations matching the given key + * + * @param kvlist + * The rte_kvargs structure + * @param key_match + * The key that should match, or NULL to count all associations + + * @return + * The number of entries + */ +unsigned rte_kvargs_count(const struct rte_kvargs *kvlist, + const char *key_match); + +/** + * Generic kvarg handler for string comparison. + * + * This function can be used for a generic string comparison processing + * on a list of kvargs. + * + * @param key + * kvarg pair key. + * + * @param value + * kvarg pair value. + * + * @param opaque + * Opaque pointer to a string. + * + * @return + * 0 if the strings match. + * !0 otherwise or on error. + * + * Unless strcmp, comparison ordering is not kept. + * In order for rte_kvargs_process to stop processing on match error, + * a negative value is returned even if strcmp had returned a positive one. 
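+ *
+ * Illustrative use, passing the string to match through the opaque argument
+ * of rte_kvargs_process() (the key name "mode" is only an example):
+ *   rte_kvargs_process(kvlist, "mode", rte_kvargs_strcmp, (void *)"fast");
+ * A return of 0 then means every "mode" value equalled "fast" (or that no
+ * such key was present).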
+ */ +__rte_experimental +int rte_kvargs_strcmp(const char *key, const char *value, void *opaque); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs_version.map b/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs_version.map new file mode 100644 index 00000000..8f4b4e3f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_kvargs/rte_kvargs_version.map @@ -0,0 +1,18 @@ +DPDK_2.0 { + global: + + rte_kvargs_count; + rte_kvargs_free; + rte_kvargs_parse; + rte_kvargs_process; + + local: *; +}; + +EXPERIMENTAL { + global: + + rte_kvargs_parse_delim; + rte_kvargs_strcmp; + +} DPDK_2.0; diff --git a/src/spdk/dpdk/lib/librte_latencystats/Makefile b/src/spdk/dpdk/lib/librte_latencystats/Makefile new file mode 100644 index 00000000..ae0dbd8f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_latencystats/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_latencystats.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +LDLIBS += -lm +LDLIBS += -lpthread +LDLIBS += -lrte_eal -lrte_metrics -lrte_ethdev -lrte_mbuf + +EXPORT_MAP := rte_latencystats_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS) := rte_latencystats.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_LATENCY_STATS)-include := rte_latencystats.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_latencystats/meson.build b/src/spdk/dpdk/lib/librte_latencystats/meson.build new file mode 100644 index 00000000..286558dd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_latencystats/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_latencystats.c') +headers = files('rte_latencystats.h') +deps += ['metrics', 'ethdev'] diff --git a/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats.c b/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats.c new file mode 100644 index 00000000..1fdec68e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats.c @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "rte_latencystats.h" + +/** Nano seconds per second */ +#define NS_PER_SEC 1E9 + +/** Clock cycles per nano second */ +static uint64_t +latencystat_cycles_per_ns(void) +{ + return rte_get_timer_hz() / NS_PER_SEC; +} + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_LATENCY_STATS RTE_LOGTYPE_USER1 + +static const char *MZ_RTE_LATENCY_STATS = "rte_latencystats"; +static int latency_stats_index; +static uint64_t samp_intvl; +static uint64_t timer_tsc; +static uint64_t prev_tsc; + +struct rte_latency_stats { + float min_latency; /**< Minimum latency in nano seconds */ + float avg_latency; /**< Average latency in nano seconds */ + float max_latency; /**< Maximum latency in nano seconds */ + float jitter; /** Latency variation */ +}; + +static struct rte_latency_stats *glob_stats; + +struct rxtx_cbs { + const struct rte_eth_rxtx_callback *cb; +}; + +static struct rxtx_cbs rx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT]; +static struct rxtx_cbs tx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT]; + +struct latency_stats_nameoff { + char name[RTE_ETH_XSTATS_NAME_SIZE]; + unsigned int offset; +}; + +static const struct latency_stats_nameoff 
lat_stats_strings[] = { + {"min_latency_ns", offsetof(struct rte_latency_stats, min_latency)}, + {"avg_latency_ns", offsetof(struct rte_latency_stats, avg_latency)}, + {"max_latency_ns", offsetof(struct rte_latency_stats, max_latency)}, + {"jitter_ns", offsetof(struct rte_latency_stats, jitter)}, +}; + +#define NUM_LATENCY_STATS (sizeof(lat_stats_strings) / \ + sizeof(lat_stats_strings[0])) + +int32_t +rte_latencystats_update(void) +{ + unsigned int i; + float *stats_ptr = NULL; + uint64_t values[NUM_LATENCY_STATS] = {0}; + int ret; + + for (i = 0; i < NUM_LATENCY_STATS; i++) { + stats_ptr = RTE_PTR_ADD(glob_stats, + lat_stats_strings[i].offset); + values[i] = (uint64_t)floor((*stats_ptr)/ + latencystat_cycles_per_ns()); + } + + ret = rte_metrics_update_values(RTE_METRICS_GLOBAL, + latency_stats_index, + values, NUM_LATENCY_STATS); + if (ret < 0) + RTE_LOG(INFO, LATENCY_STATS, "Failed to push the stats\n"); + + return ret; +} + +static void +rte_latencystats_fill_values(struct rte_metric_value *values) +{ + unsigned int i; + float *stats_ptr = NULL; + + for (i = 0; i < NUM_LATENCY_STATS; i++) { + stats_ptr = RTE_PTR_ADD(glob_stats, + lat_stats_strings[i].offset); + values[i].key = i; + values[i].value = (uint64_t)floor((*stats_ptr)/ + latencystat_cycles_per_ns()); + } +} + +static uint16_t +add_time_stamps(uint16_t pid __rte_unused, + uint16_t qid __rte_unused, + struct rte_mbuf **pkts, + uint16_t nb_pkts, + uint16_t max_pkts __rte_unused, + void *user_cb __rte_unused) +{ + unsigned int i; + uint64_t diff_tsc, now; + + /* + * For every sample interval, + * time stamp is marked on one received packet. + */ + now = rte_rdtsc(); + for (i = 0; i < nb_pkts; i++) { + diff_tsc = now - prev_tsc; + timer_tsc += diff_tsc; + if (timer_tsc >= samp_intvl) { + pkts[i]->timestamp = now; + timer_tsc = 0; + } + prev_tsc = now; + now = rte_rdtsc(); + } + + return nb_pkts; +} + +static uint16_t +calc_latency(uint16_t pid __rte_unused, + uint16_t qid __rte_unused, + struct rte_mbuf **pkts, + uint16_t nb_pkts, + void *_ __rte_unused) +{ + unsigned int i, cnt = 0; + uint64_t now; + float latency[nb_pkts]; + static float prev_latency; + /* + * Alpha represents degree of weighting decrease in EWMA, + * a constant smoothing factor between 0 and 1. The value + * is used below for measuring average latency. + */ + const float alpha = 0.2; + + now = rte_rdtsc(); + for (i = 0; i < nb_pkts; i++) { + if (pkts[i]->timestamp) + latency[cnt++] = now - pkts[i]->timestamp; + } + + for (i = 0; i < cnt; i++) { + /* + * The jitter is calculated as statistical mean of interpacket + * delay variation. The "jitter estimate" is computed by taking + * the absolute values of the ipdv sequence and applying an + * exponential filter with parameter 1/16 to generate the + * estimate. i.e J=J+(|D(i-1,i)|-J)/16. Where J is jitter, + * D(i-1,i) is difference in latency of two consecutive packets + * i-1 and i. + * Reference: Calculated as per RFC 5481, sec 4.1, + * RFC 3393 sec 4.5, RFC 1889 sec. + */ + glob_stats->jitter += (fabsf(prev_latency - latency[i]) + - glob_stats->jitter)/16; + if (glob_stats->min_latency == 0) + glob_stats->min_latency = latency[i]; + else if (latency[i] < glob_stats->min_latency) + glob_stats->min_latency = latency[i]; + else if (latency[i] > glob_stats->max_latency) + glob_stats->max_latency = latency[i]; + /* + * The average latency is measured using exponential moving + * average, i.e. 
using EWMA + * https://en.wikipedia.org/wiki/Moving_average + */ + glob_stats->avg_latency += + alpha * (latency[i] - glob_stats->avg_latency); + prev_latency = latency[i]; + } + + return nb_pkts; +} + +int +rte_latencystats_init(uint64_t app_samp_intvl, + rte_latency_stats_flow_type_fn user_cb) +{ + unsigned int i; + uint16_t pid; + uint16_t qid; + struct rxtx_cbs *cbs = NULL; + const char *ptr_strings[NUM_LATENCY_STATS] = {0}; + const struct rte_memzone *mz = NULL; + const unsigned int flags = 0; + + if (rte_memzone_lookup(MZ_RTE_LATENCY_STATS)) + return -EEXIST; + + /** Allocate stats in shared memory fo multi process support */ + mz = rte_memzone_reserve(MZ_RTE_LATENCY_STATS, sizeof(*glob_stats), + rte_socket_id(), flags); + if (mz == NULL) { + RTE_LOG(ERR, LATENCY_STATS, "Cannot reserve memory: %s:%d\n", + __func__, __LINE__); + return -ENOMEM; + } + + glob_stats = mz->addr; + samp_intvl = app_samp_intvl * latencystat_cycles_per_ns(); + + /** Register latency stats with stats library */ + for (i = 0; i < NUM_LATENCY_STATS; i++) + ptr_strings[i] = lat_stats_strings[i].name; + + latency_stats_index = rte_metrics_reg_names(ptr_strings, + NUM_LATENCY_STATS); + if (latency_stats_index < 0) { + RTE_LOG(DEBUG, LATENCY_STATS, + "Failed to register latency stats names\n"); + return -1; + } + + /** Register Rx/Tx callbacks */ + RTE_ETH_FOREACH_DEV(pid) { + struct rte_eth_dev_info dev_info; + rte_eth_dev_info_get(pid, &dev_info); + for (qid = 0; qid < dev_info.nb_rx_queues; qid++) { + cbs = &rx_cbs[pid][qid]; + cbs->cb = rte_eth_add_first_rx_callback(pid, qid, + add_time_stamps, user_cb); + if (!cbs->cb) + RTE_LOG(INFO, LATENCY_STATS, "Failed to " + "register Rx callback for pid=%d, " + "qid=%d\n", pid, qid); + } + for (qid = 0; qid < dev_info.nb_tx_queues; qid++) { + cbs = &tx_cbs[pid][qid]; + cbs->cb = rte_eth_add_tx_callback(pid, qid, + calc_latency, user_cb); + if (!cbs->cb) + RTE_LOG(INFO, LATENCY_STATS, "Failed to " + "register Tx callback for pid=%d, " + "qid=%d\n", pid, qid); + } + } + return 0; +} + +int +rte_latencystats_uninit(void) +{ + uint16_t pid; + uint16_t qid; + int ret = 0; + struct rxtx_cbs *cbs = NULL; + const struct rte_memzone *mz = NULL; + + /** De register Rx/Tx callbacks */ + RTE_ETH_FOREACH_DEV(pid) { + struct rte_eth_dev_info dev_info; + rte_eth_dev_info_get(pid, &dev_info); + for (qid = 0; qid < dev_info.nb_rx_queues; qid++) { + cbs = &rx_cbs[pid][qid]; + ret = rte_eth_remove_rx_callback(pid, qid, cbs->cb); + if (ret) + RTE_LOG(INFO, LATENCY_STATS, "failed to " + "remove Rx callback for pid=%d, " + "qid=%d\n", pid, qid); + } + for (qid = 0; qid < dev_info.nb_tx_queues; qid++) { + cbs = &tx_cbs[pid][qid]; + ret = rte_eth_remove_tx_callback(pid, qid, cbs->cb); + if (ret) + RTE_LOG(INFO, LATENCY_STATS, "failed to " + "remove Tx callback for pid=%d, " + "qid=%d\n", pid, qid); + } + } + + /* free up the memzone */ + mz = rte_memzone_lookup(MZ_RTE_LATENCY_STATS); + if (mz) + rte_memzone_free(mz); + + return 0; +} + +int +rte_latencystats_get_names(struct rte_metric_name *names, uint16_t size) +{ + unsigned int i; + + if (names == NULL || size < NUM_LATENCY_STATS) + return NUM_LATENCY_STATS; + + for (i = 0; i < NUM_LATENCY_STATS; i++) + snprintf(names[i].name, sizeof(names[i].name), + "%s", lat_stats_strings[i].name); + + return NUM_LATENCY_STATS; +} + +int +rte_latencystats_get(struct rte_metric_value *values, uint16_t size) +{ + if (size < NUM_LATENCY_STATS || values == NULL) + return NUM_LATENCY_STATS; + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + const 
struct rte_memzone *mz; + mz = rte_memzone_lookup(MZ_RTE_LATENCY_STATS); + if (mz == NULL) { + RTE_LOG(ERR, LATENCY_STATS, + "Latency stats memzone not found\n"); + return -ENOMEM; + } + glob_stats = mz->addr; + } + + /* Retrieve latency stats */ + rte_latencystats_fill_values(values); + + return NUM_LATENCY_STATS; +} diff --git a/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats.h b/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats.h new file mode 100644 index 00000000..efcfa028 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_LATENCYSTATS_H_ +#define _RTE_LATENCYSTATS_H_ + +/** + * @file + * RTE latency stats + * + * library to provide application and flow based latency stats. + */ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Note: This function pointer is for future flow based latency stats + * implementation. + * + * Function type used for identifting flow types of a Rx packet. + * + * The callback function is called on Rx for each packet. + * This function is used for flow based latency calculations. + * + * @param pkt + * Packet that has to be identified with its flow types. + * @param user_param + * The arbitrary user parameter passed in by the application when + * the callback was originally configured. + * @return + * The flow_mask, representing the multiple flow types of a packet. + */ +typedef uint16_t (*rte_latency_stats_flow_type_fn)(struct rte_mbuf *pkt, + void *user_param); + +/** + * Registers Rx/Tx callbacks for each active port, queue. + * + * @param samp_intvl + * Sampling time period in nano seconds, at which packet + * should be marked with time stamp. + * @param user_cb + * Note: This param is for future flow based latency stats + * implementation. + * User callback to be called to get flow types of a packet. + * Used for flow based latency calculation. + * If the value is NULL, global stats will be calculated, + * else flow based latency stats will be calculated. + * For now just pass on the NULL value to this param. + * @return + * -1 : On error + * -ENOMEM: On error + * 0 : On success + */ +int rte_latencystats_init(uint64_t samp_intvl, + rte_latency_stats_flow_type_fn user_cb); + +/** + * Calculates the latency and jitter values internally, exposing the updated + * values via *rte_latencystats_get* or the rte_metrics API. + * @return: + * 0 : on Success + * < 0 : Error in updating values. + */ +int32_t rte_latencystats_update(void); + +/** + * Removes registered Rx/Tx callbacks for each active port, queue. + * + * @return + * -1: On error + * 0: On success + */ +int rte_latencystats_uninit(void); + +/** + * Retrieve names of latency statistics + * + * @param names + * Block of memory to insert names into. Must be at least size in capacity. + * If set to NULL, function returns required capacity. + * @param size + * Capacity of latency stats names (number of names). + * @return + * - positive value lower or equal to size: success. The return value + * is the number of entries filled in the stats table. + * - positive value higher than size: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. 
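A minimal usage sketch for the API declared in this header, assuming rte_eal_init(), rte_metrics_init() and port setup have already run. The one-second sampling interval, the helper names and the fixed table size of four (min/avg/max/jitter, as registered by the implementation shown above) are choices made for the example, not requirements of the API.

#include <stdio.h>
#include <inttypes.h>
#include <rte_metrics.h>
#include <rte_latencystats.h>

#define LAT_STATS 4	/* the implementation currently registers four metrics */

/* Attach the Rx/Tx callbacks; sample roughly once per second (1e9 ns). */
static int
latency_monitor_start(void)
{
	return rte_latencystats_init(1000000000ULL, NULL);
}

/* Call periodically from the main loop to refresh and print the metrics. */
static void
latency_monitor_poll(void)
{
	struct rte_metric_name names[LAT_STATS];
	struct rte_metric_value values[LAT_STATS];
	int i, n;

	rte_latencystats_update();	/* push values into the metrics library */

	n = rte_latencystats_get_names(names, LAT_STATS);
	if (n > LAT_STATS || rte_latencystats_get(values, LAT_STATS) != n)
		return;	/* table too small or stats unavailable */

	for (i = 0; i < n; i++)
		printf("%s: %" PRIu64 "\n", names[i].name, values[i].value);
}

/* Detach the callbacks on shutdown. */
static void
latency_monitor_stop(void)
{
	rte_latencystats_uninit();
}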
+ */ +int rte_latencystats_get_names(struct rte_metric_name *names, + uint16_t size); + +/** + * Retrieve latency statistics. + * + * @param values + * A pointer to a table of structure of type *rte_metric_value* + * to be filled with latency statistics ids and values. + * This parameter can be set to NULL if size is 0. + * @param size + * The size of the stats table, which should be large enough to store + * all the latency stats. + * @return + * - positive value lower or equal to size: success. The return value + * is the number of entries filled in the stats table. + * - positive value higher than size: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. + * -ENOMEM: On failure. + */ +int rte_latencystats_get(struct rte_metric_value *values, + uint16_t size); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LATENCYSTATS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats_version.map b/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats_version.map new file mode 100644 index 00000000..ac8403e8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_latencystats/rte_latencystats_version.map @@ -0,0 +1,11 @@ +DPDK_17.05 { + global: + + rte_latencystats_get; + rte_latencystats_get_names; + rte_latencystats_init; + rte_latencystats_uninit; + rte_latencystats_update; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_lpm/Makefile b/src/spdk/dpdk/lib/librte_lpm/Makefile new file mode 100644 index 00000000..482bd72e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_lpm.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_lpm_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_LPM) := rte_lpm.c rte_lpm6.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include := rte_lpm.h rte_lpm6.h + +ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),) +SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_neon.h +else ifeq ($(CONFIG_RTE_ARCH_X86),y) +SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_sse.h +else ifeq ($(CONFIG_RTE_ARCH_PPC_64),y) +SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_altivec.h +endif + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_lpm/meson.build b/src/spdk/dpdk/lib/librte_lpm/meson.build new file mode 100644 index 00000000..06784942 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 2 +sources = files('rte_lpm.c', 'rte_lpm6.c') +headers = files('rte_lpm.h', 'rte_lpm6.h') +# since header files have different names, we can install all vector headers +# without worrying about which architecture we actually need +headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h', 'rte_lpm_sse.h') diff --git a/src/spdk/dpdk/lib/librte_lpm/rte_lpm.c b/src/spdk/dpdk/lib/librte_lpm/rte_lpm.c new file mode 100644 index 00000000..d00b13d9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/rte_lpm.c @@ -0,0 +1,1887 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include 
+#include +#include /* for definition of RTE_CACHE_LINE_SIZE */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_lpm.h" + +TAILQ_HEAD(rte_lpm_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_lpm_tailq = { + .name = "RTE_LPM", +}; +EAL_REGISTER_TAILQ(rte_lpm_tailq) + +#define MAX_DEPTH_TBL24 24 + +enum valid_flag { + INVALID = 0, + VALID +}; + +/* Macro to enable/disable run-time checks. */ +#if defined(RTE_LIBRTE_LPM_DEBUG) +#include +#define VERIFY_DEPTH(depth) do { \ + if ((depth == 0) || (depth > RTE_LPM_MAX_DEPTH)) \ + rte_panic("LPM: Invalid depth (%u) at line %d", \ + (unsigned)(depth), __LINE__); \ +} while (0) +#else +#define VERIFY_DEPTH(depth) +#endif + +/* + * Converts a given depth value to its corresponding mask value. + * + * depth (IN) : range = 1 - 32 + * mask (OUT) : 32bit mask + */ +static uint32_t __attribute__((pure)) +depth_to_mask(uint8_t depth) +{ + VERIFY_DEPTH(depth); + + /* To calculate a mask start with a 1 on the left hand side and right + * shift while populating the left hand side with 1's + */ + return (int)0x80000000 >> (depth - 1); +} + +/* + * Converts given depth value to its corresponding range value. + */ +static inline uint32_t __attribute__((pure)) +depth_to_range(uint8_t depth) +{ + VERIFY_DEPTH(depth); + + /* + * Calculate tbl24 range. (Note: 2^depth = 1 << depth) + */ + if (depth <= MAX_DEPTH_TBL24) + return 1 << (MAX_DEPTH_TBL24 - depth); + + /* Else if depth is greater than 24 */ + return 1 << (RTE_LPM_MAX_DEPTH - depth); +} + +/* + * Find an existing lpm table and return a pointer to it. + */ +struct rte_lpm_v20 * +rte_lpm_find_existing_v20(const char *name) +{ + struct rte_lpm_v20 *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, lpm_list, next) { + l = te->data; + if (strncmp(name, l->name, RTE_LPM_NAMESIZE) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} +VERSION_SYMBOL(rte_lpm_find_existing, _v20, 2.0); + +struct rte_lpm * +rte_lpm_find_existing_v1604(const char *name) +{ + struct rte_lpm *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, lpm_list, next) { + l = te->data; + if (strncmp(name, l->name, RTE_LPM_NAMESIZE) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} +BIND_DEFAULT_SYMBOL(rte_lpm_find_existing, _v1604, 16.04); +MAP_STATIC_SYMBOL(struct rte_lpm *rte_lpm_find_existing(const char *name), + rte_lpm_find_existing_v1604); + +/* + * Allocates memory for LPM object + */ +struct rte_lpm_v20 * +rte_lpm_create_v20(const char *name, int socket_id, int max_rules, + __rte_unused int flags) +{ + char mem_name[RTE_LPM_NAMESIZE]; + struct rte_lpm_v20 *lpm = NULL; + struct rte_tailq_entry *te; + uint32_t mem_size; + struct rte_lpm_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl_entry_v20) != 2); + + /* Check user arguments. 
*/ + if ((name == NULL) || (socket_id < -1) || (max_rules == 0)) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. */ + mem_size = sizeof(*lpm) + (sizeof(lpm->rules_tbl[0]) * max_rules); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = te->data; + if (strncmp(name, lpm->name, RTE_LPM_NAMESIZE) == 0) + break; + } + + if (te != NULL) { + lpm = NULL; + rte_errno = EEXIST; + goto exit; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry\n"); + rte_errno = ENOMEM; + goto exit; + } + + /* Allocate memory to store the LPM data structures. */ + lpm = rte_zmalloc_socket(mem_name, mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + /* Save user arguments. */ + lpm->max_rules = max_rules; + snprintf(lpm->name, sizeof(lpm->name), "%s", name); + + te->data = lpm; + + TAILQ_INSERT_TAIL(lpm_list, te, next); + +exit: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return lpm; +} +VERSION_SYMBOL(rte_lpm_create, _v20, 2.0); + +struct rte_lpm * +rte_lpm_create_v1604(const char *name, int socket_id, + const struct rte_lpm_config *config) +{ + char mem_name[RTE_LPM_NAMESIZE]; + struct rte_lpm *lpm = NULL; + struct rte_tailq_entry *te; + uint32_t mem_size, rules_size, tbl8s_size; + struct rte_lpm_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl_entry) != 4); + + /* Check user arguments. */ + if ((name == NULL) || (socket_id < -1) || (config->max_rules == 0) + || config->number_tbl8s > RTE_LPM_MAX_TBL8_NUM_GROUPS) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. */ + mem_size = sizeof(*lpm); + rules_size = sizeof(struct rte_lpm_rule) * config->max_rules; + tbl8s_size = (sizeof(struct rte_lpm_tbl_entry) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = te->data; + if (strncmp(name, lpm->name, RTE_LPM_NAMESIZE) == 0) + break; + } + + if (te != NULL) { + lpm = NULL; + rte_errno = EEXIST; + goto exit; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry\n"); + rte_errno = ENOMEM; + goto exit; + } + + /* Allocate memory to store the LPM data structures. 
*/ + lpm = rte_zmalloc_socket(mem_name, mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->rules_tbl = rte_zmalloc_socket(NULL, + (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules_tbl memory allocation failed\n"); + rte_free(lpm); + lpm = NULL; + rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->tbl8 = rte_zmalloc_socket(NULL, + (size_t)tbl8s_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->tbl8 == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 memory allocation failed\n"); + rte_free(lpm->rules_tbl); + rte_free(lpm); + lpm = NULL; + rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + /* Save user arguments. */ + lpm->max_rules = config->max_rules; + lpm->number_tbl8s = config->number_tbl8s; + snprintf(lpm->name, sizeof(lpm->name), "%s", name); + + te->data = lpm; + + TAILQ_INSERT_TAIL(lpm_list, te, next); + +exit: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return lpm; +} +BIND_DEFAULT_SYMBOL(rte_lpm_create, _v1604, 16.04); +MAP_STATIC_SYMBOL( + struct rte_lpm *rte_lpm_create(const char *name, int socket_id, + const struct rte_lpm_config *config), rte_lpm_create_v1604); + +/* + * Deallocates memory for given LPM table. + */ +void +rte_lpm_free_v20(struct rte_lpm_v20 *lpm) +{ + struct rte_lpm_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_free(lpm); + rte_free(te); +} +VERSION_SYMBOL(rte_lpm_free, _v20, 2.0); + +void +rte_lpm_free_v1604(struct rte_lpm *lpm) +{ + struct rte_lpm_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_free(lpm->tbl8); + rte_free(lpm->rules_tbl); + rte_free(lpm); + rte_free(te); +} +BIND_DEFAULT_SYMBOL(rte_lpm_free, _v1604, 16.04); +MAP_STATIC_SYMBOL(void rte_lpm_free(struct rte_lpm *lpm), + rte_lpm_free_v1604); + +/* + * Adds a rule to the rule table. + * + * NOTE: The rule table is split into 32 groups. Each group contains rules that + * apply to a specific prefix depth (i.e. group 1 contains rules that apply to + * prefixes with a depth of 1 etc.). In the following code (depth - 1) is used + * to refer to depth 1 because even though the depth range is 1 - 32, depths + * are stored in the rule table from 0 - 31. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static inline int32_t +rule_add_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth, + uint8_t next_hop) +{ + uint32_t rule_gindex, rule_index, last_rule; + int i; + + VERIFY_DEPTH(depth); + + /* Scan through rule group to see if rule already exists. */ + if (lpm->rule_info[depth - 1].used_rules > 0) { + + /* rule_gindex stands for rule group index. 
*/ + rule_gindex = lpm->rule_info[depth - 1].first_rule; + /* Initialise rule_index to point to start of rule group. */ + rule_index = rule_gindex; + /* Last rule = Last used rule in this rule group. */ + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + for (; rule_index < last_rule; rule_index++) { + + /* If rule already exists update its next_hop and return. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) { + lpm->rules_tbl[rule_index].next_hop = next_hop; + + return rule_index; + } + } + + if (rule_index == lpm->max_rules) + return -ENOSPC; + } else { + /* Calculate the position in which the rule will be stored. */ + rule_index = 0; + + for (i = depth - 1; i > 0; i--) { + if (lpm->rule_info[i - 1].used_rules > 0) { + rule_index = lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules; + break; + } + } + if (rule_index == lpm->max_rules) + return -ENOSPC; + + lpm->rule_info[depth - 1].first_rule = rule_index; + } + + /* Make room for the new rule in the array. */ + for (i = RTE_LPM_MAX_DEPTH; i > depth; i--) { + if (lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules == lpm->max_rules) + return -ENOSPC; + + if (lpm->rule_info[i - 1].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules] + = lpm->rules_tbl[lpm->rule_info[i - 1].first_rule]; + lpm->rule_info[i - 1].first_rule++; + } + } + + /* Add the new rule. */ + lpm->rules_tbl[rule_index].ip = ip_masked; + lpm->rules_tbl[rule_index].next_hop = next_hop; + + /* Increment the used rules counter for this rule group. */ + lpm->rule_info[depth - 1].used_rules++; + + return rule_index; +} + +static inline int32_t +rule_add_v1604(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint32_t next_hop) +{ + uint32_t rule_gindex, rule_index, last_rule; + int i; + + VERIFY_DEPTH(depth); + + /* Scan through rule group to see if rule already exists. */ + if (lpm->rule_info[depth - 1].used_rules > 0) { + + /* rule_gindex stands for rule group index. */ + rule_gindex = lpm->rule_info[depth - 1].first_rule; + /* Initialise rule_index to point to start of rule group. */ + rule_index = rule_gindex; + /* Last rule = Last used rule in this rule group. */ + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + for (; rule_index < last_rule; rule_index++) { + + /* If rule already exists update its next_hop and return. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) { + lpm->rules_tbl[rule_index].next_hop = next_hop; + + return rule_index; + } + } + + if (rule_index == lpm->max_rules) + return -ENOSPC; + } else { + /* Calculate the position in which the rule will be stored. */ + rule_index = 0; + + for (i = depth - 1; i > 0; i--) { + if (lpm->rule_info[i - 1].used_rules > 0) { + rule_index = lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules; + break; + } + } + if (rule_index == lpm->max_rules) + return -ENOSPC; + + lpm->rule_info[depth - 1].first_rule = rule_index; + } + + /* Make room for the new rule in the array. */ + for (i = RTE_LPM_MAX_DEPTH; i > depth; i--) { + if (lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules == lpm->max_rules) + return -ENOSPC; + + if (lpm->rule_info[i - 1].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i - 1].first_rule + + lpm->rule_info[i - 1].used_rules] + = lpm->rules_tbl[lpm->rule_info[i - 1].first_rule]; + lpm->rule_info[i - 1].first_rule++; + } + } + + /* Add the new rule. 
*/ + lpm->rules_tbl[rule_index].ip = ip_masked; + lpm->rules_tbl[rule_index].next_hop = next_hop; + + /* Increment the used rules counter for this rule group. */ + lpm->rule_info[depth - 1].used_rules++; + + return rule_index; +} + +/* + * Delete a rule from the rule table. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static inline void +rule_delete_v20(struct rte_lpm_v20 *lpm, int32_t rule_index, uint8_t depth) +{ + int i; + + VERIFY_DEPTH(depth); + + lpm->rules_tbl[rule_index] = + lpm->rules_tbl[lpm->rule_info[depth - 1].first_rule + + lpm->rule_info[depth - 1].used_rules - 1]; + + for (i = depth; i < RTE_LPM_MAX_DEPTH; i++) { + if (lpm->rule_info[i].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i].first_rule - 1] = + lpm->rules_tbl[lpm->rule_info[i].first_rule + + lpm->rule_info[i].used_rules - 1]; + lpm->rule_info[i].first_rule--; + } + } + + lpm->rule_info[depth - 1].used_rules--; +} + +static inline void +rule_delete_v1604(struct rte_lpm *lpm, int32_t rule_index, uint8_t depth) +{ + int i; + + VERIFY_DEPTH(depth); + + lpm->rules_tbl[rule_index] = + lpm->rules_tbl[lpm->rule_info[depth - 1].first_rule + + lpm->rule_info[depth - 1].used_rules - 1]; + + for (i = depth; i < RTE_LPM_MAX_DEPTH; i++) { + if (lpm->rule_info[i].used_rules > 0) { + lpm->rules_tbl[lpm->rule_info[i].first_rule - 1] = + lpm->rules_tbl[lpm->rule_info[i].first_rule + + lpm->rule_info[i].used_rules - 1]; + lpm->rule_info[i].first_rule--; + } + } + + lpm->rule_info[depth - 1].used_rules--; +} + +/* + * Finds a rule in rule table. + * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + */ +static inline int32_t +rule_find_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth) +{ + uint32_t rule_gindex, last_rule, rule_index; + + VERIFY_DEPTH(depth); + + rule_gindex = lpm->rule_info[depth - 1].first_rule; + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + /* Scan used rules at given depth to find rule. */ + for (rule_index = rule_gindex; rule_index < last_rule; rule_index++) { + /* If rule is found return the rule index. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) + return rule_index; + } + + /* If rule is not found return -EINVAL. */ + return -EINVAL; +} + +static inline int32_t +rule_find_v1604(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth) +{ + uint32_t rule_gindex, last_rule, rule_index; + + VERIFY_DEPTH(depth); + + rule_gindex = lpm->rule_info[depth - 1].first_rule; + last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + + /* Scan used rules at given depth to find rule. */ + for (rule_index = rule_gindex; rule_index < last_rule; rule_index++) { + /* If rule is found return the rule index. */ + if (lpm->rules_tbl[rule_index].ip == ip_masked) + return rule_index; + } + + /* If rule is not found return -EINVAL. */ + return -EINVAL; +} + +/* + * Find, clean and allocate a tbl8. + */ +static inline int32_t +tbl8_alloc_v20(struct rte_lpm_tbl_entry_v20 *tbl8) +{ + uint32_t group_idx; /* tbl8 group index. */ + struct rte_lpm_tbl_entry_v20 *tbl8_entry; + + /* Scan through tbl8 to find a free (i.e. INVALID) tbl8 group. */ + for (group_idx = 0; group_idx < RTE_LPM_TBL8_NUM_GROUPS; + group_idx++) { + tbl8_entry = &tbl8[group_idx * RTE_LPM_TBL8_GROUP_NUM_ENTRIES]; + /* If a free tbl8 group is found clean it and set as VALID. 
*/ + if (!tbl8_entry->valid_group) { + memset(&tbl8_entry[0], 0, + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * + sizeof(tbl8_entry[0])); + + tbl8_entry->valid_group = VALID; + + /* Return group index for allocated tbl8 group. */ + return group_idx; + } + } + + /* If there are no tbl8 groups free then return error. */ + return -ENOSPC; +} + +static inline int32_t +tbl8_alloc_v1604(struct rte_lpm_tbl_entry *tbl8, uint32_t number_tbl8s) +{ + uint32_t group_idx; /* tbl8 group index. */ + struct rte_lpm_tbl_entry *tbl8_entry; + + /* Scan through tbl8 to find a free (i.e. INVALID) tbl8 group. */ + for (group_idx = 0; group_idx < number_tbl8s; group_idx++) { + tbl8_entry = &tbl8[group_idx * RTE_LPM_TBL8_GROUP_NUM_ENTRIES]; + /* If a free tbl8 group is found clean it and set as VALID. */ + if (!tbl8_entry->valid_group) { + memset(&tbl8_entry[0], 0, + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * + sizeof(tbl8_entry[0])); + + tbl8_entry->valid_group = VALID; + + /* Return group index for allocated tbl8 group. */ + return group_idx; + } + } + + /* If there are no tbl8 groups free then return error. */ + return -ENOSPC; +} + +static inline void +tbl8_free_v20(struct rte_lpm_tbl_entry_v20 *tbl8, uint32_t tbl8_group_start) +{ + /* Set tbl8 group invalid*/ + tbl8[tbl8_group_start].valid_group = INVALID; +} + +static inline void +tbl8_free_v1604(struct rte_lpm_tbl_entry *tbl8, uint32_t tbl8_group_start) +{ + /* Set tbl8 group invalid*/ + tbl8[tbl8_group_start].valid_group = INVALID; +} + +static inline int32_t +add_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth, + uint8_t next_hop) +{ + uint32_t tbl24_index, tbl24_range, tbl8_index, tbl8_group_end, i, j; + + /* Calculate the index into Table24. */ + tbl24_index = ip >> 8; + tbl24_range = depth_to_range(depth); + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + /* + * For invalid OR valid and non-extended tbl 24 entries set + * entry. + */ + if (!lpm->tbl24[i].valid || (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth)) { + + struct rte_lpm_tbl_entry_v20 new_tbl24_entry = { + .valid = VALID, + .valid_group = 0, + .depth = depth, + }; + new_tbl24_entry.next_hop = next_hop; + + /* Setting tbl24 entry in one go to avoid race + * conditions + */ + lpm->tbl24[i] = new_tbl24_entry; + + continue; + } + + if (lpm->tbl24[i].valid_group == 1) { + /* If tbl24 entry is valid and extended calculate the + * index into tbl8. + */ + tbl8_index = lpm->tbl24[i].group_idx * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <= depth) { + struct rte_lpm_tbl_entry_v20 + new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = depth, + }; + new_tbl8_entry.next_hop = next_hop; + + /* + * Setting tbl8 entry in one go to avoid + * race conditions + */ + lpm->tbl8[j] = new_tbl8_entry; + + continue; + } + } + } + } + + return 0; +} + +static inline int32_t +add_depth_small_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop) +{ +#define group_idx next_hop + uint32_t tbl24_index, tbl24_range, tbl8_index, tbl8_group_end, i, j; + + /* Calculate the index into Table24. */ + tbl24_index = ip >> 8; + tbl24_range = depth_to_range(depth); + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + /* + * For invalid OR valid and non-extended tbl 24 entries set + * entry. 
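+ * For example, depth_to_range(16) = 1 << (24 - 16) = 256, so a /16 route
+ * rewrites 256 consecutive tbl24 entries in this loop. An entry already
+ * owned by a deeper (more specific) route is left untouched, and an entry
+ * that points to a tbl8 group is handled by walking that group below
+ * instead.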
+ */ + if (!lpm->tbl24[i].valid || (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth)) { + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = next_hop, + .valid = VALID, + .valid_group = 0, + .depth = depth, + }; + + /* Setting tbl24 entry in one go to avoid race + * conditions + */ + lpm->tbl24[i] = new_tbl24_entry; + + continue; + } + + if (lpm->tbl24[i].valid_group == 1) { + /* If tbl24 entry is valid and extended calculate the + * index into tbl8. + */ + tbl8_index = lpm->tbl24[i].group_idx * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <= depth) { + struct rte_lpm_tbl_entry + new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = depth, + .next_hop = next_hop, + }; + + /* + * Setting tbl8 entry in one go to avoid + * race conditions + */ + lpm->tbl8[j] = new_tbl8_entry; + + continue; + } + } + } + } +#undef group_idx + return 0; +} + +static inline int32_t +add_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth, + uint8_t next_hop) +{ + uint32_t tbl24_index; + int32_t tbl8_group_index, tbl8_group_start, tbl8_group_end, tbl8_index, + tbl8_range, i; + + tbl24_index = (ip_masked >> 8); + tbl8_range = depth_to_range(depth); + + if (!lpm->tbl24[tbl24_index].valid) { + /* Search for a free tbl8 group. */ + tbl8_group_index = tbl8_alloc_v20(lpm->tbl8); + + /* Check tbl8 allocation was successful. */ + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + /* Find index into tbl8 and range. */ + tbl8_index = (tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + + (ip_masked & 0xFF); + + /* Set tbl8 entry. */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + lpm->tbl8[i].depth = depth; + lpm->tbl8[i].next_hop = next_hop; + lpm->tbl8[i].valid = VALID; + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go + */ + + struct rte_lpm_tbl_entry_v20 new_tbl24_entry = { + .group_idx = (uint8_t)tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + lpm->tbl24[tbl24_index] = new_tbl24_entry; + + } /* If valid entry but not extended calculate the index into Table8. */ + else if (lpm->tbl24[tbl24_index].valid_group == 0) { + /* Search for free tbl8 group. */ + tbl8_group_index = tbl8_alloc_v20(lpm->tbl8); + + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_group_start + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* Populate new tbl8 with tbl24 value. */ + for (i = tbl8_group_start; i < tbl8_group_end; i++) { + lpm->tbl8[i].valid = VALID; + lpm->tbl8[i].depth = lpm->tbl24[tbl24_index].depth; + lpm->tbl8[i].next_hop = + lpm->tbl24[tbl24_index].next_hop; + } + + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + /* Insert new rule into the tbl8 entry. */ + for (i = tbl8_index; i < tbl8_index + tbl8_range; i++) { + lpm->tbl8[i].valid = VALID; + lpm->tbl8[i].depth = depth; + lpm->tbl8[i].next_hop = next_hop; + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go. 
+ */ + + struct rte_lpm_tbl_entry_v20 new_tbl24_entry = { + .group_idx = (uint8_t)tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + lpm->tbl24[tbl24_index] = new_tbl24_entry; + + } else { /* + * If it is valid, extended entry calculate the index into tbl8. + */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + + if (!lpm->tbl8[i].valid || + lpm->tbl8[i].depth <= depth) { + struct rte_lpm_tbl_entry_v20 new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .valid_group = lpm->tbl8[i].valid_group, + }; + new_tbl8_entry.next_hop = next_hop; + /* + * Setting tbl8 entry in one go to avoid race + * condition + */ + lpm->tbl8[i] = new_tbl8_entry; + + continue; + } + } + } + + return 0; +} + +static inline int32_t +add_depth_big_v1604(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint32_t next_hop) +{ +#define group_idx next_hop + uint32_t tbl24_index; + int32_t tbl8_group_index, tbl8_group_start, tbl8_group_end, tbl8_index, + tbl8_range, i; + + tbl24_index = (ip_masked >> 8); + tbl8_range = depth_to_range(depth); + + if (!lpm->tbl24[tbl24_index].valid) { + /* Search for a free tbl8 group. */ + tbl8_group_index = tbl8_alloc_v1604(lpm->tbl8, lpm->number_tbl8s); + + /* Check tbl8 allocation was successful. */ + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + /* Find index into tbl8 and range. */ + tbl8_index = (tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + + (ip_masked & 0xFF); + + /* Set tbl8 entry. */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + lpm->tbl8[i].depth = depth; + lpm->tbl8[i].next_hop = next_hop; + lpm->tbl8[i].valid = VALID; + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .group_idx = tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + lpm->tbl24[tbl24_index] = new_tbl24_entry; + + } /* If valid entry but not extended calculate the index into Table8. */ + else if (lpm->tbl24[tbl24_index].valid_group == 0) { + /* Search for free tbl8 group. */ + tbl8_group_index = tbl8_alloc_v1604(lpm->tbl8, lpm->number_tbl8s); + + if (tbl8_group_index < 0) { + return tbl8_group_index; + } + + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_group_start + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* Populate new tbl8 with tbl24 value. */ + for (i = tbl8_group_start; i < tbl8_group_end; i++) { + lpm->tbl8[i].valid = VALID; + lpm->tbl8[i].depth = lpm->tbl24[tbl24_index].depth; + lpm->tbl8[i].next_hop = + lpm->tbl24[tbl24_index].next_hop; + } + + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + /* Insert new rule into the tbl8 entry. */ + for (i = tbl8_index; i < tbl8_index + tbl8_range; i++) { + lpm->tbl8[i].valid = VALID; + lpm->tbl8[i].depth = depth; + lpm->tbl8[i].next_hop = next_hop; + } + + /* + * Update tbl24 entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go. 
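+ * For example, when 192.168.1.128/26 is added on top of an existing
+ * 192.168.1.0/24: the /24 next hop was first copied into all 256 entries
+ * of the newly allocated tbl8 group above, the 64 entries covering
+ * .128-.191 (depth_to_range(26) = 1 << (32 - 26)) were then overwritten
+ * with the /26 next hop, and only now is the tbl24 entry switched over to
+ * reference the tbl8 group.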
+ */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .group_idx = tbl8_group_index, + .valid = VALID, + .valid_group = 1, + .depth = 0, + }; + + lpm->tbl24[tbl24_index] = new_tbl24_entry; + + } else { /* + * If it is valid, extended entry calculate the index into tbl8. + */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + + if (!lpm->tbl8[i].valid || + lpm->tbl8[i].depth <= depth) { + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = depth, + .next_hop = next_hop, + .valid_group = lpm->tbl8[i].valid_group, + }; + + /* + * Setting tbl8 entry in one go to avoid race + * condition + */ + lpm->tbl8[i] = new_tbl8_entry; + + continue; + } + } + } +#undef group_idx + return 0; +} + +/* + * Add a route + */ +int +rte_lpm_add_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth, + uint8_t next_hop) +{ + int32_t rule_index, status = 0; + uint32_t ip_masked; + + /* Check user arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + ip_masked = ip & depth_to_mask(depth); + + /* Add the rule to the rule table. */ + rule_index = rule_add_v20(lpm, ip_masked, depth, next_hop); + + /* If the is no space available for new rule return error. */ + if (rule_index < 0) { + return rule_index; + } + + if (depth <= MAX_DEPTH_TBL24) { + status = add_depth_small_v20(lpm, ip_masked, depth, next_hop); + } else { /* If depth > RTE_LPM_MAX_DEPTH_TBL24 */ + status = add_depth_big_v20(lpm, ip_masked, depth, next_hop); + + /* + * If add fails due to exhaustion of tbl8 extensions delete + * rule that was added to rule table. + */ + if (status < 0) { + rule_delete_v20(lpm, rule_index, depth); + + return status; + } + } + + return 0; +} +VERSION_SYMBOL(rte_lpm_add, _v20, 2.0); + +int +rte_lpm_add_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop) +{ + int32_t rule_index, status = 0; + uint32_t ip_masked; + + /* Check user arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + ip_masked = ip & depth_to_mask(depth); + + /* Add the rule to the rule table. */ + rule_index = rule_add_v1604(lpm, ip_masked, depth, next_hop); + + /* If the is no space available for new rule return error. */ + if (rule_index < 0) { + return rule_index; + } + + if (depth <= MAX_DEPTH_TBL24) { + status = add_depth_small_v1604(lpm, ip_masked, depth, next_hop); + } else { /* If depth > RTE_LPM_MAX_DEPTH_TBL24 */ + status = add_depth_big_v1604(lpm, ip_masked, depth, next_hop); + + /* + * If add fails due to exhaustion of tbl8 extensions delete + * rule that was added to rule table. + */ + if (status < 0) { + rule_delete_v1604(lpm, rule_index, depth); + + return status; + } + } + + return 0; +} +BIND_DEFAULT_SYMBOL(rte_lpm_add, _v1604, 16.04); +MAP_STATIC_SYMBOL(int rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, + uint8_t depth, uint32_t next_hop), rte_lpm_add_v1604); + +/* + * Look for a rule in the high-level rules table + */ +int +rte_lpm_is_rule_present_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth, +uint8_t *next_hop) +{ + uint32_t ip_masked; + int32_t rule_index; + + /* Check user arguments. */ + if ((lpm == NULL) || + (next_hop == NULL) || + (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + /* Look for the rule using rule_find. 
*/ + ip_masked = ip & depth_to_mask(depth); + rule_index = rule_find_v20(lpm, ip_masked, depth); + + if (rule_index >= 0) { + *next_hop = lpm->rules_tbl[rule_index].next_hop; + return 1; + } + + /* If rule is not found return 0. */ + return 0; +} +VERSION_SYMBOL(rte_lpm_is_rule_present, _v20, 2.0); + +int +rte_lpm_is_rule_present_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop) +{ + uint32_t ip_masked; + int32_t rule_index; + + /* Check user arguments. */ + if ((lpm == NULL) || + (next_hop == NULL) || + (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + return -EINVAL; + + /* Look for the rule using rule_find. */ + ip_masked = ip & depth_to_mask(depth); + rule_index = rule_find_v1604(lpm, ip_masked, depth); + + if (rule_index >= 0) { + *next_hop = lpm->rules_tbl[rule_index].next_hop; + return 1; + } + + /* If rule is not found return 0. */ + return 0; +} +BIND_DEFAULT_SYMBOL(rte_lpm_is_rule_present, _v1604, 16.04); +MAP_STATIC_SYMBOL(int rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, + uint8_t depth, uint32_t *next_hop), rte_lpm_is_rule_present_v1604); + +static inline int32_t +find_previous_rule_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth, + uint8_t *sub_rule_depth) +{ + int32_t rule_index; + uint32_t ip_masked; + uint8_t prev_depth; + + for (prev_depth = (uint8_t)(depth - 1); prev_depth > 0; prev_depth--) { + ip_masked = ip & depth_to_mask(prev_depth); + + rule_index = rule_find_v20(lpm, ip_masked, prev_depth); + + if (rule_index >= 0) { + *sub_rule_depth = prev_depth; + return rule_index; + } + } + + return -1; +} + +static inline int32_t +find_previous_rule_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t *sub_rule_depth) +{ + int32_t rule_index; + uint32_t ip_masked; + uint8_t prev_depth; + + for (prev_depth = (uint8_t)(depth - 1); prev_depth > 0; prev_depth--) { + ip_masked = ip & depth_to_mask(prev_depth); + + rule_index = rule_find_v1604(lpm, ip_masked, prev_depth); + + if (rule_index >= 0) { + *sub_rule_depth = prev_depth; + return rule_index; + } + } + + return -1; +} + +static inline int32_t +delete_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, + uint8_t depth, int32_t sub_rule_index, uint8_t sub_rule_depth) +{ + uint32_t tbl24_range, tbl24_index, tbl8_group_index, tbl8_index, i, j; + + /* Calculate the range and index into Table24. */ + tbl24_range = depth_to_range(depth); + tbl24_index = (ip_masked >> 8); + + /* + * Firstly check the sub_rule_index. A -1 indicates no replacement rule + * and a positive number indicates a sub_rule_index. + */ + if (sub_rule_index < 0) { + /* + * If no replacement rule exists then invalidate entries + * associated with this rule. + */ + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + lpm->tbl24[i].valid = INVALID; + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. + */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + lpm->tbl8[j].valid = INVALID; + } + } + } + } else { + /* + * If a replacement rule exists then modify entries + * associated with this rule. 
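+ * The replacement is the longest remaining rule that is less specific but
+ * still covers this prefix (located earlier by find_previous_rule), so its
+ * depth and next hop are written back into every entry the deleted rule
+ * used to own.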
+ */ + + struct rte_lpm_tbl_entry_v20 new_tbl24_entry = { + .next_hop = lpm->rules_tbl[sub_rule_index].next_hop, + .valid = VALID, + .valid_group = 0, + .depth = sub_rule_depth, + }; + + struct rte_lpm_tbl_entry_v20 new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = sub_rule_depth, + }; + new_tbl8_entry.next_hop = + lpm->rules_tbl[sub_rule_index].next_hop; + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + lpm->tbl24[i] = new_tbl24_entry; + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. + */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + lpm->tbl8[j] = new_tbl8_entry; + } + } + } + } + + return 0; +} + +static inline int32_t +delete_depth_small_v1604(struct rte_lpm *lpm, uint32_t ip_masked, + uint8_t depth, int32_t sub_rule_index, uint8_t sub_rule_depth) +{ +#define group_idx next_hop + uint32_t tbl24_range, tbl24_index, tbl8_group_index, tbl8_index, i, j; + + /* Calculate the range and index into Table24. */ + tbl24_range = depth_to_range(depth); + tbl24_index = (ip_masked >> 8); + + /* + * Firstly check the sub_rule_index. A -1 indicates no replacement rule + * and a positive number indicates a sub_rule_index. + */ + if (sub_rule_index < 0) { + /* + * If no replacement rule exists then invalidate entries + * associated with this rule. + */ + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + lpm->tbl24[i].valid = INVALID; + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. + */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + lpm->tbl8[j].valid = INVALID; + } + } + } + } else { + /* + * If a replacement rule exists then modify entries + * associated with this rule. + */ + + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = lpm->rules_tbl[sub_rule_index].next_hop, + .valid = VALID, + .valid_group = 0, + .depth = sub_rule_depth, + }; + + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = sub_rule_depth, + .next_hop = lpm->rules_tbl + [sub_rule_index].next_hop, + }; + + for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { + + if (lpm->tbl24[i].valid_group == 0 && + lpm->tbl24[i].depth <= depth) { + lpm->tbl24[i] = new_tbl24_entry; + } else if (lpm->tbl24[i].valid_group == 1) { + /* + * If TBL24 entry is extended, then there has + * to be a rule with depth >= 25 in the + * associated TBL8 group. + */ + + tbl8_group_index = lpm->tbl24[i].group_idx; + tbl8_index = tbl8_group_index * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + for (j = tbl8_index; j < (tbl8_index + + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { + + if (lpm->tbl8[j].depth <= depth) + lpm->tbl8[j] = new_tbl8_entry; + } + } + } + } +#undef group_idx + return 0; +} + +/* + * Checks if table 8 group can be recycled. 
+ * + * Return of -EEXIST means tbl8 is in use and thus can not be recycled. + * Return of -EINVAL means tbl8 is empty and thus can be recycled + * Return of value > -1 means tbl8 is in use but has all the same values and + * thus can be recycled + */ +static inline int32_t +tbl8_recycle_check_v20(struct rte_lpm_tbl_entry_v20 *tbl8, + uint32_t tbl8_group_start) +{ + uint32_t tbl8_group_end, i; + tbl8_group_end = tbl8_group_start + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* + * Check the first entry of the given tbl8. If it is invalid we know + * this tbl8 does not contain any rule with a depth < RTE_LPM_MAX_DEPTH + * (As they would affect all entries in a tbl8) and thus this table + * can not be recycled. + */ + if (tbl8[tbl8_group_start].valid) { + /* + * If first entry is valid check if the depth is less than 24 + * and if so check the rest of the entries to verify that they + * are all of this depth. + */ + if (tbl8[tbl8_group_start].depth <= MAX_DEPTH_TBL24) { + for (i = (tbl8_group_start + 1); i < tbl8_group_end; + i++) { + + if (tbl8[i].depth != + tbl8[tbl8_group_start].depth) { + + return -EEXIST; + } + } + /* If all entries are the same return the tb8 index */ + return tbl8_group_start; + } + + return -EEXIST; + } + /* + * If the first entry is invalid check if the rest of the entries in + * the tbl8 are invalid. + */ + for (i = (tbl8_group_start + 1); i < tbl8_group_end; i++) { + if (tbl8[i].valid) + return -EEXIST; + } + /* If no valid entries are found then return -EINVAL. */ + return -EINVAL; +} + +static inline int32_t +tbl8_recycle_check_v1604(struct rte_lpm_tbl_entry *tbl8, + uint32_t tbl8_group_start) +{ + uint32_t tbl8_group_end, i; + tbl8_group_end = tbl8_group_start + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + + /* + * Check the first entry of the given tbl8. If it is invalid we know + * this tbl8 does not contain any rule with a depth < RTE_LPM_MAX_DEPTH + * (As they would affect all entries in a tbl8) and thus this table + * can not be recycled. + */ + if (tbl8[tbl8_group_start].valid) { + /* + * If first entry is valid check if the depth is less than 24 + * and if so check the rest of the entries to verify that they + * are all of this depth. + */ + if (tbl8[tbl8_group_start].depth <= MAX_DEPTH_TBL24) { + for (i = (tbl8_group_start + 1); i < tbl8_group_end; + i++) { + + if (tbl8[i].depth != + tbl8[tbl8_group_start].depth) { + + return -EEXIST; + } + } + /* If all entries are the same return the tb8 index */ + return tbl8_group_start; + } + + return -EEXIST; + } + /* + * If the first entry is invalid check if the rest of the entries in + * the tbl8 are invalid. + */ + for (i = (tbl8_group_start + 1); i < tbl8_group_end; i++) { + if (tbl8[i].valid) + return -EEXIST; + } + /* If no valid entries are found then return -EINVAL. */ + return -EINVAL; +} + +static inline int32_t +delete_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, + uint8_t depth, int32_t sub_rule_index, uint8_t sub_rule_depth) +{ + uint32_t tbl24_index, tbl8_group_index, tbl8_group_start, tbl8_index, + tbl8_range, i; + int32_t tbl8_recycle_index; + + /* + * Calculate the index into tbl24 and range. Note: All depths larger + * than MAX_DEPTH_TBL24 are associated with only one tbl24 entry. + */ + tbl24_index = ip_masked >> 8; + + /* Calculate the index into tbl8 and range. 
*/ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + tbl8_range = depth_to_range(depth); + + if (sub_rule_index < 0) { + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be removed or modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + lpm->tbl8[i].valid = INVALID; + } + } else { + /* Set new tbl8 entry. */ + struct rte_lpm_tbl_entry_v20 new_tbl8_entry = { + .valid = VALID, + .depth = sub_rule_depth, + .valid_group = lpm->tbl8[tbl8_group_start].valid_group, + }; + + new_tbl8_entry.next_hop = + lpm->rules_tbl[sub_rule_index].next_hop; + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + lpm->tbl8[i] = new_tbl8_entry; + } + } + + /* + * Check if there are any valid entries in this tbl8 group. If all + * tbl8 entries are invalid we can free the tbl8 and invalidate the + * associated tbl24 entry. + */ + + tbl8_recycle_index = tbl8_recycle_check_v20(lpm->tbl8, tbl8_group_start); + + if (tbl8_recycle_index == -EINVAL) { + /* Set tbl24 before freeing tbl8 to avoid race condition. */ + lpm->tbl24[tbl24_index].valid = 0; + tbl8_free_v20(lpm->tbl8, tbl8_group_start); + } else if (tbl8_recycle_index > -1) { + /* Update tbl24 entry. */ + struct rte_lpm_tbl_entry_v20 new_tbl24_entry = { + .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop, + .valid = VALID, + .valid_group = 0, + .depth = lpm->tbl8[tbl8_recycle_index].depth, + }; + + /* Set tbl24 before freeing tbl8 to avoid race condition. */ + lpm->tbl24[tbl24_index] = new_tbl24_entry; + tbl8_free_v20(lpm->tbl8, tbl8_group_start); + } + + return 0; +} + +static inline int32_t +delete_depth_big_v1604(struct rte_lpm *lpm, uint32_t ip_masked, + uint8_t depth, int32_t sub_rule_index, uint8_t sub_rule_depth) +{ +#define group_idx next_hop + uint32_t tbl24_index, tbl8_group_index, tbl8_group_start, tbl8_index, + tbl8_range, i; + int32_t tbl8_recycle_index; + + /* + * Calculate the index into tbl24 and range. Note: All depths larger + * than MAX_DEPTH_TBL24 are associated with only one tbl24 entry. + */ + tbl24_index = ip_masked >> 8; + + /* Calculate the index into tbl8 and range. */ + tbl8_group_index = lpm->tbl24[tbl24_index].group_idx; + tbl8_group_start = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_index = tbl8_group_start + (ip_masked & 0xFF); + tbl8_range = depth_to_range(depth); + + if (sub_rule_index < 0) { + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be removed or modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + lpm->tbl8[i].valid = INVALID; + } + } else { + /* Set new tbl8 entry. */ + struct rte_lpm_tbl_entry new_tbl8_entry = { + .valid = VALID, + .depth = sub_rule_depth, + .valid_group = lpm->tbl8[tbl8_group_start].valid_group, + .next_hop = lpm->rules_tbl[sub_rule_index].next_hop, + }; + + /* + * Loop through the range of entries on tbl8 for which the + * rule_to_delete must be modified. + */ + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { + if (lpm->tbl8[i].depth <= depth) + lpm->tbl8[i] = new_tbl8_entry; + } + } + + /* + * Check if there are any valid entries in this tbl8 group. 
If all + * tbl8 entries are invalid we can free the tbl8 and invalidate the + * associated tbl24 entry. + */ + + tbl8_recycle_index = tbl8_recycle_check_v1604(lpm->tbl8, tbl8_group_start); + + if (tbl8_recycle_index == -EINVAL) { + /* Set tbl24 before freeing tbl8 to avoid race condition. */ + lpm->tbl24[tbl24_index].valid = 0; + tbl8_free_v1604(lpm->tbl8, tbl8_group_start); + } else if (tbl8_recycle_index > -1) { + /* Update tbl24 entry. */ + struct rte_lpm_tbl_entry new_tbl24_entry = { + .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop, + .valid = VALID, + .valid_group = 0, + .depth = lpm->tbl8[tbl8_recycle_index].depth, + }; + + /* Set tbl24 before freeing tbl8 to avoid race condition. */ + lpm->tbl24[tbl24_index] = new_tbl24_entry; + tbl8_free_v1604(lpm->tbl8, tbl8_group_start); + } +#undef group_idx + return 0; +} + +/* + * Deletes a rule + */ +int +rte_lpm_delete_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth) +{ + int32_t rule_to_delete_index, sub_rule_index; + uint32_t ip_masked; + uint8_t sub_rule_depth; + /* + * Check input arguments. Note: IP must be a positive integer of 32 + * bits in length therefore it need not be checked. + */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) { + return -EINVAL; + } + + ip_masked = ip & depth_to_mask(depth); + + /* + * Find the index of the input rule, that needs to be deleted, in the + * rule table. + */ + rule_to_delete_index = rule_find_v20(lpm, ip_masked, depth); + + /* + * Check if rule_to_delete_index was found. If no rule was found the + * function rule_find returns -EINVAL. + */ + if (rule_to_delete_index < 0) + return -EINVAL; + + /* Delete the rule from the rule table. */ + rule_delete_v20(lpm, rule_to_delete_index, depth); + + /* + * Find rule to replace the rule_to_delete. If there is no rule to + * replace the rule_to_delete we return -1 and invalidate the table + * entries associated with this rule. + */ + sub_rule_depth = 0; + sub_rule_index = find_previous_rule_v20(lpm, ip, depth, &sub_rule_depth); + + /* + * If the input depth value is less than 25 use function + * delete_depth_small otherwise use delete_depth_big. + */ + if (depth <= MAX_DEPTH_TBL24) { + return delete_depth_small_v20(lpm, ip_masked, depth, + sub_rule_index, sub_rule_depth); + } else { /* If depth > MAX_DEPTH_TBL24 */ + return delete_depth_big_v20(lpm, ip_masked, depth, sub_rule_index, + sub_rule_depth); + } +} +VERSION_SYMBOL(rte_lpm_delete, _v20, 2.0); + +int +rte_lpm_delete_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth) +{ + int32_t rule_to_delete_index, sub_rule_index; + uint32_t ip_masked; + uint8_t sub_rule_depth; + /* + * Check input arguments. Note: IP must be a positive integer of 32 + * bits in length therefore it need not be checked. + */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) { + return -EINVAL; + } + + ip_masked = ip & depth_to_mask(depth); + + /* + * Find the index of the input rule, that needs to be deleted, in the + * rule table. + */ + rule_to_delete_index = rule_find_v1604(lpm, ip_masked, depth); + + /* + * Check if rule_to_delete_index was found. If no rule was found the + * function rule_find returns -EINVAL. + */ + if (rule_to_delete_index < 0) + return -EINVAL; + + /* Delete the rule from the rule table. */ + rule_delete_v1604(lpm, rule_to_delete_index, depth); + + /* + * Find rule to replace the rule_to_delete. If there is no rule to + * replace the rule_to_delete we return -1 and invalidate the table + * entries associated with this rule. 
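For orientation, a small illustration of how many table entries the delete paths above end up rewriting; the arithmetic mirrors the depth_to_range() helper defined earlier in this file, and the function name example_covered_entries is hypothetical, shown only for the worked numbers.

	/* Illustration only: entries rewritten when a rule of a given depth is
	 * deleted (or added). Depths up to 24 land in tbl24; deeper rules live
	 * in a 256-entry tbl8 group. */
	static inline uint32_t
	example_covered_entries(uint8_t depth)
	{
		if (depth <= 24)
			return 1u << (24 - depth);  /* e.g. a /16 covers 256 tbl24 entries */
		return 1u << (32 - depth);          /* e.g. a /28 covers 16 tbl8 entries */
	}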
+ */ + sub_rule_depth = 0; + sub_rule_index = find_previous_rule_v1604(lpm, ip, depth, &sub_rule_depth); + + /* + * If the input depth value is less than 25 use function + * delete_depth_small otherwise use delete_depth_big. + */ + if (depth <= MAX_DEPTH_TBL24) { + return delete_depth_small_v1604(lpm, ip_masked, depth, + sub_rule_index, sub_rule_depth); + } else { /* If depth > MAX_DEPTH_TBL24 */ + return delete_depth_big_v1604(lpm, ip_masked, depth, sub_rule_index, + sub_rule_depth); + } +} +BIND_DEFAULT_SYMBOL(rte_lpm_delete, _v1604, 16.04); +MAP_STATIC_SYMBOL(int rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, + uint8_t depth), rte_lpm_delete_v1604); + +/* + * Delete all rules from the LPM table. + */ +void +rte_lpm_delete_all_v20(struct rte_lpm_v20 *lpm) +{ + /* Zero rule information. */ + memset(lpm->rule_info, 0, sizeof(lpm->rule_info)); + + /* Zero tbl24. */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + + /* Zero tbl8. */ + memset(lpm->tbl8, 0, sizeof(lpm->tbl8)); + + /* Delete all rules form the rules table. */ + memset(lpm->rules_tbl, 0, sizeof(lpm->rules_tbl[0]) * lpm->max_rules); +} +VERSION_SYMBOL(rte_lpm_delete_all, _v20, 2.0); + +void +rte_lpm_delete_all_v1604(struct rte_lpm *lpm) +{ + /* Zero rule information. */ + memset(lpm->rule_info, 0, sizeof(lpm->rule_info)); + + /* Zero tbl24. */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + + /* Zero tbl8. */ + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) + * RTE_LPM_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* Delete all rules form the rules table. */ + memset(lpm->rules_tbl, 0, sizeof(lpm->rules_tbl[0]) * lpm->max_rules); +} +BIND_DEFAULT_SYMBOL(rte_lpm_delete_all, _v1604, 16.04); +MAP_STATIC_SYMBOL(void rte_lpm_delete_all(struct rte_lpm *lpm), + rte_lpm_delete_all_v1604); diff --git a/src/spdk/dpdk/lib/librte_lpm/rte_lpm.h b/src/spdk/dpdk/lib/librte_lpm/rte_lpm.h new file mode 100644 index 00000000..21550444 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/rte_lpm.h @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LPM_H_ +#define _RTE_LPM_H_ + +/** + * @file + * RTE Longest Prefix Match (LPM) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** Max number of characters in LPM name. */ +#define RTE_LPM_NAMESIZE 32 + +/** Maximum depth value possible for IPv4 LPM. */ +#define RTE_LPM_MAX_DEPTH 32 + +/** @internal Total number of tbl24 entries. */ +#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24) + +/** @internal Number of entries in a tbl8 group. */ +#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256 + +/** @internal Max number of tbl8 groups in the tbl8. */ +#define RTE_LPM_MAX_TBL8_NUM_GROUPS (1 << 24) + +/** @internal Total number of tbl8 groups in the tbl8. */ +#define RTE_LPM_TBL8_NUM_GROUPS 256 + +/** @internal Total number of tbl8 entries. */ +#define RTE_LPM_TBL8_NUM_ENTRIES (RTE_LPM_TBL8_NUM_GROUPS * \ + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + +/** @internal Macro to enable/disable run-time checks. 
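A rough sizing note derived from the constants above (an illustration, not a statement from the original sources): with the 32-bit entry layout defined below, the fixed tbl24 array dominates the allocation.

	/* Worked numbers, assuming 4-byte entries (illustrative only): */
	static const size_t example_tbl24_bytes =
		(size_t)RTE_LPM_TBL24_NUM_ENTRIES * sizeof(uint32_t);      /* 64 MiB */
	static const size_t example_tbl8_bytes_per_group =
		(size_t)RTE_LPM_TBL8_GROUP_NUM_ENTRIES * sizeof(uint32_t); /* 1 KiB per group */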
*/ +#if defined(RTE_LIBRTE_LPM_DEBUG) +#define RTE_LPM_RETURN_IF_TRUE(cond, retval) do { \ + if (cond) return (retval); \ +} while (0) +#else +#define RTE_LPM_RETURN_IF_TRUE(cond, retval) +#endif + +/** @internal bitmask with valid and valid_group fields set */ +#define RTE_LPM_VALID_EXT_ENTRY_BITMASK 0x03000000 + +/** Bitmask used to indicate successful lookup */ +#define RTE_LPM_LOOKUP_SUCCESS 0x01000000 + +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN +/** @internal Tbl24 entry structure. */ +__extension__ +struct rte_lpm_tbl_entry_v20 { + /** + * Stores Next hop (tbl8 or tbl24 when valid_group is not set) or + * a group index pointing to a tbl8 structure (tbl24 only, when + * valid_group is set) + */ + RTE_STD_C11 + union { + uint8_t next_hop; + uint8_t group_idx; + }; + /* Using single uint8_t to store 3 values. */ + uint8_t valid :1; /**< Validation flag. */ + /** + * For tbl24: + * - valid_group == 0: entry stores a next hop + * - valid_group == 1: entry stores a group_index pointing to a tbl8 + * For tbl8: + * - valid_group indicates whether the current tbl8 is in use or not + */ + uint8_t valid_group :1; + uint8_t depth :6; /**< Rule depth. */ +}; + +__extension__ +struct rte_lpm_tbl_entry { + /** + * Stores Next hop (tbl8 or tbl24 when valid_group is not set) or + * a group index pointing to a tbl8 structure (tbl24 only, when + * valid_group is set) + */ + uint32_t next_hop :24; + /* Using single uint8_t to store 3 values. */ + uint32_t valid :1; /**< Validation flag. */ + /** + * For tbl24: + * - valid_group == 0: entry stores a next hop + * - valid_group == 1: entry stores a group_index pointing to a tbl8 + * For tbl8: + * - valid_group indicates whether the current tbl8 is in use or not + */ + uint32_t valid_group :1; + uint32_t depth :6; /**< Rule depth. */ +}; + +#else +__extension__ +struct rte_lpm_tbl_entry_v20 { + uint8_t depth :6; + uint8_t valid_group :1; + uint8_t valid :1; + union { + uint8_t group_idx; + uint8_t next_hop; + }; +}; + +__extension__ +struct rte_lpm_tbl_entry { + uint32_t depth :6; + uint32_t valid_group :1; + uint32_t valid :1; + uint32_t next_hop :24; + +}; + +#endif + +/** LPM configuration structure. */ +struct rte_lpm_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. */ +}; + +/** @internal Rule structure. */ +struct rte_lpm_rule_v20 { + uint32_t ip; /**< Rule IP address. */ + uint8_t next_hop; /**< Rule next hop. */ +}; + +struct rte_lpm_rule { + uint32_t ip; /**< Rule IP address. */ + uint32_t next_hop; /**< Rule next hop. */ +}; + +/** @internal Contains metadata about the rules table. */ +struct rte_lpm_rule_info { + uint32_t used_rules; /**< Used rules so far. */ + uint32_t first_rule; /**< Indexes the first rule of a given depth. */ +}; + +/** @internal LPM structure. */ +struct rte_lpm_v20 { + /* LPM metadata. */ + char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */ + uint32_t max_rules; /**< Max. balanced rules per lpm. */ + struct rte_lpm_rule_info rule_info[RTE_LPM_MAX_DEPTH]; /**< Rule info table. */ + + /* LPM Tables. */ + struct rte_lpm_tbl_entry_v20 tbl24[RTE_LPM_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + struct rte_lpm_tbl_entry_v20 tbl8[RTE_LPM_TBL8_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl8 table. */ + struct rte_lpm_rule_v20 rules_tbl[] + __rte_cache_aligned; /**< LPM rules. */ +}; + +struct rte_lpm { + /* LPM metadata. */ + char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. 
*/ + uint32_t max_rules; /**< Max. balanced rules per lpm. */ + uint32_t number_tbl8s; /**< Number of tbl8s. */ + struct rte_lpm_rule_info rule_info[RTE_LPM_MAX_DEPTH]; /**< Rule info table. */ + + /* LPM Tables. */ + struct rte_lpm_tbl_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + struct rte_lpm_tbl_entry *tbl8; /**< LPM tbl8 table. */ + struct rte_lpm_rule *rules_tbl; /**< LPM rules. */ +}; + +/** + * Create an LPM object. + * + * @param name + * LPM object name + * @param socket_id + * NUMA socket ID for LPM table memory allocation + * @param config + * Structure containing the configuration + * @return + * Handle to LPM object on success, NULL otherwise with rte_errno set + * to an appropriate values. Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - invalid parameter passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_lpm * +rte_lpm_create(const char *name, int socket_id, + const struct rte_lpm_config *config); +struct rte_lpm_v20 * +rte_lpm_create_v20(const char *name, int socket_id, int max_rules, int flags); +struct rte_lpm * +rte_lpm_create_v1604(const char *name, int socket_id, + const struct rte_lpm_config *config); + +/** + * Find an existing LPM object and return a pointer to it. + * + * @param name + * Name of the lpm object as passed to rte_lpm_create() + * @return + * Pointer to lpm object or NULL if object not found with rte_errno + * set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_lpm * +rte_lpm_find_existing(const char *name); +struct rte_lpm_v20 * +rte_lpm_find_existing_v20(const char *name); +struct rte_lpm * +rte_lpm_find_existing_v1604(const char *name); + +/** + * Free an LPM object. + * + * @param lpm + * LPM object handle + * @return + * None + */ +void +rte_lpm_free(struct rte_lpm *lpm); +void +rte_lpm_free_v20(struct rte_lpm_v20 *lpm); +void +rte_lpm_free_v1604(struct rte_lpm *lpm); + +/** + * Add a rule to the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be added to the LPM table + * @param depth + * Depth of the rule to be added to the LPM table + * @param next_hop + * Next hop of the rule to be added to the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint32_t next_hop); +int +rte_lpm_add_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth, + uint8_t next_hop); +int +rte_lpm_add_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint32_t next_hop); + +/** + * Check if a rule is present in the LPM table, + * and provide its next hop if it is. 
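A minimal usage sketch for the creation and insertion API documented above; the table name, sizes, and next-hop value are arbitrary examples and error handling is abbreviated. The address is packed with the first octet in the most significant byte, matching the index arithmetic used by the lookup routines in this library.

	struct rte_lpm_config cfg = {
		.max_rules = 1024,
		.number_tbl8s = 256,
		.flags = 0,
	};
	struct rte_lpm *lpm = rte_lpm_create("example_lpm", SOCKET_ID_ANY, &cfg);

	if (lpm != NULL) {
		/* 192.0.2.0/24 -> next hop 7 */
		uint32_t ip = (192u << 24) | (0u << 16) | (2u << 8) | 0u;

		if (rte_lpm_add(lpm, ip, 24, 7) != 0) {
			/* handle failure */
		}
		rte_lpm_free(lpm);
	}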
+ * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be searched + * @param depth + * Depth of the rule to searched + * @param next_hop + * Next hop of the rule (valid only if it is found) + * @return + * 1 if the rule exists, 0 if it does not, a negative value on failure + */ +int +rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop); +int +rte_lpm_is_rule_present_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth, +uint8_t *next_hop); +int +rte_lpm_is_rule_present_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, +uint32_t *next_hop); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be deleted from the LPM table + * @param depth + * Depth of the rule to be deleted from the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth); +int +rte_lpm_delete_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth); +int +rte_lpm_delete_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth); + +/** + * Delete all rules from the LPM table. + * + * @param lpm + * LPM object handle + */ +void +rte_lpm_delete_all(struct rte_lpm *lpm); +void +rte_lpm_delete_all_v20(struct rte_lpm_v20 *lpm); +void +rte_lpm_delete_all_v1604(struct rte_lpm *lpm); + +/** + * Lookup an IP into the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP to be looked up in the LPM table + * @param next_hop + * Next hop of the most specific rule found for IP (valid on lookup hit only) + * @return + * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit + */ +static inline int +rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) +{ + unsigned tbl24_index = (ip >> 8); + uint32_t tbl_entry; + const uint32_t *ptbl; + + /* DEBUG: Check user input arguments. */ + RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (next_hop == NULL)), -EINVAL); + + /* Copy tbl24 entry */ + ptbl = (const uint32_t *)(&lpm->tbl24[tbl24_index]); + tbl_entry = *ptbl; + + /* Copy tbl8 entry (only if needed) */ + if (unlikely((tbl_entry & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + + unsigned tbl8_index = (uint8_t)ip + + (((uint32_t)tbl_entry & 0x00FFFFFF) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + + ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index]; + tbl_entry = *ptbl; + } + + *next_hop = ((uint32_t)tbl_entry & 0x00FFFFFF); + return (tbl_entry & RTE_LPM_LOOKUP_SUCCESS) ? 0 : -ENOENT; +} + +/** + * Lookup multiple IP addresses in an LPM table. This may be implemented as a + * macro, so the address of the function should not be used. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be looked up in the LPM table + * @param next_hops + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an array of two byte values. The most significant byte in each + * value says whether the lookup was successful (bitmask + * RTE_LPM_LOOKUP_SUCCESS is set). The least significant byte is the + * actual next hop. + * @param n + * Number of elements in ips (and next_hops) array to lookup. This should be a + * compile time constant, and divisible by 8 for best performance. 
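A short lookup sketch against the inline function above, assuming a table lpm populated as in the earlier example; printf is used only to make the hit/miss convention visible and would require stdio.h in a real program.

	uint32_t next_hop;
	uint32_t ip = (192u << 24) | (0u << 16) | (2u << 8) | 130u;  /* 192.0.2.130 */

	if (rte_lpm_lookup(lpm, ip, &next_hop) == 0)
		printf("next hop %u\n", next_hop);   /* hit: most specific rule */
	else
		printf("no route\n");                /* -ENOENT on lookup miss */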
+ * @return + * -EINVAL for incorrect arguments, otherwise 0 + */ +#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \ + rte_lpm_lookup_bulk_func(lpm, ips, next_hops, n) + +static inline int +rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips, + uint32_t *next_hops, const unsigned n) +{ + unsigned i; + unsigned tbl24_indexes[n]; + const uint32_t *ptbl; + + /* DEBUG: Check user input arguments. */ + RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (ips == NULL) || + (next_hops == NULL)), -EINVAL); + + for (i = 0; i < n; i++) { + tbl24_indexes[i] = ips[i] >> 8; + } + + for (i = 0; i < n; i++) { + /* Simply copy tbl24 entry to output */ + ptbl = (const uint32_t *)&lpm->tbl24[tbl24_indexes[i]]; + next_hops[i] = *ptbl; + + /* Overwrite output with tbl8 entry if needed */ + if (unlikely((next_hops[i] & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + + unsigned tbl8_index = (uint8_t)ips[i] + + (((uint32_t)next_hops[i] & 0x00FFFFFF) * + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + + ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index]; + next_hops[i] = *ptbl; + } + } + return 0; +} + +/* Mask four results. */ +#define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff) + +/** + * Lookup four IP addresses in an LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * Four IPs to be looked up in the LPM table + * @param hop + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an 4 elements array of two byte values. + * If the lookup was succesfull for the given IP, then least significant byte + * of the corresponding element is the actual next hop and the most + * significant byte is zero. + * If the lookup for the given IP failed, then corresponding element would + * contain default value, see description of then next parameter. + * @param defv + * Default value to populate into corresponding element of hop[] array, + * if lookup would fail. + */ +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv); + +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) +#include "rte_lpm_neon.h" +#elif defined(RTE_ARCH_PPC_64) +#include "rte_lpm_altivec.h" +#else +#include "rte_lpm_sse.h" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_lpm/rte_lpm6.c b/src/spdk/dpdk/lib/librte_lpm/rte_lpm6.c new file mode 100644 index 00000000..149677eb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/rte_lpm6.c @@ -0,0 +1,961 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_lpm6.h" + +#define RTE_LPM6_TBL24_NUM_ENTRIES (1 << 24) +#define RTE_LPM6_TBL8_GROUP_NUM_ENTRIES 256 +#define RTE_LPM6_TBL8_MAX_NUM_GROUPS (1 << 21) + +#define RTE_LPM6_VALID_EXT_ENTRY_BITMASK 0xA0000000 +#define RTE_LPM6_LOOKUP_SUCCESS 0x20000000 +#define RTE_LPM6_TBL8_BITMASK 0x001FFFFF + +#define ADD_FIRST_BYTE 3 +#define LOOKUP_FIRST_BYTE 4 +#define BYTE_SIZE 8 +#define BYTES2_SIZE 16 + +#define lpm6_tbl8_gindex next_hop + +/** Flags for setting an entry as valid/invalid. */ +enum valid_flag { + INVALID = 0, + VALID +}; + +TAILQ_HEAD(rte_lpm6_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_lpm6_tailq = { + .name = "RTE_LPM6", +}; +EAL_REGISTER_TAILQ(rte_lpm6_tailq) + +/** Tbl entry structure. 
It is the same for both tbl24 and tbl8 */ +struct rte_lpm6_tbl_entry { + uint32_t next_hop: 21; /**< Next hop / next table to be checked. */ + uint32_t depth :8; /**< Rule depth. */ + + /* Flags. */ + uint32_t valid :1; /**< Validation flag. */ + uint32_t valid_group :1; /**< Group validation flag. */ + uint32_t ext_entry :1; /**< External entry. */ +}; + +/** Rules tbl entry structure. */ +struct rte_lpm6_rule { + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ + uint32_t next_hop; /**< Rule next hop. */ + uint8_t depth; /**< Rule depth. */ +}; + +/** LPM6 structure. */ +struct rte_lpm6 { + /* LPM metadata. */ + char name[RTE_LPM6_NAMESIZE]; /**< Name of the lpm. */ + uint32_t max_rules; /**< Max number of rules. */ + uint32_t used_rules; /**< Used rules so far. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + uint32_t next_tbl8; /**< Next tbl8 to be used. */ + + /* LPM Tables. */ + struct rte_lpm6_rule *rules_tbl; /**< LPM rules. */ + struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES] + __rte_cache_aligned; /**< LPM tbl24 table. */ + struct rte_lpm6_tbl_entry tbl8[0] + __rte_cache_aligned; /**< LPM tbl8 table. */ +}; + +/* + * Takes an array of uint8_t (IPv6 address) and masks it using the depth. + * It leaves untouched one bit per unit in the depth variable + * and set the rest to 0. + */ +static inline void +mask_ip(uint8_t *ip, uint8_t depth) +{ + int16_t part_depth, mask; + int i; + + part_depth = depth; + + for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { + if (part_depth < BYTE_SIZE && part_depth >= 0) { + mask = (uint16_t)(~(UINT8_MAX >> part_depth)); + ip[i] = (uint8_t)(ip[i] & mask); + } else if (part_depth < 0) { + ip[i] = 0; + } + part_depth -= BYTE_SIZE; + } +} + +/* + * Allocates memory for LPM object + */ +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config) +{ + char mem_name[RTE_LPM6_NAMESIZE]; + struct rte_lpm6 *lpm = NULL; + struct rte_tailq_entry *te; + uint64_t mem_size, rules_size; + struct rte_lpm6_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm6_tbl_entry) != sizeof(uint32_t)); + + /* Check user arguments. */ + if ((name == NULL) || (socket_id < -1) || (config == NULL) || + (config->max_rules == 0) || + config->number_tbl8s > RTE_LPM6_TBL8_MAX_NUM_GROUPS) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); + + /* Determine the amount of memory to allocate. */ + mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); + rules_size = sizeof(struct rte_lpm6_rule) * config->max_rules; + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* Guarantee there's no existing */ + TAILQ_FOREACH(te, lpm_list, next) { + lpm = (struct rte_lpm6 *) te->data; + if (strncmp(name, lpm->name, RTE_LPM6_NAMESIZE) == 0) + break; + } + lpm = NULL; + if (te != NULL) { + rte_errno = EEXIST; + goto exit; + } + + /* allocate tailq entry */ + te = rte_zmalloc("LPM6_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n"); + rte_errno = ENOMEM; + goto exit; + } + + /* Allocate memory to store the LPM data structures. 
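A worked example of the mask_ip() helper above (it is internal to this file, so the call is only possible from within it); the address and depth are arbitrary.

	/* Masking 2001:db8:1234:5678::1 to a /44: the first five bytes are kept,
	 * the sixth byte is ANDed with ~(0xFF >> 4) = 0xF0 (0x34 -> 0x30), and the
	 * remaining ten bytes are zeroed, leaving 2001:db8:1230::/44. */
	uint8_t ip6[RTE_LPM6_IPV6_ADDR_SIZE] = {
		0x20, 0x01, 0x0d, 0xb8, 0x12, 0x34, 0x56, 0x78,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
	};
	mask_ip(ip6, 44);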
*/ + lpm = rte_zmalloc_socket(mem_name, (size_t)mem_size, + RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm == NULL) { + RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); + rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + lpm->rules_tbl = rte_zmalloc_socket(NULL, + (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id); + + if (lpm->rules_tbl == NULL) { + RTE_LOG(ERR, LPM, "LPM rules_tbl allocation failed\n"); + rte_free(lpm); + lpm = NULL; + rte_free(te); + rte_errno = ENOMEM; + goto exit; + } + + /* Save user arguments. */ + lpm->max_rules = config->max_rules; + lpm->number_tbl8s = config->number_tbl8s; + snprintf(lpm->name, sizeof(lpm->name), "%s", name); + + te->data = (void *) lpm; + + TAILQ_INSERT_TAIL(lpm_list, te, next); + +exit: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + return lpm; +} + +/* + * Find an existing lpm table and return a pointer to it. + */ +struct rte_lpm6 * +rte_lpm6_find_existing(const char *name) +{ + struct rte_lpm6 *l = NULL; + struct rte_tailq_entry *te; + struct rte_lpm6_list *lpm_list; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, lpm_list, next) { + l = (struct rte_lpm6 *) te->data; + if (strncmp(name, l->name, RTE_LPM6_NAMESIZE) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return l; +} + +/* + * Deallocates memory for given LPM table. + */ +void +rte_lpm6_free(struct rte_lpm6 *lpm) +{ + struct rte_lpm6_list *lpm_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (lpm == NULL) + return; + + lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find our tailq entry */ + TAILQ_FOREACH(te, lpm_list, next) { + if (te->data == (void *) lpm) + break; + } + + if (te != NULL) + TAILQ_REMOVE(lpm_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_free(lpm->rules_tbl); + rte_free(lpm); + rte_free(te); +} + +/* + * Checks if a rule already exists in the rules table and updates + * the nexthop if so. Otherwise it adds a new rule if enough space is available. + */ +static inline int32_t +rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth) +{ + uint32_t rule_index; + + /* Scan through rule list to see if rule already exists. */ + for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) { + + /* If rule already exists update its next_hop and return. */ + if ((memcmp (lpm->rules_tbl[rule_index].ip, ip, + RTE_LPM6_IPV6_ADDR_SIZE) == 0) && + lpm->rules_tbl[rule_index].depth == depth) { + lpm->rules_tbl[rule_index].next_hop = next_hop; + + return rule_index; + } + } + + /* + * If rule does not exist check if there is space to add a new rule to + * this rule group. If there is no space return error. + */ + if (lpm->used_rules == lpm->max_rules) { + return -ENOSPC; + } + + /* If there is space for the new rule add it. */ + rte_memcpy(lpm->rules_tbl[rule_index].ip, ip, RTE_LPM6_IPV6_ADDR_SIZE); + lpm->rules_tbl[rule_index].next_hop = next_hop; + lpm->rules_tbl[rule_index].depth = depth; + + /* Increment the used rules counter for this rule group. */ + lpm->used_rules++; + + return rule_index; +} + +/* + * Function that expands a rule across the data structure when a less-generic + * one has been added before. It assures that every possible combination of bits + * in the IP address returns a match. 
+ */ +static void +expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, + uint32_t next_hop) +{ + uint32_t tbl8_group_end, tbl8_gindex_next, j; + + tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + + struct rte_lpm6_tbl_entry new_tbl8_entry = { + .valid = VALID, + .valid_group = VALID, + .depth = depth, + .next_hop = next_hop, + .ext_entry = 0, + }; + + for (j = tbl8_gindex; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0 + && lpm->tbl8[j].depth <= depth)) { + + lpm->tbl8[j] = new_tbl8_entry; + + } else if (lpm->tbl8[j].ext_entry == 1) { + + tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex + * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + expand_rule(lpm, tbl8_gindex_next, depth, next_hop); + } + } +} + +/* + * Partially adds a new route to the data structure (tbl24+tbl8s). + * It returns 0 on success, a negative number on failure, or 1 if + * the process needs to be continued by calling the function again. + */ +static inline int +add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + struct rte_lpm6_tbl_entry **tbl_next, uint8_t *ip, uint8_t bytes, + uint8_t first_byte, uint8_t depth, uint32_t next_hop) +{ + uint32_t tbl_index, tbl_range, tbl8_group_start, tbl8_group_end, i; + int32_t tbl8_gindex; + int8_t bitshift; + uint8_t bits_covered; + + /* + * Calculate index to the table based on the number and position + * of the bytes being inspected in this step. + */ + tbl_index = 0; + for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { + bitshift = (int8_t)((bytes - i)*BYTE_SIZE); + + if (bitshift < 0) bitshift = 0; + tbl_index = tbl_index | ip[i-1] << bitshift; + } + + /* Number of bits covered in this step */ + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); + + /* + * If depth if smaller than this number (ie this is the last step) + * expand the rule across the relevant positions in the table. + */ + if (depth <= bits_covered) { + tbl_range = 1 << (bits_covered - depth); + + for (i = tbl_index; i < (tbl_index + tbl_range); i++) { + if (!tbl[i].valid || (tbl[i].ext_entry == 0 && + tbl[i].depth <= depth)) { + + struct rte_lpm6_tbl_entry new_tbl_entry = { + .next_hop = next_hop, + .depth = depth, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 0, + }; + + tbl[i] = new_tbl_entry; + + } else if (tbl[i].ext_entry == 1) { + + /* + * If tbl entry is valid and extended calculate the index + * into next tbl8 and expand the rule across the data structure. + */ + tbl8_gindex = tbl[i].lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + expand_rule(lpm, tbl8_gindex, depth, next_hop); + } + } + + return 0; + } + /* + * If this is not the last step just fill one position + * and calculate the index to the next table. + */ + else { + /* If it's invalid a new tbl8 is needed */ + if (!tbl[tbl_index].valid) { + if (lpm->next_tbl8 < lpm->number_tbl8s) + tbl8_gindex = (lpm->next_tbl8)++; + else + return -ENOSPC; + + struct rte_lpm6_tbl_entry new_tbl_entry = { + .lpm6_tbl8_gindex = tbl8_gindex, + .depth = 0, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 1, + }; + + tbl[tbl_index] = new_tbl_entry; + } + /* + * If it's valid but not extended the rule that was stored * + * here needs to be moved to the next table. + */ + else if (tbl[tbl_index].ext_entry == 0) { + /* Search for free tbl8 group. 
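A worked illustration of the expansion arithmetic in add_step() above; the numbers follow directly from bits_covered and tbl_range.

	/* Worked example (illustrative): on the first step three bytes are
	 * inspected, so bits_covered = 24. Adding a /16 therefore ends on this
	 * step and expands across tbl_range = 1 << (24 - 16) = 256 consecutive
	 * tbl24 entries. A /48 keeps stepping one byte at a time until
	 * bits_covered reaches 48, where it fills 1 << 0 = 1 entry. */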
*/ + if (lpm->next_tbl8 < lpm->number_tbl8s) + tbl8_gindex = (lpm->next_tbl8)++; + else + return -ENOSPC; + + tbl8_group_start = tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_group_start + + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + + /* Populate new tbl8 with tbl value. */ + for (i = tbl8_group_start; i < tbl8_group_end; i++) { + lpm->tbl8[i].valid = VALID; + lpm->tbl8[i].depth = tbl[tbl_index].depth; + lpm->tbl8[i].next_hop = tbl[tbl_index].next_hop; + lpm->tbl8[i].ext_entry = 0; + } + + /* + * Update tbl entry to point to new tbl8 entry. Note: The + * ext_flag and tbl8_index need to be updated simultaneously, + * so assign whole structure in one go. + */ + struct rte_lpm6_tbl_entry new_tbl_entry = { + .lpm6_tbl8_gindex = tbl8_gindex, + .depth = 0, + .valid = VALID, + .valid_group = VALID, + .ext_entry = 1, + }; + + tbl[tbl_index] = new_tbl_entry; + } + + *tbl_next = &(lpm->tbl8[tbl[tbl_index].lpm6_tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); + } + + return 1; +} + +/* + * Add a route + */ +int +rte_lpm6_add_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint8_t next_hop) +{ + return rte_lpm6_add_v1705(lpm, ip, depth, next_hop); +} +VERSION_SYMBOL(rte_lpm6_add, _v20, 2.0); + +int +rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t next_hop) +{ + struct rte_lpm6_tbl_entry *tbl; + struct rte_lpm6_tbl_entry *tbl_next = NULL; + int32_t rule_index; + int status; + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; + int i; + + /* Check user arguments. */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + memcpy(masked_ip, ip, RTE_LPM6_IPV6_ADDR_SIZE); + mask_ip(masked_ip, depth); + + /* Add the rule to the rule table. */ + rule_index = rule_add(lpm, masked_ip, next_hop, depth); + + /* If there is no space available for new rule return error. */ + if (rule_index < 0) { + return rule_index; + } + + /* Inspect the first three bytes through tbl24 on the first step. */ + tbl = lpm->tbl24; + status = add_step (lpm, tbl, &tbl_next, masked_ip, ADD_FIRST_BYTE, 1, + depth, next_hop); + if (status < 0) { + rte_lpm6_delete(lpm, masked_ip, depth); + + return status; + } + + /* + * Inspect one by one the rest of the bytes until + * the process is completed. + */ + for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) { + tbl = tbl_next; + status = add_step (lpm, tbl, &tbl_next, masked_ip, 1, (uint8_t)(i+1), + depth, next_hop); + if (status < 0) { + rte_lpm6_delete(lpm, masked_ip, depth); + + return status; + } + } + + return status; +} +BIND_DEFAULT_SYMBOL(rte_lpm6_add, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_lpm6_add(struct rte_lpm6 *lpm, uint8_t *ip, + uint8_t depth, uint32_t next_hop), + rte_lpm6_add_v1705); + +/* + * Takes a pointer to a table entry and inspect one level. + * The function returns 0 on lookup success, ENOENT if no match was found + * or 1 if the process needs to be continued by calling the function again. + */ +static inline int +lookup_step(const struct rte_lpm6 *lpm, const struct rte_lpm6_tbl_entry *tbl, + const struct rte_lpm6_tbl_entry **tbl_next, uint8_t *ip, + uint8_t first_byte, uint32_t *next_hop) +{ + uint32_t tbl8_index, tbl_entry; + + /* Take the integer value from the pointer. */ + tbl_entry = *(const uint32_t *)tbl; + + /* If it is valid and extended we calculate the new pointer to return. 
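For concreteness, the first-level index arithmetic used by the IPv6 lookup path, with an arbitrary example address.

	/* Worked example (illustrative): for 2001:db8::1 the first three bytes
	 * are 0x20, 0x01 and 0x0d, so the initial tbl24 index is
	 * (0x20 << 16) | (0x01 << 8) | 0x0d = 0x20010d. Each subsequent
	 * lookup_step() consumes one more byte of the address to index a
	 * 256-entry tbl8 group. */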
*/ + if ((tbl_entry & RTE_LPM6_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM6_VALID_EXT_ENTRY_BITMASK) { + + tbl8_index = ip[first_byte-1] + + ((tbl_entry & RTE_LPM6_TBL8_BITMASK) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES); + + *tbl_next = &lpm->tbl8[tbl8_index]; + + return 1; + } else { + /* If not extended then we can have a match. */ + *next_hop = ((uint32_t)tbl_entry & RTE_LPM6_TBL8_BITMASK); + return (tbl_entry & RTE_LPM6_LOOKUP_SUCCESS) ? 0 : -ENOENT; + } +} + +/* + * Looks up an IP + */ +int +rte_lpm6_lookup_v20(const struct rte_lpm6 *lpm, uint8_t *ip, uint8_t *next_hop) +{ + uint32_t next_hop32 = 0; + int32_t status; + + /* DEBUG: Check user input arguments. */ + if (next_hop == NULL) + return -EINVAL; + + status = rte_lpm6_lookup_v1705(lpm, ip, &next_hop32); + if (status == 0) + *next_hop = (uint8_t)next_hop32; + + return status; +} +VERSION_SYMBOL(rte_lpm6_lookup, _v20, 2.0); + +int +rte_lpm6_lookup_v1705(const struct rte_lpm6 *lpm, uint8_t *ip, + uint32_t *next_hop) +{ + const struct rte_lpm6_tbl_entry *tbl; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; + int status; + uint8_t first_byte; + uint32_t tbl24_index; + + /* DEBUG: Check user input arguments. */ + if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) { + return -EINVAL; + } + + first_byte = LOOKUP_FIRST_BYTE; + tbl24_index = (ip[0] << BYTES2_SIZE) | (ip[1] << BYTE_SIZE) | ip[2]; + + /* Calculate pointer to the first entry to be inspected */ + tbl = &lpm->tbl24[tbl24_index]; + + do { + /* Continue inspecting following levels until success or failure */ + status = lookup_step(lpm, tbl, &tbl_next, ip, first_byte++, next_hop); + tbl = tbl_next; + } while (status == 1); + + return status; +} +BIND_DEFAULT_SYMBOL(rte_lpm6_lookup, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_lpm6_lookup(const struct rte_lpm6 *lpm, uint8_t *ip, + uint32_t *next_hop), rte_lpm6_lookup_v1705); + +/* + * Looks up a group of IP addresses + */ +int +rte_lpm6_lookup_bulk_func_v20(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int16_t * next_hops, unsigned n) +{ + unsigned i; + const struct rte_lpm6_tbl_entry *tbl; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; + uint32_t tbl24_index, next_hop; + uint8_t first_byte; + int status; + + /* DEBUG: Check user input arguments. */ + if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) { + return -EINVAL; + } + + for (i = 0; i < n; i++) { + first_byte = LOOKUP_FIRST_BYTE; + tbl24_index = (ips[i][0] << BYTES2_SIZE) | + (ips[i][1] << BYTE_SIZE) | ips[i][2]; + + /* Calculate pointer to the first entry to be inspected */ + tbl = &lpm->tbl24[tbl24_index]; + + do { + /* Continue inspecting following levels until success or failure */ + status = lookup_step(lpm, tbl, &tbl_next, ips[i], first_byte++, + &next_hop); + tbl = tbl_next; + } while (status == 1); + + if (status < 0) + next_hops[i] = -1; + else + next_hops[i] = (int16_t)next_hop; + } + + return 0; +} +VERSION_SYMBOL(rte_lpm6_lookup_bulk_func, _v20, 2.0); + +int +rte_lpm6_lookup_bulk_func_v1705(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int32_t *next_hops, unsigned int n) +{ + unsigned int i; + const struct rte_lpm6_tbl_entry *tbl; + const struct rte_lpm6_tbl_entry *tbl_next = NULL; + uint32_t tbl24_index, next_hop; + uint8_t first_byte; + int status; + + /* DEBUG: Check user input arguments. 
*/ + if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) + return -EINVAL; + + for (i = 0; i < n; i++) { + first_byte = LOOKUP_FIRST_BYTE; + tbl24_index = (ips[i][0] << BYTES2_SIZE) | + (ips[i][1] << BYTE_SIZE) | ips[i][2]; + + /* Calculate pointer to the first entry to be inspected */ + tbl = &lpm->tbl24[tbl24_index]; + + do { + /* Continue inspecting following levels + * until success or failure + */ + status = lookup_step(lpm, tbl, &tbl_next, ips[i], + first_byte++, &next_hop); + tbl = tbl_next; + } while (status == 1); + + if (status < 0) + next_hops[i] = -1; + else + next_hops[i] = (int32_t)next_hop; + } + + return 0; +} +BIND_DEFAULT_SYMBOL(rte_lpm6_lookup_bulk_func, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int32_t *next_hops, unsigned int n), + rte_lpm6_lookup_bulk_func_v1705); + +/* + * Finds a rule in rule table. + * NOTE: Valid range for depth parameter is 1 .. 128 inclusive. + */ +static inline int32_t +rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) +{ + uint32_t rule_index; + + /* Scan used rules at given depth to find rule. */ + for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) { + /* If rule is found return the rule index. */ + if ((memcmp (lpm->rules_tbl[rule_index].ip, ip, + RTE_LPM6_IPV6_ADDR_SIZE) == 0) && + lpm->rules_tbl[rule_index].depth == depth) { + + return rule_index; + } + } + + /* If rule is not found return -ENOENT. */ + return -ENOENT; +} + +/* + * Look for a rule in the high-level rules table + */ +int +rte_lpm6_is_rule_present_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint8_t *next_hop) +{ + uint32_t next_hop32 = 0; + int32_t status; + + /* DEBUG: Check user input arguments. */ + if (next_hop == NULL) + return -EINVAL; + + status = rte_lpm6_is_rule_present_v1705(lpm, ip, depth, &next_hop32); + if (status > 0) + *next_hop = (uint8_t)next_hop32; + + return status; + +} +VERSION_SYMBOL(rte_lpm6_is_rule_present, _v20, 2.0); + +int +rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t *next_hop) +{ + uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE]; + int32_t rule_index; + + /* Check user arguments. */ + if ((lpm == NULL) || next_hop == NULL || ip == NULL || + (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) + return -EINVAL; + + /* Copy the IP and mask it to avoid modifying user's input data. */ + memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE); + mask_ip(ip_masked, depth); + + /* Look for the rule using rule_find. */ + rule_index = rule_find(lpm, ip_masked, depth); + + if (rule_index >= 0) { + *next_hop = lpm->rules_tbl[rule_index].next_hop; + return 1; + } + + /* If rule is not found return 0. */ + return 0; +} +BIND_DEFAULT_SYMBOL(rte_lpm6_is_rule_present, _v1705, 17.05); +MAP_STATIC_SYMBOL(int rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, + uint8_t *ip, uint8_t depth, uint32_t *next_hop), + rte_lpm6_is_rule_present_v1705); + +/* + * Delete a rule from the rule table. + * NOTE: Valid range for depth parameter is 1 .. 128 inclusive. + */ +static inline void +rule_delete(struct rte_lpm6 *lpm, int32_t rule_index) +{ + /* + * Overwrite redundant rule with last rule in group and decrement rule + * counter. 
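A usage sketch for the bulk lookup defined above, assuming a populated table lpm6; the addresses and batch size are arbitrary.

	uint8_t ips[2][RTE_LPM6_IPV6_ADDR_SIZE] = {
		{ 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x01 },
		{ 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x02 },
	};
	int32_t next_hops[2];

	if (rte_lpm6_lookup_bulk_func(lpm6, ips, next_hops, 2) == 0) {
		/* next_hops[i] holds the next hop on a hit, or -1 on a miss */
	}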
+ */ + lpm->rules_tbl[rule_index] = lpm->rules_tbl[lpm->used_rules-1]; + lpm->used_rules--; +} + +/* + * Deletes a rule + */ +int +rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) +{ + int32_t rule_to_delete_index; + uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE]; + unsigned i; + + /* + * Check input arguments. + */ + if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) { + return -EINVAL; + } + + /* Copy the IP and mask it to avoid modifying user's input data. */ + memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE); + mask_ip(ip_masked, depth); + + /* + * Find the index of the input rule, that needs to be deleted, in the + * rule table. + */ + rule_to_delete_index = rule_find(lpm, ip_masked, depth); + + /* + * Check if rule_to_delete_index was found. If no rule was found the + * function rule_find returns -ENOENT. + */ + if (rule_to_delete_index < 0) + return rule_to_delete_index; + + /* Delete the rule from the rule table. */ + rule_delete(lpm, rule_to_delete_index); + + /* + * Set all the table entries to 0 (ie delete every rule + * from the data structure. + */ + lpm->next_tbl8 = 0; + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) + * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* + * Add every rule again (except for the one that was removed from + * the rules table). + */ + for (i = 0; i < lpm->used_rules; i++) { + rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth, + lpm->rules_tbl[i].next_hop); + } + + return 0; +} + +/* + * Deletes a group of rules + */ +int +rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n) +{ + int32_t rule_to_delete_index; + uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE]; + unsigned i; + + /* + * Check input arguments. + */ + if ((lpm == NULL) || (ips == NULL) || (depths == NULL)) { + return -EINVAL; + } + + for (i = 0; i < n; i++) { + /* Copy the IP and mask it to avoid modifying user's input data. */ + memcpy(ip_masked, ips[i], RTE_LPM6_IPV6_ADDR_SIZE); + mask_ip(ip_masked, depths[i]); + + /* + * Find the index of the input rule, that needs to be deleted, in the + * rule table. + */ + rule_to_delete_index = rule_find(lpm, ip_masked, depths[i]); + + /* + * Check if rule_to_delete_index was found. If no rule was found the + * function rule_find returns -ENOENT. + */ + if (rule_to_delete_index < 0) + continue; + + /* Delete the rule from the rule table. */ + rule_delete(lpm, rule_to_delete_index); + } + + /* + * Set all the table entries to 0 (ie delete every rule + * from the data structure. + */ + lpm->next_tbl8 = 0; + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) + * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* + * Add every rule again (except for the ones that were removed from + * the rules table). + */ + for (i = 0; i < lpm->used_rules; i++) { + rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth, + lpm->rules_tbl[i].next_hop); + } + + return 0; +} + +/* + * Delete all rules from the LPM table. + */ +void +rte_lpm6_delete_all(struct rte_lpm6 *lpm) +{ + /* Zero used rules counter. */ + lpm->used_rules = 0; + + /* Zero next tbl8 index. */ + lpm->next_tbl8 = 0; + + /* Zero tbl24. */ + memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); + + /* Zero tbl8. */ + memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s); + + /* Delete all rules form the rules table. 
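As the deletion code above shows, every IPv6 delete clears tbl24/tbl8 and re-adds the remaining rules, so batching removals through the bulk variant pays that rebuild cost once. A sketch, with hypothetical prefixes assumed to be filled in and a table lpm6 assumed in scope:

	uint8_t ips[2][RTE_LPM6_IPV6_ADDR_SIZE] = { /* prefixes to remove */ };
	uint8_t depths[2] = { 48, 64 };

	if (rte_lpm6_delete_bulk_func(lpm6, ips, depths, 2) == 0) {
		/* both rules removed; the tables were rebuilt a single time */
	}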
*/ + memset(lpm->rules_tbl, 0, sizeof(struct rte_lpm6_rule) * lpm->max_rules); +} diff --git a/src/spdk/dpdk/lib/librte_lpm/rte_lpm6.h b/src/spdk/dpdk/lib/librte_lpm/rte_lpm6.h new file mode 100644 index 00000000..5d59ccb1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/rte_lpm6.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#ifndef _RTE_LPM6_H_ +#define _RTE_LPM6_H_ + +/** + * @file + * RTE Longest Prefix Match for IPv6 (LPM6) + */ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +#define RTE_LPM6_MAX_DEPTH 128 +#define RTE_LPM6_IPV6_ADDR_SIZE 16 +/** Max number of characters in LPM name. */ +#define RTE_LPM6_NAMESIZE 32 + +/** LPM structure. */ +struct rte_lpm6; + +/** LPM configuration structure. */ +struct rte_lpm6_config { + uint32_t max_rules; /**< Max number of rules. */ + uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ + int flags; /**< This field is currently unused. */ +}; + +/** + * Create an LPM object. + * + * @param name + * LPM object name + * @param socket_id + * NUMA socket ID for LPM table memory allocation + * @param config + * Structure containing the configuration + * @return + * Handle to LPM object on success, NULL otherwise with rte_errno set + * to an appropriate values. Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - invalid parameter passed to function + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_lpm6 * +rte_lpm6_create(const char *name, int socket_id, + const struct rte_lpm6_config *config); + +/** + * Find an existing LPM object and return a pointer to it. + * + * @param name + * Name of the lpm object as passed to rte_lpm6_create() + * @return + * Pointer to lpm object or NULL if object not found with rte_errno + * set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_lpm6 * +rte_lpm6_find_existing(const char *name); + +/** + * Free an LPM object. + * + * @param lpm + * LPM object handle + * @return + * None + */ +void +rte_lpm6_free(struct rte_lpm6 *lpm); + +/** + * Add a rule to the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be added to the LPM table + * @param depth + * Depth of the rule to be added to the LPM table + * @param next_hop + * Next hop of the rule to be added to the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm6_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t next_hop); +int +rte_lpm6_add_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint8_t next_hop); +int +rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t next_hop); + +/** + * Check if a rule is present in the LPM table, + * and provide its next hop if it is. 
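A minimal end-to-end sketch of the IPv6 API declared here; the name, sizes, and next-hop value are arbitrary and error handling is abbreviated.

	struct rte_lpm6_config cfg6 = {
		.max_rules = 1024,
		.number_tbl8s = 1 << 16,
		.flags = 0,
	};
	struct rte_lpm6 *lpm6 = rte_lpm6_create("example_lpm6", SOCKET_ID_ANY, &cfg6);

	if (lpm6 != NULL) {
		/* 2001:db8::/32 -> next hop 7 */
		uint8_t prefix[RTE_LPM6_IPV6_ADDR_SIZE] = { 0x20, 0x01, 0x0d, 0xb8 };
		uint32_t next_hop = 0;

		if (rte_lpm6_add(lpm6, prefix, 32, 7) == 0 &&
				rte_lpm6_lookup(lpm6, prefix, &next_hop) == 0) {
			/* next_hop is now 7 */
		}
		rte_lpm6_free(lpm6);
	}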
+ * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be searched + * @param depth + * Depth of the rule to searched + * @param next_hop + * Next hop of the rule (valid only if it is found) + * @return + * 1 if the rule exists, 0 if it does not, a negative value on failure + */ +int +rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t *next_hop); +int +rte_lpm6_is_rule_present_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint8_t *next_hop); +int +rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, + uint32_t *next_hop); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP of the rule to be deleted from the LPM table + * @param depth + * Depth of the rule to be deleted from the LPM table + * @return + * 0 on success, negative value otherwise + */ +int +rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth); + +/** + * Delete a rule from the LPM table. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be deleted from the LPM table + * @param depths + * Array of depths of the rules to be deleted from the LPM table + * @param n + * Number of rules to be deleted from the LPM table + * @return + * 0 on success, negative value otherwise. + */ +int +rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n); + +/** + * Delete all rules from the LPM table. + * + * @param lpm + * LPM object handle + */ +void +rte_lpm6_delete_all(struct rte_lpm6 *lpm); + +/** + * Lookup an IP into the LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * IP to be looked up in the LPM table + * @param next_hop + * Next hop of the most specific rule found for IP (valid on lookup hit only) + * @return + * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit + */ +int +rte_lpm6_lookup(const struct rte_lpm6 *lpm, uint8_t *ip, uint32_t *next_hop); +int +rte_lpm6_lookup_v20(const struct rte_lpm6 *lpm, uint8_t *ip, uint8_t *next_hop); +int +rte_lpm6_lookup_v1705(const struct rte_lpm6 *lpm, uint8_t *ip, + uint32_t *next_hop); + +/** + * Lookup multiple IP addresses in an LPM table. + * + * @param lpm + * LPM object handle + * @param ips + * Array of IPs to be looked up in the LPM table + * @param next_hops + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an array of two byte values. The next hop will be stored on + * each position on success; otherwise the position will be set to -1. + * @param n + * Number of elements in ips (and next_hops) array to lookup. + * @return + * -EINVAL for incorrect arguments, otherwise 0 + */ +int +rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int32_t *next_hops, unsigned int n); +int +rte_lpm6_lookup_bulk_func_v20(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int16_t *next_hops, unsigned int n); +int +rte_lpm6_lookup_bulk_func_v1705(const struct rte_lpm6 *lpm, + uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], + int32_t *next_hops, unsigned int n); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_lpm/rte_lpm_altivec.h b/src/spdk/dpdk/lib/librte_lpm/rte_lpm_altivec.h new file mode 100644 index 00000000..e26e0875 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/rte_lpm_altivec.h @@ -0,0 +1,154 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2016. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _RTE_LPM_ALTIVEC_H_ +#define _RTE_LPM_ALTIVEC_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv) +{ + vector signed int i24; + rte_xmm_t i8; + uint32_t tbl[4]; + uint64_t idx, pt, pt2; + const uint32_t *ptbl; + + const uint32_t mask = UINT8_MAX; + const vector signed int mask8 = (xmm_t){mask, mask, mask, mask}; + + /* + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries + * as one 64-bit value (0x0300000003000000). + */ + const uint64_t mask_xv = + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); + + /* + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries + * as one 64-bit value (0x0100000001000000). + */ + const uint64_t mask_v = + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); + + /* get 4 indexes for tbl24[]. */ + i24 = vec_sr((xmm_t) ip, + (vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, CHAR_BIT}); + + /* extract values from tbl24[] */ + idx = (uint32_t)i24[0]; + idx = idx < (1<<24) ? idx : (1<<24)-1; + ptbl = (const uint32_t *)&lpm->tbl24[idx]; + tbl[0] = *ptbl; + + idx = (uint32_t) i24[1]; + idx = idx < (1<<24) ? idx : (1<<24)-1; + ptbl = (const uint32_t *)&lpm->tbl24[idx]; + tbl[1] = *ptbl; + + idx = (uint32_t) i24[2]; + idx = idx < (1<<24) ? idx : (1<<24)-1; + ptbl = (const uint32_t *)&lpm->tbl24[idx]; + tbl[2] = *ptbl; + + idx = (uint32_t) i24[3]; + idx = idx < (1<<24) ? idx : (1<<24)-1; + ptbl = (const uint32_t *)&lpm->tbl24[idx]; + tbl[3] = *ptbl; + + /* get 4 indexes for tbl8[]. */ + i8.x = vec_and(ip, mask8); + + pt = (uint64_t)tbl[0] | + (uint64_t)tbl[1] << 32; + pt2 = (uint64_t)tbl[2] | + (uint64_t)tbl[3] << 32; + + /* search successfully finished for all 4 IP addresses. 
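A brief usage sketch for the four-wide lookup implemented here; the four addresses (ip0..ip3) and the table lpm are assumed to be in scope, and rte_xmm_t is used only as a convenient way to build the vector argument.

	rte_xmm_t ips;
	uint32_t hops[4];

	ips.u32[0] = ip0;
	ips.u32[1] = ip1;
	ips.u32[2] = ip2;
	ips.u32[3] = ip3;

	rte_lpm_lookupx4(lpm, ips.x, hops, UINT32_MAX);
	/* hops[i] is the next hop on a hit, or UINT32_MAX (the default) on a miss */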
*/ + if (likely((pt & mask_xv) == mask_v) && + likely((pt2 & mask_xv) == mask_v)) { + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; + return; + } + + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[0] = i8.u32[0] + + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; + tbl[0] = *ptbl; + } + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[1] = i8.u32[1] + + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; + tbl[1] = *ptbl; + } + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[2] = i8.u32[2] + + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; + tbl[2] = *ptbl; + } + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[3] = i8.u32[3] + + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; + tbl[3] = *ptbl; + } + + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv; + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv; + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv; + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_ALTIVEC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_lpm/rte_lpm_neon.h b/src/spdk/dpdk/lib/librte_lpm/rte_lpm_neon.h new file mode 100644 index 00000000..4fd33f33 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/rte_lpm_neon.h @@ -0,0 +1,154 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2015 Cavium, Inc. All rights reserved. + * All rights reserved. + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Derived rte_lpm_lookupx4 implementation from lib/librte_lpm/rte_lpm_sse.h + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium, Inc nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_LPM_NEON_H_ +#define _RTE_LPM_NEON_H_ + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv) +{ + uint32x4_t i24; + rte_xmm_t i8; + uint32_t tbl[4]; + uint64_t idx, pt, pt2; + const uint32_t *ptbl; + + const uint32_t mask = UINT8_MAX; + const int32x4_t mask8 = vdupq_n_s32(mask); + + /* + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries + * as one 64-bit value (0x0300000003000000). + */ + const uint64_t mask_xv = + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); + + /* + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries + * as one 64-bit value (0x0100000001000000). + */ + const uint64_t mask_v = + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); + + /* get 4 indexes for tbl24[]. */ + i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT); + + /* extract values from tbl24[] */ + idx = vgetq_lane_u64((uint64x2_t)i24, 0); + + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; + tbl[0] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; + tbl[1] = *ptbl; + + idx = vgetq_lane_u64((uint64x2_t)i24, 1); + + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; + tbl[2] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; + tbl[3] = *ptbl; + + /* get 4 indexes for tbl8[]. */ + i8.x = vandq_s32(ip, mask8); + + pt = (uint64_t)tbl[0] | + (uint64_t)tbl[1] << 32; + pt2 = (uint64_t)tbl[2] | + (uint64_t)tbl[3] << 32; + + /* search successfully finished for all 4 IP addresses. */ + if (likely((pt & mask_xv) == mask_v) && + likely((pt2 & mask_xv) == mask_v)) { + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; + return; + } + + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[0] = i8.u32[0] + + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; + tbl[0] = *ptbl; + } + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[1] = i8.u32[1] + + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; + tbl[1] = *ptbl; + } + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[2] = i8.u32[2] + + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; + tbl[2] = *ptbl; + } + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[3] = i8.u32[3] + + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; + tbl[3] = *ptbl; + } + + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv; + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[1] & 0x00FFFFFF : defv; + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv; + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_NEON_H_ */ diff --git a/src/spdk/dpdk/lib/librte_lpm/rte_lpm_sse.h b/src/spdk/dpdk/lib/librte_lpm/rte_lpm_sse.h new file mode 100644 index 00000000..3b27e97e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/rte_lpm_sse.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_LPM_SSE_H_ +#define _RTE_LPM_SSE_H_ + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv) +{ + __m128i i24; + rte_xmm_t i8; + uint32_t tbl[4]; + uint64_t idx, pt, pt2; + const uint32_t *ptbl; + + const __m128i mask8 = + _mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX); + + /* + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries + * as one 64-bit value (0x0300000003000000). + */ + const uint64_t mask_xv = + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); + + /* + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries + * as one 64-bit value (0x0100000001000000). + */ + const uint64_t mask_v = + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); + + /* get 4 indexes for tbl24[]. */ + i24 = _mm_srli_epi32(ip, CHAR_BIT); + + /* extract values from tbl24[] */ + idx = _mm_cvtsi128_si64(i24); + /* With -O0 option, gcc 4.8 - 5.4 fails to fold sizeof() into a constant */ + i24 = _mm_srli_si128(i24, /* sizeof(uint64_t) */ 8); + + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; + tbl[0] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; + tbl[1] = *ptbl; + + idx = _mm_cvtsi128_si64(i24); + + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; + tbl[2] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; + tbl[3] = *ptbl; + + /* get 4 indexes for tbl8[]. */ + i8.x = _mm_and_si128(ip, mask8); + + pt = (uint64_t)tbl[0] | + (uint64_t)tbl[1] << 32; + pt2 = (uint64_t)tbl[2] | + (uint64_t)tbl[3] << 32; + + /* search successfully finished for all 4 IP addresses. */ + if (likely((pt & mask_xv) == mask_v) && + likely((pt2 & mask_xv) == mask_v)) { + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; + return; + } + + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[0] = i8.u32[0] + + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; + tbl[0] = *ptbl; + } + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[1] = i8.u32[1] + + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; + tbl[1] = *ptbl; + } + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[2] = i8.u32[2] + + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; + tbl[2] = *ptbl; + } + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[3] = i8.u32[3] + + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; + tbl[3] = *ptbl; + } + + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[0] & 0x00FFFFFF : defv; + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv; + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv; + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_SSE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_lpm/rte_lpm_version.map b/src/spdk/dpdk/lib/librte_lpm/rte_lpm_version.map new file mode 100644 index 00000000..90beac85 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_lpm/rte_lpm_version.map @@ -0,0 +1,46 @@ +DPDK_2.0 { + global: + + rte_lpm_add; + rte_lpm_create; + rte_lpm_delete; + rte_lpm_delete_all; + rte_lpm_find_existing; + rte_lpm_free; + rte_lpm_is_rule_present; + rte_lpm6_add; + rte_lpm6_create; + rte_lpm6_delete; + rte_lpm6_delete_all; + rte_lpm6_delete_bulk_func; + rte_lpm6_find_existing; + rte_lpm6_free; + rte_lpm6_is_rule_present; + rte_lpm6_lookup; + rte_lpm6_lookup_bulk_func; + + local: *; +}; + +DPDK_16.04 { + global: + + rte_lpm_add; + rte_lpm_find_existing; + rte_lpm_create; + rte_lpm_free; + rte_lpm_is_rule_present; + rte_lpm_delete; + rte_lpm_delete_all; + +} DPDK_2.0; + +DPDK_17.05 { + global: + + rte_lpm6_add; + rte_lpm6_is_rule_present; + rte_lpm6_lookup; + rte_lpm6_lookup_bulk_func; + +} DPDK_16.04; diff --git a/src/spdk/dpdk/lib/librte_mbuf/Makefile b/src/spdk/dpdk/lib/librte_mbuf/Makefile new file mode 100644 index 00000000..e2b98a25 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_mbuf.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +LDLIBS += -lrte_eal -lrte_mempool + +EXPORT_MAP := rte_mbuf_version.map + +LIBABIVER := 4 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_MBUF) := rte_mbuf.c rte_mbuf_ptype.c rte_mbuf_pool_ops.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_MBUF)-include := rte_mbuf.h rte_mbuf_ptype.h rte_mbuf_pool_ops.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_mbuf/meson.build b/src/spdk/dpdk/lib/librte_mbuf/meson.build new file mode 100644 index 00000000..45ffb0db --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/meson.build @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 3 +sources = files('rte_mbuf.c', 'rte_mbuf_ptype.c', 'rte_mbuf_pool_ops.c') +headers = files('rte_mbuf.h', 'rte_mbuf_ptype.h', 'rte_mbuf_pool_ops.h') +deps += ['mempool'] diff --git a/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.c b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.c new file mode 100644 index 00000000..e714c5a5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.c @@ -0,0 +1,463 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright 2014 6WIND S.A. 
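For reference, a minimal hedged sketch of calling the per-architecture rte_lpm_lookupx4() implementations added above (SSE, NEON and AltiVec variants). The table handle, the four addresses and the default next hop of 255 are illustrative; the sketch assumes an LPM table already built with rte_lpm_create()/rte_lpm_add(), and addresses in host byte order, since the implementations index tbl24[] with ip >> 8.

/* Sketch only: "lpm", "ip" and the default hop 255 are placeholders. */
#include <rte_lpm.h>
#include <rte_vect.h>

static void
lookup_four_ipv4(const struct rte_lpm *lpm, const uint32_t ip[4])
{
	rte_xmm_t dip;
	uint32_t hop[4];

	/* pack four host-byte-order IPv4 addresses into one vector */
	dip.u32[0] = ip[0];
	dip.u32[1] = ip[1];
	dip.u32[2] = ip[2];
	dip.u32[3] = ip[3];

	/* 255 is written for any address that has no matching route */
	rte_lpm_lookupx4(lpm, dip.x, hop, 255);

	/* hop[0..3] now hold the next-hop IDs for ip[0..3] */
}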
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * pktmbuf pool constructor, given as a callback function to + * rte_mempool_create(), or called directly if using + * rte_mempool_create_empty()/rte_mempool_populate() + */ +void +rte_pktmbuf_pool_init(struct rte_mempool *mp, void *opaque_arg) +{ + struct rte_pktmbuf_pool_private *user_mbp_priv, *mbp_priv; + struct rte_pktmbuf_pool_private default_mbp_priv; + uint16_t roomsz; + + RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf)); + + /* if no structure is provided, assume no mbuf private area */ + user_mbp_priv = opaque_arg; + if (user_mbp_priv == NULL) { + default_mbp_priv.mbuf_priv_size = 0; + if (mp->elt_size > sizeof(struct rte_mbuf)) + roomsz = mp->elt_size - sizeof(struct rte_mbuf); + else + roomsz = 0; + default_mbp_priv.mbuf_data_room_size = roomsz; + user_mbp_priv = &default_mbp_priv; + } + + RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) + + user_mbp_priv->mbuf_data_room_size + + user_mbp_priv->mbuf_priv_size); + + mbp_priv = rte_mempool_get_priv(mp); + memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv)); +} + +/* + * pktmbuf constructor, given as a callback function to + * rte_mempool_obj_iter() or rte_mempool_create(). + * Set the fields of a packet mbuf to their default values. + */ +void +rte_pktmbuf_init(struct rte_mempool *mp, + __attribute__((unused)) void *opaque_arg, + void *_m, + __attribute__((unused)) unsigned i) +{ + struct rte_mbuf *m = _m; + uint32_t mbuf_size, buf_len, priv_size; + + priv_size = rte_pktmbuf_priv_size(mp); + mbuf_size = sizeof(struct rte_mbuf) + priv_size; + buf_len = rte_pktmbuf_data_room_size(mp); + + RTE_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) == priv_size); + RTE_ASSERT(mp->elt_size >= mbuf_size); + RTE_ASSERT(buf_len <= UINT16_MAX); + + memset(m, 0, mbuf_size); + /* start of buffer is after mbuf structure and priv data */ + m->priv_size = priv_size; + m->buf_addr = (char *)m + mbuf_size; + m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size; + m->buf_len = (uint16_t)buf_len; + + /* keep some headroom between start of buffer and data */ + m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len); + + /* init some constant fields */ + m->pool = mp; + m->nb_segs = 1; + m->port = MBUF_INVALID_PORT; + rte_mbuf_refcnt_set(m, 1); + m->next = NULL; +} + +/* Helper to create a mbuf pool with given mempool ops name*/ +struct rte_mempool * +rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n, + unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size, + int socket_id, const char *ops_name) +{ + struct rte_mempool *mp; + struct rte_pktmbuf_pool_private mbp_priv; + const char *mp_ops_name = ops_name; + unsigned elt_size; + int ret; + + if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) { + RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n", + priv_size); + rte_errno = EINVAL; + return NULL; + } + elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size + + (unsigned)data_room_size; + mbp_priv.mbuf_data_room_size = data_room_size; + mbp_priv.mbuf_priv_size = priv_size; + + mp = rte_mempool_create_empty(name, n, elt_size, cache_size, + sizeof(struct rte_pktmbuf_pool_private), socket_id, 0); + if (mp == NULL) + return NULL; + + if (mp_ops_name == NULL) + mp_ops_name = rte_mbuf_best_mempool_ops(); + ret = 
rte_mempool_set_ops_byname(mp, mp_ops_name, NULL); + if (ret != 0) { + RTE_LOG(ERR, MBUF, "error setting mempool handler\n"); + rte_mempool_free(mp); + rte_errno = -ret; + return NULL; + } + rte_pktmbuf_pool_init(mp, &mbp_priv); + + ret = rte_mempool_populate_default(mp); + if (ret < 0) { + rte_mempool_free(mp); + rte_errno = -ret; + return NULL; + } + + rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL); + + return mp; +} + +/* helper to create a mbuf pool */ +struct rte_mempool * +rte_pktmbuf_pool_create(const char *name, unsigned int n, + unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size, + int socket_id) +{ + return rte_pktmbuf_pool_create_by_ops(name, n, cache_size, priv_size, + data_room_size, socket_id, NULL); +} + +/* do some sanity checks on a mbuf: panic if it fails */ +void +rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header) +{ + unsigned int nb_segs, pkt_len; + + if (m == NULL) + rte_panic("mbuf is NULL\n"); + + /* generic checks */ + if (m->pool == NULL) + rte_panic("bad mbuf pool\n"); + if (m->buf_iova == 0) + rte_panic("bad IO addr\n"); + if (m->buf_addr == NULL) + rte_panic("bad virt addr\n"); + + uint16_t cnt = rte_mbuf_refcnt_read(m); + if ((cnt == 0) || (cnt == UINT16_MAX)) + rte_panic("bad ref cnt\n"); + + /* nothing to check for sub-segments */ + if (is_header == 0) + return; + + /* data_len is supposed to be not more than pkt_len */ + if (m->data_len > m->pkt_len) + rte_panic("bad data_len\n"); + + nb_segs = m->nb_segs; + pkt_len = m->pkt_len; + + do { + nb_segs -= 1; + pkt_len -= m->data_len; + } while ((m = m->next) != NULL); + + if (nb_segs) + rte_panic("bad nb_segs\n"); + if (pkt_len) + rte_panic("bad pkt_len\n"); +} + +/* dump a mbuf on console */ +void +rte_pktmbuf_dump(FILE *f, const struct rte_mbuf *m, unsigned dump_len) +{ + unsigned int len; + unsigned int nb_segs; + + __rte_mbuf_sanity_check(m, 1); + + fprintf(f, "dump mbuf at %p, iova=%"PRIx64", buf_len=%u\n", + m, (uint64_t)m->buf_iova, (unsigned)m->buf_len); + fprintf(f, " pkt_len=%"PRIu32", ol_flags=%"PRIx64", nb_segs=%u, " + "in_port=%u\n", m->pkt_len, m->ol_flags, + (unsigned)m->nb_segs, (unsigned)m->port); + nb_segs = m->nb_segs; + + while (m && nb_segs != 0) { + __rte_mbuf_sanity_check(m, 0); + + fprintf(f, " segment at %p, data=%p, data_len=%u\n", + m, rte_pktmbuf_mtod(m, void *), (unsigned)m->data_len); + len = dump_len; + if (len > m->data_len) + len = m->data_len; + if (len != 0) + rte_hexdump(f, NULL, rte_pktmbuf_mtod(m, void *), len); + dump_len -= len; + m = m->next; + nb_segs --; + } +} + +/* read len data bytes in a mbuf at specified offset (internal) */ +const void *__rte_pktmbuf_read(const struct rte_mbuf *m, uint32_t off, + uint32_t len, void *buf) +{ + const struct rte_mbuf *seg = m; + uint32_t buf_off = 0, copy_len; + + if (off + len > rte_pktmbuf_pkt_len(m)) + return NULL; + + while (off >= rte_pktmbuf_data_len(seg)) { + off -= rte_pktmbuf_data_len(seg); + seg = seg->next; + } + + if (off + len <= rte_pktmbuf_data_len(seg)) + return rte_pktmbuf_mtod_offset(seg, char *, off); + + /* rare case: header is split among several segments */ + while (len > 0) { + copy_len = rte_pktmbuf_data_len(seg) - off; + if (copy_len > len) + copy_len = len; + rte_memcpy((char *)buf + buf_off, + rte_pktmbuf_mtod_offset(seg, char *, off), copy_len); + off = 0; + buf_off += copy_len; + len -= copy_len; + seg = seg->next; + } + + return buf; +} + +/* + * Get the name of a RX offload flag. Must be kept synchronized with flag + * definitions in rte_mbuf.h. 
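A hedged sketch of calling the pool helpers implemented above. The pool name "pktmbuf_pool", the 8191/250 sizing and SOCKET_ID_ANY are illustrative values, not taken from this patch.

#include <stdio.h>
#include <rte_errno.h>
#include <rte_mbuf.h>

static struct rte_mempool *
make_pkt_pool(void)
{
	/* 2^13 - 1 mbufs (the "power of two minus one" sizing suggested by
	 * the API docs), 250-entry per-lcore cache, no application private
	 * area, default 2 KB data room plus headroom, any NUMA socket. */
	struct rte_mempool *mp = rte_pktmbuf_pool_create("pktmbuf_pool",
			8191, 250, 0, RTE_MBUF_DEFAULT_BUF_SIZE, SOCKET_ID_ANY);

	if (mp == NULL)
		printf("mbuf pool creation failed: %s\n",
				rte_strerror(rte_errno));
	return mp;
}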
+ */ +const char *rte_get_rx_ol_flag_name(uint64_t mask) +{ + switch (mask) { + case PKT_RX_VLAN: return "PKT_RX_VLAN"; + case PKT_RX_RSS_HASH: return "PKT_RX_RSS_HASH"; + case PKT_RX_FDIR: return "PKT_RX_FDIR"; + case PKT_RX_L4_CKSUM_BAD: return "PKT_RX_L4_CKSUM_BAD"; + case PKT_RX_L4_CKSUM_GOOD: return "PKT_RX_L4_CKSUM_GOOD"; + case PKT_RX_L4_CKSUM_NONE: return "PKT_RX_L4_CKSUM_NONE"; + case PKT_RX_IP_CKSUM_BAD: return "PKT_RX_IP_CKSUM_BAD"; + case PKT_RX_IP_CKSUM_GOOD: return "PKT_RX_IP_CKSUM_GOOD"; + case PKT_RX_IP_CKSUM_NONE: return "PKT_RX_IP_CKSUM_NONE"; + case PKT_RX_EIP_CKSUM_BAD: return "PKT_RX_EIP_CKSUM_BAD"; + case PKT_RX_VLAN_STRIPPED: return "PKT_RX_VLAN_STRIPPED"; + case PKT_RX_IEEE1588_PTP: return "PKT_RX_IEEE1588_PTP"; + case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST"; + case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED"; + case PKT_RX_LRO: return "PKT_RX_LRO"; + case PKT_RX_TIMESTAMP: return "PKT_RX_TIMESTAMP"; + case PKT_RX_SEC_OFFLOAD: return "PKT_RX_SEC_OFFLOAD"; + case PKT_RX_SEC_OFFLOAD_FAILED: return "PKT_RX_SEC_OFFLOAD_FAILED"; + default: return NULL; + } +} + +struct flag_mask { + uint64_t flag; + uint64_t mask; + const char *default_name; +}; + +/* write the list of rx ol flags in buffer buf */ +int +rte_get_rx_ol_flag_list(uint64_t mask, char *buf, size_t buflen) +{ + const struct flag_mask rx_flags[] = { + { PKT_RX_VLAN, PKT_RX_VLAN, NULL }, + { PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, NULL }, + { PKT_RX_FDIR, PKT_RX_FDIR, NULL }, + { PKT_RX_L4_CKSUM_BAD, PKT_RX_L4_CKSUM_MASK, NULL }, + { PKT_RX_L4_CKSUM_GOOD, PKT_RX_L4_CKSUM_MASK, NULL }, + { PKT_RX_L4_CKSUM_NONE, PKT_RX_L4_CKSUM_MASK, NULL }, + { PKT_RX_L4_CKSUM_UNKNOWN, PKT_RX_L4_CKSUM_MASK, + "PKT_RX_L4_CKSUM_UNKNOWN" }, + { PKT_RX_IP_CKSUM_BAD, PKT_RX_IP_CKSUM_MASK, NULL }, + { PKT_RX_IP_CKSUM_GOOD, PKT_RX_IP_CKSUM_MASK, NULL }, + { PKT_RX_IP_CKSUM_NONE, PKT_RX_IP_CKSUM_MASK, NULL }, + { PKT_RX_IP_CKSUM_UNKNOWN, PKT_RX_IP_CKSUM_MASK, + "PKT_RX_IP_CKSUM_UNKNOWN" }, + { PKT_RX_EIP_CKSUM_BAD, PKT_RX_EIP_CKSUM_BAD, NULL }, + { PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN_STRIPPED, NULL }, + { PKT_RX_IEEE1588_PTP, PKT_RX_IEEE1588_PTP, NULL }, + { PKT_RX_IEEE1588_TMST, PKT_RX_IEEE1588_TMST, NULL }, + { PKT_RX_QINQ_STRIPPED, PKT_RX_QINQ_STRIPPED, NULL }, + { PKT_RX_LRO, PKT_RX_LRO, NULL }, + { PKT_RX_TIMESTAMP, PKT_RX_TIMESTAMP, NULL }, + { PKT_RX_SEC_OFFLOAD, PKT_RX_SEC_OFFLOAD, NULL }, + { PKT_RX_SEC_OFFLOAD_FAILED, PKT_RX_SEC_OFFLOAD_FAILED, NULL }, + { PKT_RX_QINQ, PKT_RX_QINQ, NULL }, + }; + const char *name; + unsigned int i; + int ret; + + if (buflen == 0) + return -1; + + buf[0] = '\0'; + for (i = 0; i < RTE_DIM(rx_flags); i++) { + if ((mask & rx_flags[i].mask) != rx_flags[i].flag) + continue; + name = rte_get_rx_ol_flag_name(rx_flags[i].flag); + if (name == NULL) + name = rx_flags[i].default_name; + ret = snprintf(buf, buflen, "%s ", name); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + + return 0; +} + +/* + * Get the name of a TX offload flag. Must be kept synchronized with flag + * definitions in rte_mbuf.h. 
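A small hedged sketch of decoding a received mbuf's offload flags with rte_get_rx_ol_flag_list() as implemented above; the buffer size and the mbuf variable are illustrative.

#include <stdio.h>
#include <rte_mbuf.h>

static void
print_rx_flags(const struct rte_mbuf *m)
{
	char buf[256];

	/* writes a space-separated list such as
	 * "PKT_RX_VLAN PKT_RX_VLAN_STRIPPED PKT_RX_IP_CKSUM_GOOD " */
	if (rte_get_rx_ol_flag_list(m->ol_flags, buf, sizeof(buf)) == 0)
		printf("rx ol_flags: %s\n", buf);
}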
+ */ +const char *rte_get_tx_ol_flag_name(uint64_t mask) +{ + switch (mask) { + case PKT_TX_VLAN_PKT: return "PKT_TX_VLAN_PKT"; + case PKT_TX_IP_CKSUM: return "PKT_TX_IP_CKSUM"; + case PKT_TX_TCP_CKSUM: return "PKT_TX_TCP_CKSUM"; + case PKT_TX_SCTP_CKSUM: return "PKT_TX_SCTP_CKSUM"; + case PKT_TX_UDP_CKSUM: return "PKT_TX_UDP_CKSUM"; + case PKT_TX_IEEE1588_TMST: return "PKT_TX_IEEE1588_TMST"; + case PKT_TX_TCP_SEG: return "PKT_TX_TCP_SEG"; + case PKT_TX_IPV4: return "PKT_TX_IPV4"; + case PKT_TX_IPV6: return "PKT_TX_IPV6"; + case PKT_TX_OUTER_IP_CKSUM: return "PKT_TX_OUTER_IP_CKSUM"; + case PKT_TX_OUTER_IPV4: return "PKT_TX_OUTER_IPV4"; + case PKT_TX_OUTER_IPV6: return "PKT_TX_OUTER_IPV6"; + case PKT_TX_TUNNEL_VXLAN: return "PKT_TX_TUNNEL_VXLAN"; + case PKT_TX_TUNNEL_GRE: return "PKT_TX_TUNNEL_GRE"; + case PKT_TX_TUNNEL_IPIP: return "PKT_TX_TUNNEL_IPIP"; + case PKT_TX_TUNNEL_GENEVE: return "PKT_TX_TUNNEL_GENEVE"; + case PKT_TX_TUNNEL_MPLSINUDP: return "PKT_TX_TUNNEL_MPLSINUDP"; + case PKT_TX_TUNNEL_VXLAN_GPE: return "PKT_TX_TUNNEL_VXLAN_GPE"; + case PKT_TX_TUNNEL_IP: return "PKT_TX_TUNNEL_IP"; + case PKT_TX_TUNNEL_UDP: return "PKT_TX_TUNNEL_UDP"; + case PKT_TX_MACSEC: return "PKT_TX_MACSEC"; + case PKT_TX_SEC_OFFLOAD: return "PKT_TX_SEC_OFFLOAD"; + default: return NULL; + } +} + +/* write the list of tx ol flags in buffer buf */ +int +rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t buflen) +{ + const struct flag_mask tx_flags[] = { + { PKT_TX_VLAN_PKT, PKT_TX_VLAN_PKT, NULL }, + { PKT_TX_IP_CKSUM, PKT_TX_IP_CKSUM, NULL }, + { PKT_TX_TCP_CKSUM, PKT_TX_L4_MASK, NULL }, + { PKT_TX_SCTP_CKSUM, PKT_TX_L4_MASK, NULL }, + { PKT_TX_UDP_CKSUM, PKT_TX_L4_MASK, NULL }, + { PKT_TX_L4_NO_CKSUM, PKT_TX_L4_MASK, "PKT_TX_L4_NO_CKSUM" }, + { PKT_TX_IEEE1588_TMST, PKT_TX_IEEE1588_TMST, NULL }, + { PKT_TX_TCP_SEG, PKT_TX_TCP_SEG, NULL }, + { PKT_TX_IPV4, PKT_TX_IPV4, NULL }, + { PKT_TX_IPV6, PKT_TX_IPV6, NULL }, + { PKT_TX_OUTER_IP_CKSUM, PKT_TX_OUTER_IP_CKSUM, NULL }, + { PKT_TX_OUTER_IPV4, PKT_TX_OUTER_IPV4, NULL }, + { PKT_TX_OUTER_IPV6, PKT_TX_OUTER_IPV6, NULL }, + { PKT_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_MASK, + "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_TUNNEL_GRE, PKT_TX_TUNNEL_MASK, + "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_TUNNEL_IPIP, PKT_TX_TUNNEL_MASK, + "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_TUNNEL_GENEVE, PKT_TX_TUNNEL_MASK, + "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK, + "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK, + "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_TUNNEL_IP, PKT_TX_TUNNEL_MASK, + "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_TUNNEL_UDP, PKT_TX_TUNNEL_MASK, + "PKT_TX_TUNNEL_NONE" }, + { PKT_TX_MACSEC, PKT_TX_MACSEC, NULL }, + { PKT_TX_SEC_OFFLOAD, PKT_TX_SEC_OFFLOAD, NULL }, + }; + const char *name; + unsigned int i; + int ret; + + if (buflen == 0) + return -1; + + buf[0] = '\0'; + for (i = 0; i < RTE_DIM(tx_flags); i++) { + if ((mask & tx_flags[i].mask) != tx_flags[i].flag) + continue; + name = rte_get_tx_ol_flag_name(tx_flags[i].flag); + if (name == NULL) + name = tx_flags[i].default_name; + ret = snprintf(buf, buflen, "%s ", name); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.h b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.h new file mode 100644 index 00000000..9ce5d76d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf.h @@ -0,0 +1,2248 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel 
Corporation. + * Copyright 2014 6WIND S.A. + */ + +#ifndef _RTE_MBUF_H_ +#define _RTE_MBUF_H_ + +/** + * @file + * RTE Mbuf + * + * The mbuf library provides the ability to create and destroy buffers + * that may be used by the RTE application to store message + * buffers. The message buffers are stored in a mempool, using the + * RTE mempool library. + * + * The preferred way to create a mbuf pool is to use + * rte_pktmbuf_pool_create(). However, in some situations, an + * application may want to have more control (ex: populate the pool with + * specific memory), in this case it is possible to use functions from + * rte_mempool. See how rte_pktmbuf_pool_create() is implemented for + * details. + * + * This library provides an API to allocate/free packet mbufs, which are + * used to carry network packets. + * + * To understand the concepts of packet buffers or mbufs, you + * should read "TCP/IP Illustrated, Volume 2: The Implementation, + * Addison-Wesley, 1995, ISBN 0-201-63354-X from Richard Stevens" + * http://www.kohala.com/start/tcpipiv2.html + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Packet Offload Features Flags. It also carry packet type information. + * Critical resources. Both rx/tx shared these bits. Be cautious on any change + * + * - RX flags start at bit position zero, and get added to the left of previous + * flags. + * - The most-significant 3 bits are reserved for generic mbuf flags + * - TX flags therefore start at bit position 60 (i.e. 63-3), and new flags get + * added to the right of the previously defined flags i.e. they should count + * downwards, not upwards. + * + * Keep these flags synchronized with rte_get_rx_ol_flag_name() and + * rte_get_tx_ol_flag_name(). + */ + +/** + * The RX packet is a 802.1q VLAN packet, and the tci has been + * saved in in mbuf->vlan_tci. + * If the flag PKT_RX_VLAN_STRIPPED is also present, the VLAN + * header has been stripped from mbuf data, else it is still + * present. + */ +#define PKT_RX_VLAN (1ULL << 0) + +#define PKT_RX_RSS_HASH (1ULL << 1) /**< RX packet with RSS hash result. */ +#define PKT_RX_FDIR (1ULL << 2) /**< RX packet with FDIR match indicate. */ + +/** + * Deprecated. + * Checking this flag alone is deprecated: check the 2 bits of + * PKT_RX_L4_CKSUM_MASK. + * This flag was set when the L4 checksum of a packet was detected as + * wrong by the hardware. + */ +#define PKT_RX_L4_CKSUM_BAD (1ULL << 3) + +/** + * Deprecated. + * Checking this flag alone is deprecated: check the 2 bits of + * PKT_RX_IP_CKSUM_MASK. + * This flag was set when the IP checksum of a packet was detected as + * wrong by the hardware. + */ +#define PKT_RX_IP_CKSUM_BAD (1ULL << 4) + +#define PKT_RX_EIP_CKSUM_BAD (1ULL << 5) /**< External IP header checksum error. */ + +/** + * A vlan has been stripped by the hardware and its tci is saved in + * mbuf->vlan_tci. This can only happen if vlan stripping is enabled + * in the RX configuration of the PMD. + * When PKT_RX_VLAN_STRIPPED is set, PKT_RX_VLAN must also be set. + */ +#define PKT_RX_VLAN_STRIPPED (1ULL << 6) + +/** + * Mask of bits used to determine the status of RX IP checksum. 
+ * - PKT_RX_IP_CKSUM_UNKNOWN: no information about the RX IP checksum + * - PKT_RX_IP_CKSUM_BAD: the IP checksum in the packet is wrong + * - PKT_RX_IP_CKSUM_GOOD: the IP checksum in the packet is valid + * - PKT_RX_IP_CKSUM_NONE: the IP checksum is not correct in the packet + * data, but the integrity of the IP header is verified. + */ +#define PKT_RX_IP_CKSUM_MASK ((1ULL << 4) | (1ULL << 7)) + +#define PKT_RX_IP_CKSUM_UNKNOWN 0 +#define PKT_RX_IP_CKSUM_BAD (1ULL << 4) +#define PKT_RX_IP_CKSUM_GOOD (1ULL << 7) +#define PKT_RX_IP_CKSUM_NONE ((1ULL << 4) | (1ULL << 7)) + +/** + * Mask of bits used to determine the status of RX L4 checksum. + * - PKT_RX_L4_CKSUM_UNKNOWN: no information about the RX L4 checksum + * - PKT_RX_L4_CKSUM_BAD: the L4 checksum in the packet is wrong + * - PKT_RX_L4_CKSUM_GOOD: the L4 checksum in the packet is valid + * - PKT_RX_L4_CKSUM_NONE: the L4 checksum is not correct in the packet + * data, but the integrity of the L4 data is verified. + */ +#define PKT_RX_L4_CKSUM_MASK ((1ULL << 3) | (1ULL << 8)) + +#define PKT_RX_L4_CKSUM_UNKNOWN 0 +#define PKT_RX_L4_CKSUM_BAD (1ULL << 3) +#define PKT_RX_L4_CKSUM_GOOD (1ULL << 8) +#define PKT_RX_L4_CKSUM_NONE ((1ULL << 3) | (1ULL << 8)) + +#define PKT_RX_IEEE1588_PTP (1ULL << 9) /**< RX IEEE1588 L2 Ethernet PT Packet. */ +#define PKT_RX_IEEE1588_TMST (1ULL << 10) /**< RX IEEE1588 L2/L4 timestamped packet.*/ +#define PKT_RX_FDIR_ID (1ULL << 13) /**< FD id reported if FDIR match. */ +#define PKT_RX_FDIR_FLX (1ULL << 14) /**< Flexible bytes reported if FDIR match. */ + +/** + * The 2 vlans have been stripped by the hardware and their tci are + * saved in mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer). + * This can only happen if vlan stripping is enabled in the RX + * configuration of the PMD. If this flag is set, + * When PKT_RX_QINQ_STRIPPED is set, the flags (PKT_RX_VLAN | + * PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ) must also be set. + */ +#define PKT_RX_QINQ_STRIPPED (1ULL << 15) + +/** + * When packets are coalesced by a hardware or virtual driver, this flag + * can be set in the RX mbuf, meaning that the m->tso_segsz field is + * valid and is set to the segment size of original packets. + */ +#define PKT_RX_LRO (1ULL << 16) + +/** + * Indicate that the timestamp field in the mbuf is valid. + */ +#define PKT_RX_TIMESTAMP (1ULL << 17) + +/** + * Indicate that security offload processing was applied on the RX packet. + */ +#define PKT_RX_SEC_OFFLOAD (1ULL << 18) + +/** + * Indicate that security offload processing failed on the RX packet. + */ +#define PKT_RX_SEC_OFFLOAD_FAILED (1ULL << 19) + +/** + * The RX packet is a double VLAN, and the outer tci has been + * saved in in mbuf->vlan_tci_outer. + * If the flag PKT_RX_QINQ_STRIPPED is also present, both VLANs + * headers have been stripped from mbuf data, else they are still + * present. + */ +#define PKT_RX_QINQ (1ULL << 20) + +/* add new RX flags here */ + +/* add new TX flags here */ + +/** + * UDP Fragmentation Offload flag. This flag is used for enabling UDP + * fragmentation in SW or in HW. When use UFO, mbuf->tso_segsz is used + * to store the MSS of UDP fragments. + */ +#define PKT_TX_UDP_SEG (1ULL << 42) + +/** + * Request security offload processing on the TX packet. + */ +#define PKT_TX_SEC_OFFLOAD (1ULL << 43) + +/** + * Offload the MACsec. This flag must be set by the application to enable + * this offload feature for a packet to be transmitted. + */ +#define PKT_TX_MACSEC (1ULL << 44) + +/** + * Bits 45:48 used for the tunnel type. 
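A hedged sketch of consuming the 2-bit checksum status defined above: the comments recommend testing the whole PKT_RX_IP_CKSUM_MASK rather than PKT_RX_IP_CKSUM_BAD alone, so a receiver can classify packets like this (the return-value convention is illustrative).

#include <rte_mbuf.h>

static int
ipv4_cksum_status(const struct rte_mbuf *m)
{
	switch (m->ol_flags & PKT_RX_IP_CKSUM_MASK) {
	case PKT_RX_IP_CKSUM_GOOD:
		return 1;	/* checksum validated by hardware */
	case PKT_RX_IP_CKSUM_BAD:
		return 0;	/* drop or count as an error */
	default:
		return -1;	/* UNKNOWN or NONE: decide in software */
	}
}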
+ * The tunnel type must be specified for TSO or checksum on the inner part + * of tunnel packets. + * These flags can be used with PKT_TX_TCP_SEG for TSO, or PKT_TX_xxx_CKSUM. + * The mbuf fields for inner and outer header lengths are required: + * outer_l2_len, outer_l3_len, l2_len, l3_len, l4_len and tso_segsz for TSO. + */ +#define PKT_TX_TUNNEL_VXLAN (0x1ULL << 45) +#define PKT_TX_TUNNEL_GRE (0x2ULL << 45) +#define PKT_TX_TUNNEL_IPIP (0x3ULL << 45) +#define PKT_TX_TUNNEL_GENEVE (0x4ULL << 45) +/** TX packet with MPLS-in-UDP RFC 7510 header. */ +#define PKT_TX_TUNNEL_MPLSINUDP (0x5ULL << 45) +#define PKT_TX_TUNNEL_VXLAN_GPE (0x6ULL << 45) +/** + * Generic IP encapsulated tunnel type, used for TSO and checksum offload. + * It can be used for tunnels which are not standards or listed above. + * It is preferred to use specific tunnel flags like PKT_TX_TUNNEL_GRE + * or PKT_TX_TUNNEL_IPIP if possible. + * The ethdev must be configured with DEV_TX_OFFLOAD_IP_TNL_TSO. + * Outer and inner checksums are done according to the existing flags like + * PKT_TX_xxx_CKSUM. + * Specific tunnel headers that contain payload length, sequence id + * or checksum are not expected to be updated. + */ +#define PKT_TX_TUNNEL_IP (0xDULL << 45) +/** + * Generic UDP encapsulated tunnel type, used for TSO and checksum offload. + * UDP tunnel type implies outer IP layer. + * It can be used for tunnels which are not standards or listed above. + * It is preferred to use specific tunnel flags like PKT_TX_TUNNEL_VXLAN + * if possible. + * The ethdev must be configured with DEV_TX_OFFLOAD_UDP_TNL_TSO. + * Outer and inner checksums are done according to the existing flags like + * PKT_TX_xxx_CKSUM. + * Specific tunnel headers that contain payload length, sequence id + * or checksum are not expected to be updated. + */ +#define PKT_TX_TUNNEL_UDP (0xEULL << 45) +/* add new TX TUNNEL type here */ +#define PKT_TX_TUNNEL_MASK (0xFULL << 45) + +/** + * Second VLAN insertion (QinQ) flag. + */ +#define PKT_TX_QINQ (1ULL << 49) /**< TX packet with double VLAN inserted. */ +/* this old name is deprecated */ +#define PKT_TX_QINQ_PKT PKT_TX_QINQ + +/** + * TCP segmentation offload. To enable this offload feature for a + * packet to be transmitted on hardware supporting TSO: + * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag implies + * PKT_TX_TCP_CKSUM) + * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 + * - if it's IPv4, set the PKT_TX_IP_CKSUM flag + * - fill the mbuf offload information: l2_len, l3_len, l4_len, tso_segsz + */ +#define PKT_TX_TCP_SEG (1ULL << 50) + +#define PKT_TX_IEEE1588_TMST (1ULL << 51) /**< TX IEEE1588 packet to timestamp. */ + +/** + * Bits 52+53 used for L4 packet type with checksum enabled: 00: Reserved, + * 01: TCP checksum, 10: SCTP checksum, 11: UDP checksum. To use hardware + * L4 checksum offload, the user needs to: + * - fill l2_len and l3_len in mbuf + * - set the flags PKT_TX_TCP_CKSUM, PKT_TX_SCTP_CKSUM or PKT_TX_UDP_CKSUM + * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 + */ +#define PKT_TX_L4_NO_CKSUM (0ULL << 52) /**< Disable L4 cksum of TX pkt. */ +#define PKT_TX_TCP_CKSUM (1ULL << 52) /**< TCP cksum of TX pkt. computed by NIC. */ +#define PKT_TX_SCTP_CKSUM (2ULL << 52) /**< SCTP cksum of TX pkt. computed by NIC. */ +#define PKT_TX_UDP_CKSUM (3ULL << 52) /**< UDP cksum of TX pkt. computed by NIC. */ +#define PKT_TX_L4_MASK (3ULL << 52) /**< Mask for L4 cksum offload request. */ + +/** + * Offload the IP checksum in the hardware. 
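Following the TSO and L4-checksum rules documented with PKT_TX_TCP_SEG above, a hedged sketch of preparing one outgoing IPv4/TCP mbuf. PKT_TX_IPV4 and PKT_TX_IP_CKSUM are defined just below, and l2_len/l3_len/l4_len/tso_segsz are struct rte_mbuf fields declared later in this header; the header lengths and MSS are example values and the driver is assumed to advertise the matching DEV_TX_OFFLOAD_* capabilities.

#include <rte_mbuf.h>

/* Request TCP segmentation (which implies the TCP checksum), mark the
 * packet as IPv4 with IP checksum offload, and fill the header-length
 * and segment-size fields the hardware needs. Lengths are examples. */
static void
request_tcp_tso(struct rte_mbuf *m)
{
	m->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
	m->l2_len = 14;		/* Ethernet header */
	m->l3_len = 20;		/* IPv4 header without options */
	m->l4_len = 20;		/* TCP header without options */
	m->tso_segsz = 1460;	/* MSS for the generated segments */
}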
The flag PKT_TX_IPV4 should + * also be set by the application, although a PMD will only check + * PKT_TX_IP_CKSUM. + * - fill the mbuf offload information: l2_len, l3_len + */ +#define PKT_TX_IP_CKSUM (1ULL << 54) + +/** + * Packet is IPv4. This flag must be set when using any offload feature + * (TSO, L3 or L4 checksum) to tell the NIC that the packet is an IPv4 + * packet. If the packet is a tunneled packet, this flag is related to + * the inner headers. + */ +#define PKT_TX_IPV4 (1ULL << 55) + +/** + * Packet is IPv6. This flag must be set when using an offload feature + * (TSO or L4 checksum) to tell the NIC that the packet is an IPv6 + * packet. If the packet is a tunneled packet, this flag is related to + * the inner headers. + */ +#define PKT_TX_IPV6 (1ULL << 56) + +/** + * TX packet is a 802.1q VLAN packet. + */ +#define PKT_TX_VLAN (1ULL << 57) +/* this old name is deprecated */ +#define PKT_TX_VLAN_PKT PKT_TX_VLAN + +/** + * Offload the IP checksum of an external header in the hardware. The + * flag PKT_TX_OUTER_IPV4 should also be set by the application, although + * a PMD will only check PKT_TX_OUTER_IP_CKSUM. + * - fill the mbuf offload information: outer_l2_len, outer_l3_len + */ +#define PKT_TX_OUTER_IP_CKSUM (1ULL << 58) + +/** + * Packet outer header is IPv4. This flag must be set when using any + * outer offload feature (L3 or L4 checksum) to tell the NIC that the + * outer header of the tunneled packet is an IPv4 packet. + */ +#define PKT_TX_OUTER_IPV4 (1ULL << 59) + +/** + * Packet outer header is IPv6. This flag must be set when using any + * outer offload feature (L4 checksum) to tell the NIC that the outer + * header of the tunneled packet is an IPv6 packet. + */ +#define PKT_TX_OUTER_IPV6 (1ULL << 60) + +/** + * Bitmask of all supported packet Tx offload features flags, + * which can be set for packet. + */ +#define PKT_TX_OFFLOAD_MASK ( \ + PKT_TX_IP_CKSUM | \ + PKT_TX_L4_MASK | \ + PKT_TX_OUTER_IP_CKSUM | \ + PKT_TX_TCP_SEG | \ + PKT_TX_IEEE1588_TMST | \ + PKT_TX_QINQ_PKT | \ + PKT_TX_VLAN_PKT | \ + PKT_TX_TUNNEL_MASK | \ + PKT_TX_MACSEC | \ + PKT_TX_SEC_OFFLOAD) + +/** + * Mbuf having an external buffer attached. shinfo in mbuf must be filled. + */ +#define EXT_ATTACHED_MBUF (1ULL << 61) + +#define IND_ATTACHED_MBUF (1ULL << 62) /**< Indirect attached mbuf */ + +/** Alignment constraint of mbuf private area. */ +#define RTE_MBUF_PRIV_ALIGN 8 + +/** + * Get the name of a RX offload flag + * + * @param mask + * The mask describing the flag. + * @return + * The name of this flag, or NULL if it's not a valid RX flag. + */ +const char *rte_get_rx_ol_flag_name(uint64_t mask); + +/** + * Dump the list of RX offload flags in a buffer + * + * @param mask + * The mask describing the RX flags. + * @param buf + * The output buffer. + * @param buflen + * The length of the buffer. + * @return + * 0 on success, (-1) on error. + */ +int rte_get_rx_ol_flag_list(uint64_t mask, char *buf, size_t buflen); + +/** + * Get the name of a TX offload flag + * + * @param mask + * The mask describing the flag. Usually only one bit must be set. + * Several bits can be given if they belong to the same mask. + * Ex: PKT_TX_L4_MASK. + * @return + * The name of this flag, or NULL if it's not a valid TX flag. + */ +const char *rte_get_tx_ol_flag_name(uint64_t mask); + +/** + * Dump the list of TX offload flags in a buffer + * + * @param mask + * The mask describing the TX flags. + * @param buf + * The output buffer. + * @param buflen + * The length of the buffer. 
+ * @return + * 0 on success, (-1) on error. + */ +int rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t buflen); + +/** + * Some NICs need at least 2KB buffer to RX standard Ethernet frame without + * splitting it into multiple segments. + * So, for mbufs that planned to be involved into RX/TX, the recommended + * minimal buffer length is 2KB + RTE_PKTMBUF_HEADROOM. + */ +#define RTE_MBUF_DEFAULT_DATAROOM 2048 +#define RTE_MBUF_DEFAULT_BUF_SIZE \ + (RTE_MBUF_DEFAULT_DATAROOM + RTE_PKTMBUF_HEADROOM) + +/* define a set of marker types that can be used to refer to set points in the + * mbuf */ +__extension__ +typedef void *MARKER[0]; /**< generic marker for a point in a structure */ +__extension__ +typedef uint8_t MARKER8[0]; /**< generic marker with 1B alignment */ +__extension__ +typedef uint64_t MARKER64[0]; /**< marker that allows us to overwrite 8 bytes + * with a single assignment */ + +/** + * The generic rte_mbuf, containing a packet mbuf. + */ +struct rte_mbuf { + MARKER cacheline0; + + void *buf_addr; /**< Virtual address of segment buffer. */ + /** + * Physical address of segment buffer. + * Force alignment to 8-bytes, so as to ensure we have the exact + * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes + * working on vector drivers easier. + */ + RTE_STD_C11 + union { + rte_iova_t buf_iova; + rte_iova_t buf_physaddr; /**< deprecated */ + } __rte_aligned(sizeof(rte_iova_t)); + + /* next 8 bytes are initialised on RX descriptor rearm */ + MARKER64 rearm_data; + uint16_t data_off; + + /** + * Reference counter. Its size should at least equal to the size + * of port field (16 bits), to support zero-copy broadcast. + * It should only be accessed using the following functions: + * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and + * rte_mbuf_refcnt_set(). The functionality of these functions (atomic, + * or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC + * config option. + */ + RTE_STD_C11 + union { + rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ + uint16_t refcnt; /**< Non-atomically accessed refcnt */ + }; + uint16_t nb_segs; /**< Number of segments. */ + + /** Input port (16 bits to support more than 256 virtual ports). */ + uint16_t port; + + uint64_t ol_flags; /**< Offload features. */ + + /* remaining bytes are set on RX when pulling packet from descriptor */ + MARKER rx_descriptor_fields1; + + /* + * The packet type, which is the combination of outer/inner L2, L3, L4 + * and tunnel types. The packet_type is about data really present in the + * mbuf. Example: if vlan stripping is enabled, a received vlan packet + * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the + * vlan is stripped from the data. + */ + RTE_STD_C11 + union { + uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */ + struct { + uint32_t l2_type:4; /**< (Outer) L2 type. */ + uint32_t l3_type:4; /**< (Outer) L3 type. */ + uint32_t l4_type:4; /**< (Outer) L4 type. */ + uint32_t tun_type:4; /**< Tunnel type. */ + RTE_STD_C11 + union { + uint8_t inner_esp_next_proto; + /**< ESP next protocol type, valid if + * RTE_PTYPE_TUNNEL_ESP tunnel type is set + * on both Tx and Rx. + */ + __extension__ + struct { + uint8_t inner_l2_type:4; + /**< Inner L2 type. */ + uint8_t inner_l3_type:4; + /**< Inner L3 type. */ + }; + }; + uint32_t inner_l4_type:4; /**< Inner L4 type. */ + }; + }; + + uint32_t pkt_len; /**< Total pkt len: sum of all segments. */ + uint16_t data_len; /**< Amount of data in segment buffer. 
*/ + /** VLAN TCI (CPU order), valid if PKT_RX_VLAN is set. */ + uint16_t vlan_tci; + + union { + uint32_t rss; /**< RSS hash result if RSS enabled */ + struct { + RTE_STD_C11 + union { + struct { + uint16_t hash; + uint16_t id; + }; + uint32_t lo; + /**< Second 4 flexible bytes */ + }; + uint32_t hi; + /**< First 4 flexible bytes or FD ID, dependent on + PKT_RX_FDIR_* flag in ol_flags. */ + } fdir; /**< Filter identifier if FDIR enabled */ + struct { + uint32_t lo; + uint32_t hi; + } sched; /**< Hierarchical scheduler */ + uint32_t usr; /**< User defined tags. See rte_distributor_process() */ + } hash; /**< hash information */ + + /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ is set. */ + uint16_t vlan_tci_outer; + + uint16_t buf_len; /**< Length of segment buffer. */ + + /** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference + * are not normalized but are always the same for a given port. + */ + uint64_t timestamp; + + /* second cache line - fields only used in slow path or on TX */ + MARKER cacheline1 __rte_cache_min_aligned; + + RTE_STD_C11 + union { + void *userdata; /**< Can be used for external metadata */ + uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */ + }; + + struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */ + struct rte_mbuf *next; /**< Next segment of scattered packet. */ + + /* fields to support TX offloads */ + RTE_STD_C11 + union { + uint64_t tx_offload; /**< combined for easy fetch */ + __extension__ + struct { + uint64_t l2_len:7; + /**< L2 (MAC) Header Length for non-tunneling pkt. + * Outer_L4_len + ... + Inner_L2_len for tunneling pkt. + */ + uint64_t l3_len:9; /**< L3 (IP) Header Length. */ + uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */ + uint64_t tso_segsz:16; /**< TCP TSO segment size */ + + /* fields for TX offloading of tunnels */ + uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */ + uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */ + + /* uint64_t unused:8; */ + }; + }; + + /** Size of the application private data. In case of an indirect + * mbuf, it stores the direct mbuf private data size. */ + uint16_t priv_size; + + /** Timesync flags for use with IEEE1588. */ + uint16_t timesync; + + /** Sequence number. See also rte_reorder_insert(). */ + uint32_t seqn; + + /** Shared data for external buffer attached to mbuf. See + * rte_pktmbuf_attach_extbuf(). + */ + struct rte_mbuf_ext_shared_info *shinfo; + +} __rte_cache_aligned; + +/** + * Function typedef of callback to free externally attached buffer. + */ +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque); + +/** + * Shared data at the end of an external buffer. + */ +struct rte_mbuf_ext_shared_info { + rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */ + void *fcb_opaque; /**< Free callback argument */ + rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ +}; + +/**< Maximum number of nb_segs allowed. */ +#define RTE_MBUF_MAX_NB_SEGS UINT16_MAX + +/** + * Prefetch the first part of the mbuf + * + * The first 64 bytes of the mbuf corresponds to fields that are used early + * in the receive path. If the cache line of the architecture is higher than + * 64B, the second part will also be prefetched. + * + * @param m + * The pointer to the mbuf. 
+ */ +static inline void +rte_mbuf_prefetch_part1(struct rte_mbuf *m) +{ + rte_prefetch0(&m->cacheline0); +} + +/** + * Prefetch the second part of the mbuf + * + * The next 64 bytes of the mbuf corresponds to fields that are used in the + * transmit path. If the cache line of the architecture is higher than 64B, + * this function does nothing as it is expected that the full mbuf is + * already in cache. + * + * @param m + * The pointer to the mbuf. + */ +static inline void +rte_mbuf_prefetch_part2(struct rte_mbuf *m) +{ +#if RTE_CACHE_LINE_SIZE == 64 + rte_prefetch0(&m->cacheline1); +#else + RTE_SET_USED(m); +#endif +} + + +static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp); + +/** + * Return the IO address of the beginning of the mbuf data + * + * @param mb + * The pointer to the mbuf. + * @return + * The IO address of the beginning of the mbuf data + */ +static inline rte_iova_t +rte_mbuf_data_iova(const struct rte_mbuf *mb) +{ + return mb->buf_iova + mb->data_off; +} + +__rte_deprecated +static inline phys_addr_t +rte_mbuf_data_dma_addr(const struct rte_mbuf *mb) +{ + return rte_mbuf_data_iova(mb); +} + +/** + * Return the default IO address of the beginning of the mbuf data + * + * This function is used by drivers in their receive function, as it + * returns the location where data should be written by the NIC, taking + * the default headroom in account. + * + * @param mb + * The pointer to the mbuf. + * @return + * The IO address of the beginning of the mbuf data + */ +static inline rte_iova_t +rte_mbuf_data_iova_default(const struct rte_mbuf *mb) +{ + return mb->buf_iova + RTE_PKTMBUF_HEADROOM; +} + +__rte_deprecated +static inline phys_addr_t +rte_mbuf_data_dma_addr_default(const struct rte_mbuf *mb) +{ + return rte_mbuf_data_iova_default(mb); +} + +/** + * Return the mbuf owning the data buffer address of an indirect mbuf. + * + * @param mi + * The pointer to the indirect mbuf. + * @return + * The address of the direct mbuf corresponding to buffer_addr. + */ +static inline struct rte_mbuf * +rte_mbuf_from_indirect(struct rte_mbuf *mi) +{ + return (struct rte_mbuf *)RTE_PTR_SUB(mi->buf_addr, sizeof(*mi) + mi->priv_size); +} + +/** + * Return the buffer address embedded in the given mbuf. + * + * @param md + * The pointer to the mbuf. + * @return + * The address of the data buffer owned by the mbuf. + */ +static inline char * +rte_mbuf_to_baddr(struct rte_mbuf *md) +{ + char *buffer_addr; + buffer_addr = (char *)md + sizeof(*md) + rte_pktmbuf_priv_size(md->pool); + return buffer_addr; +} + +/** + * Return the starting address of the private data area embedded in + * the given mbuf. + * + * Note that no check is made to ensure that a private data area + * actually exists in the supplied mbuf. + * + * @param m + * The pointer to the mbuf. + * @return + * The starting address of the private data area of the given mbuf. + */ +static inline void * __rte_experimental +rte_mbuf_to_priv(struct rte_mbuf *m) +{ + return RTE_PTR_ADD(m, sizeof(struct rte_mbuf)); +} + +/** + * Returns TRUE if given mbuf is cloned by mbuf indirection, or FALSE + * otherwise. + * + * If a mbuf has its data in another mbuf and references it by mbuf + * indirection, this mbuf can be defined as a cloned mbuf. + */ +#define RTE_MBUF_CLONED(mb) ((mb)->ol_flags & IND_ATTACHED_MBUF) + +/** + * Deprecated. + * Use RTE_MBUF_CLONED(). + */ +#define RTE_MBUF_INDIRECT(mb) RTE_MBUF_CLONED(mb) + +/** + * Returns TRUE if given mbuf has an external buffer, or FALSE otherwise. 
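A hedged sketch of the usual burst pattern built on rte_mbuf_prefetch_part1() above: prefetch the first cache line of upcoming mbufs while handling the current one, so the RX-path fields are warm when read. The burst handling itself is only indicated by a comment.

#include <rte_mbuf.h>

static void
handle_burst(struct rte_mbuf *pkts[], unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (i + 1 < n)
			rte_mbuf_prefetch_part1(pkts[i + 1]);
		/* ... process pkts[i] ... */
	}
}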
+ * + * External buffer is a user-provided anonymous buffer. + */ +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF) + +/** + * Returns TRUE if given mbuf is direct, or FALSE otherwise. + * + * If a mbuf embeds its own data after the rte_mbuf structure, this mbuf + * can be defined as a direct mbuf. + */ +#define RTE_MBUF_DIRECT(mb) \ + (!((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF))) + +/** + * Private data in case of pktmbuf pool. + * + * A structure that contains some pktmbuf_pool-specific data that are + * appended after the mempool structure (in private data). + */ +struct rte_pktmbuf_pool_private { + uint16_t mbuf_data_room_size; /**< Size of data space in each mbuf. */ + uint16_t mbuf_priv_size; /**< Size of private area in each mbuf. */ +}; + +#ifdef RTE_LIBRTE_MBUF_DEBUG + +/** check mbuf type in debug mode */ +#define __rte_mbuf_sanity_check(m, is_h) rte_mbuf_sanity_check(m, is_h) + +#else /* RTE_LIBRTE_MBUF_DEBUG */ + +/** check mbuf type in debug mode */ +#define __rte_mbuf_sanity_check(m, is_h) do { } while (0) + +#endif /* RTE_LIBRTE_MBUF_DEBUG */ + +#ifdef RTE_MBUF_REFCNT_ATOMIC + +/** + * Reads the value of an mbuf's refcnt. + * @param m + * Mbuf to read + * @return + * Reference count number. + */ +static inline uint16_t +rte_mbuf_refcnt_read(const struct rte_mbuf *m) +{ + return (uint16_t)(rte_atomic16_read(&m->refcnt_atomic)); +} + +/** + * Sets an mbuf's refcnt to a defined value. + * @param m + * Mbuf to update + * @param new_value + * Value set + */ +static inline void +rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value) +{ + rte_atomic16_set(&m->refcnt_atomic, (int16_t)new_value); +} + +/* internal */ +static inline uint16_t +__rte_mbuf_refcnt_update(struct rte_mbuf *m, int16_t value) +{ + return (uint16_t)(rte_atomic16_add_return(&m->refcnt_atomic, value)); +} + +/** + * Adds given value to an mbuf's refcnt and returns its new value. + * @param m + * Mbuf to update + * @param value + * Value to add/subtract + * @return + * Updated value + */ +static inline uint16_t +rte_mbuf_refcnt_update(struct rte_mbuf *m, int16_t value) +{ + /* + * The atomic_add is an expensive operation, so we don't want to + * call it in the case where we know we are the uniq holder of + * this mbuf (i.e. ref_cnt == 1). Otherwise, an atomic + * operation has to be used because concurrent accesses on the + * reference counter can occur. + */ + if (likely(rte_mbuf_refcnt_read(m) == 1)) { + ++value; + rte_mbuf_refcnt_set(m, (uint16_t)value); + return (uint16_t)value; + } + + return __rte_mbuf_refcnt_update(m, value); +} + +#else /* ! RTE_MBUF_REFCNT_ATOMIC */ + +/* internal */ +static inline uint16_t +__rte_mbuf_refcnt_update(struct rte_mbuf *m, int16_t value) +{ + m->refcnt = (uint16_t)(m->refcnt + value); + return m->refcnt; +} + +/** + * Adds given value to an mbuf's refcnt and returns its new value. + */ +static inline uint16_t +rte_mbuf_refcnt_update(struct rte_mbuf *m, int16_t value) +{ + return __rte_mbuf_refcnt_update(m, value); +} + +/** + * Reads the value of an mbuf's refcnt. + */ +static inline uint16_t +rte_mbuf_refcnt_read(const struct rte_mbuf *m) +{ + return m->refcnt; +} + +/** + * Sets an mbuf's refcnt to the defined value. + */ +static inline void +rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value) +{ + m->refcnt = new_value; +} + +#endif /* RTE_MBUF_REFCNT_ATOMIC */ + +/** + * Reads the refcnt of an external buffer. + * + * @param shinfo + * Shared data of the external buffer. + * @return + * Reference count number. 
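Given the reference-counting helpers above, a hedged sketch of the common zero-copy sharing pattern; the two consumers are hypothetical and rte_pktmbuf_free() is declared further down in this header.

#include <rte_mbuf.h>

/* Hand the same packet to two consumers without copying it: take one
 * extra reference first; each consumer calls rte_pktmbuf_free(), and the
 * mbuf returns to its pool only after the second free. */
static void
share_packet(struct rte_mbuf *m)
{
	rte_mbuf_refcnt_update(m, 1);	/* refcnt: 1 -> 2 */
	/* ... enqueue m to consumer A ... */
	/* ... enqueue m to consumer B ... */
}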
+ */ +static inline uint16_t +rte_mbuf_ext_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo) +{ + return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic)); +} + +/** + * Set refcnt of an external buffer. + * + * @param shinfo + * Shared data of the external buffer. + * @param new_value + * Value set + */ +static inline void +rte_mbuf_ext_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo, + uint16_t new_value) +{ + rte_atomic16_set(&shinfo->refcnt_atomic, (int16_t)new_value); +} + +/** + * Add given value to refcnt of an external buffer and return its new + * value. + * + * @param shinfo + * Shared data of the external buffer. + * @param value + * Value to add/subtract + * @return + * Updated value + */ +static inline uint16_t +rte_mbuf_ext_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo, + int16_t value) +{ + if (likely(rte_mbuf_ext_refcnt_read(shinfo) == 1)) { + ++value; + rte_mbuf_ext_refcnt_set(shinfo, (uint16_t)value); + return (uint16_t)value; + } + + return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value); +} + +/** Mbuf prefetch */ +#define RTE_MBUF_PREFETCH_TO_FREE(m) do { \ + if ((m) != NULL) \ + rte_prefetch0(m); \ +} while (0) + + +/** + * Sanity checks on an mbuf. + * + * Check the consistency of the given mbuf. The function will cause a + * panic if corruption is detected. + * + * @param m + * The mbuf to be checked. + * @param is_header + * True if the mbuf is a packet header, false if it is a sub-segment + * of a packet (in this case, some fields like nb_segs are not checked) + */ +void +rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header); + +#define MBUF_RAW_ALLOC_CHECK(m) do { \ + RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1); \ + RTE_ASSERT((m)->next == NULL); \ + RTE_ASSERT((m)->nb_segs == 1); \ + __rte_mbuf_sanity_check(m, 0); \ +} while (0) + +/** + * Allocate an uninitialized mbuf from mempool *mp*. + * + * This function can be used by PMDs (especially in RX functions) to + * allocate an uninitialized mbuf. The driver is responsible of + * initializing all the required fields. See rte_pktmbuf_reset(). + * For standard needs, prefer rte_pktmbuf_alloc(). + * + * The caller can expect that the following fields of the mbuf structure + * are initialized: buf_addr, buf_iova, buf_len, refcnt=1, nb_segs=1, + * next=NULL, pool, priv_size. The other fields must be initialized + * by the caller. + * + * @param mp + * The mempool from which mbuf is allocated. + * @return + * - The pointer to the new mbuf on success. + * - NULL if allocation failed. + */ +static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp) +{ + struct rte_mbuf *m; + + if (rte_mempool_get(mp, (void **)&m) < 0) + return NULL; + MBUF_RAW_ALLOC_CHECK(m); + return m; +} + +/** + * Put mbuf back into its original mempool. + * + * The caller must ensure that the mbuf is direct and properly + * reinitialized (refcnt=1, next=NULL, nb_segs=1), as done by + * rte_pktmbuf_prefree_seg(). + * + * This function should be used with care, when optimization is + * required. For standard needs, prefer rte_pktmbuf_free() or + * rte_pktmbuf_free_seg(). + * + * @param m + * The mbuf to be freed. 
+ */ +static __rte_always_inline void +rte_mbuf_raw_free(struct rte_mbuf *m) +{ + RTE_ASSERT(RTE_MBUF_DIRECT(m)); + RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1); + RTE_ASSERT(m->next == NULL); + RTE_ASSERT(m->nb_segs == 1); + __rte_mbuf_sanity_check(m, 0); + rte_mempool_put(m->pool, m); +} + +/* compat with older versions */ +__rte_deprecated +static inline void +__rte_mbuf_raw_free(struct rte_mbuf *m) +{ + rte_mbuf_raw_free(m); +} + +/** + * The packet mbuf constructor. + * + * This function initializes some fields in the mbuf structure that are + * not modified by the user once created (origin pool, buffer start + * address, and so on). This function is given as a callback function to + * rte_mempool_obj_iter() or rte_mempool_create() at pool creation time. + * + * @param mp + * The mempool from which mbufs originate. + * @param opaque_arg + * A pointer that can be used by the user to retrieve useful information + * for mbuf initialization. This pointer is the opaque argument passed to + * rte_mempool_obj_iter() or rte_mempool_create(). + * @param m + * The mbuf to initialize. + * @param i + * The index of the mbuf in the pool table. + */ +void rte_pktmbuf_init(struct rte_mempool *mp, void *opaque_arg, + void *m, unsigned i); + + +/** + * A packet mbuf pool constructor. + * + * This function initializes the mempool private data in the case of a + * pktmbuf pool. This private data is needed by the driver. The + * function must be called on the mempool before it is used, or it + * can be given as a callback function to rte_mempool_create() at + * pool creation. It can be extended by the user, for example, to + * provide another packet size. + * + * @param mp + * The mempool from which mbufs originate. + * @param opaque_arg + * A pointer that can be used by the user to retrieve useful information + * for mbuf initialization. This pointer is the opaque argument passed to + * rte_mempool_create(). + */ +void rte_pktmbuf_pool_init(struct rte_mempool *mp, void *opaque_arg); + +/** + * Create a mbuf pool. + * + * This function creates and initializes a packet mbuf pool. It is + * a wrapper to rte_mempool functions. + * + * @param name + * The name of the mbuf pool. + * @param n + * The number of elements in the mbuf pool. The optimum size (in terms + * of memory usage) for a mempool is when n is a power of two minus one: + * n = (2^q - 1). + * @param cache_size + * Size of the per-core object cache. See rte_mempool_create() for + * details. + * @param priv_size + * Size of application private are between the rte_mbuf structure + * and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN. + * @param data_room_size + * Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM. + * @param socket_id + * The socket identifier where the memory should be allocated. The + * value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the + * reserved zone. + * @return + * The pointer to the new allocated mempool, on success. NULL on error + * with rte_errno set appropriately. Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - cache size provided is too large, or priv_size is not aligned. 
+ * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_mempool * +rte_pktmbuf_pool_create(const char *name, unsigned n, + unsigned cache_size, uint16_t priv_size, uint16_t data_room_size, + int socket_id); + +/** + * Create a mbuf pool with a given mempool ops name + * + * This function creates and initializes a packet mbuf pool. It is + * a wrapper to rte_mempool functions. + * + * @param name + * The name of the mbuf pool. + * @param n + * The number of elements in the mbuf pool. The optimum size (in terms + * of memory usage) for a mempool is when n is a power of two minus one: + * n = (2^q - 1). + * @param cache_size + * Size of the per-core object cache. See rte_mempool_create() for + * details. + * @param priv_size + * Size of application private are between the rte_mbuf structure + * and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN. + * @param data_room_size + * Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM. + * @param socket_id + * The socket identifier where the memory should be allocated. The + * value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the + * reserved zone. + * @param ops_name + * The mempool ops name to be used for this mempool instead of + * default mempool. The value can be *NULL* to use default mempool. + * @return + * The pointer to the new allocated mempool, on success. NULL on error + * with rte_errno set appropriately. Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - cache size provided is too large, or priv_size is not aligned. + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_mempool * +rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n, + unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size, + int socket_id, const char *ops_name); + +/** + * Get the data room size of mbufs stored in a pktmbuf_pool + * + * The data room size is the amount of data that can be stored in a + * mbuf including the headroom (RTE_PKTMBUF_HEADROOM). + * + * @param mp + * The packet mbuf pool. + * @return + * The data room size of mbufs stored in this mempool. + */ +static inline uint16_t +rte_pktmbuf_data_room_size(struct rte_mempool *mp) +{ + struct rte_pktmbuf_pool_private *mbp_priv; + + mbp_priv = (struct rte_pktmbuf_pool_private *)rte_mempool_get_priv(mp); + return mbp_priv->mbuf_data_room_size; +} + +/** + * Get the application private size of mbufs stored in a pktmbuf_pool + * + * The private size of mbuf is a zone located between the rte_mbuf + * structure and the data buffer where an application can store data + * associated to a packet. + * + * @param mp + * The packet mbuf pool. + * @return + * The private size of mbufs stored in this mempool. + */ +static inline uint16_t +rte_pktmbuf_priv_size(struct rte_mempool *mp) +{ + struct rte_pktmbuf_pool_private *mbp_priv; + + mbp_priv = (struct rte_pktmbuf_pool_private *)rte_mempool_get_priv(mp); + return mbp_priv->mbuf_priv_size; +} + +/** + * Reset the data_off field of a packet mbuf to its default value. 
+ * + * The given mbuf must have only one segment, which should be empty. + * + * @param m + * The packet mbuf's data_off field has to be reset. + */ +static inline void rte_pktmbuf_reset_headroom(struct rte_mbuf *m) +{ + m->data_off = (uint16_t)RTE_MIN((uint16_t)RTE_PKTMBUF_HEADROOM, + (uint16_t)m->buf_len); +} + +/** + * Reset the fields of a packet mbuf to their default values. + * + * The given mbuf must have only one segment. + * + * @param m + * The packet mbuf to be resetted. + */ +#define MBUF_INVALID_PORT UINT16_MAX + +static inline void rte_pktmbuf_reset(struct rte_mbuf *m) +{ + m->next = NULL; + m->pkt_len = 0; + m->tx_offload = 0; + m->vlan_tci = 0; + m->vlan_tci_outer = 0; + m->nb_segs = 1; + m->port = MBUF_INVALID_PORT; + + m->ol_flags = 0; + m->packet_type = 0; + rte_pktmbuf_reset_headroom(m); + + m->data_len = 0; + __rte_mbuf_sanity_check(m, 1); +} + +/** + * Allocate a new mbuf from a mempool. + * + * This new mbuf contains one segment, which has a length of 0. The pointer + * to data is initialized to have some bytes of headroom in the buffer + * (if buffer size allows). + * + * @param mp + * The mempool from which the mbuf is allocated. + * @return + * - The pointer to the new mbuf on success. + * - NULL if allocation failed. + */ +static inline struct rte_mbuf *rte_pktmbuf_alloc(struct rte_mempool *mp) +{ + struct rte_mbuf *m; + if ((m = rte_mbuf_raw_alloc(mp)) != NULL) + rte_pktmbuf_reset(m); + return m; +} + +/** + * Allocate a bulk of mbufs, initialize refcnt and reset the fields to default + * values. + * + * @param pool + * The mempool from which mbufs are allocated. + * @param mbufs + * Array of pointers to mbufs + * @param count + * Array size + * @return + * - 0: Success + * - -ENOENT: Not enough entries in the mempool; no mbufs are retrieved. + */ +static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool, + struct rte_mbuf **mbufs, unsigned count) +{ + unsigned idx = 0; + int rc; + + rc = rte_mempool_get_bulk(pool, (void **)mbufs, count); + if (unlikely(rc)) + return rc; + + /* To understand duff's device on loop unwinding optimization, see + * https://en.wikipedia.org/wiki/Duff's_device. + * Here while() loop is used rather than do() while{} to avoid extra + * check if count is zero. + */ + switch (count % 4) { + case 0: + while (idx != count) { + MBUF_RAW_ALLOC_CHECK(mbufs[idx]); + rte_pktmbuf_reset(mbufs[idx]); + idx++; + /* fall-through */ + case 3: + MBUF_RAW_ALLOC_CHECK(mbufs[idx]); + rte_pktmbuf_reset(mbufs[idx]); + idx++; + /* fall-through */ + case 2: + MBUF_RAW_ALLOC_CHECK(mbufs[idx]); + rte_pktmbuf_reset(mbufs[idx]); + idx++; + /* fall-through */ + case 1: + MBUF_RAW_ALLOC_CHECK(mbufs[idx]); + rte_pktmbuf_reset(mbufs[idx]); + idx++; + /* fall-through */ + } + } + return 0; +} + +/** + * Initialize shared data at the end of an external buffer before attaching + * to a mbuf by ``rte_pktmbuf_attach_extbuf()``. This is not a mandatory + * initialization but a helper function to simply spare a few bytes at the + * end of the buffer for shared data. If shared data is allocated + * separately, this should not be called but application has to properly + * initialize the shared data according to its need. + * + * Free callback and its argument is saved and the refcnt is set to 1. + * + * @warning + * The value of buf_len will be reduced to RTE_PTR_DIFF(shinfo, buf_addr) + * after this initialization. This shall be used for + * ``rte_pktmbuf_attach_extbuf()`` + * + * @param buf_addr + * The pointer to the external buffer. 
+ * @param [in,out] buf_len + * The pointer to length of the external buffer. Input value must be + * larger than the size of ``struct rte_mbuf_ext_shared_info`` and + * padding for alignment. If not enough, this function will return NULL. + * Adjusted buffer length will be returned through this pointer. + * @param free_cb + * Free callback function to call when the external buffer needs to be + * freed. + * @param fcb_opaque + * Argument for the free callback function. + * + * @return + * A pointer to the initialized shared data on success, return NULL + * otherwise. + */ +static inline struct rte_mbuf_ext_shared_info * +rte_pktmbuf_ext_shinfo_init_helper(void *buf_addr, uint16_t *buf_len, + rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque) +{ + struct rte_mbuf_ext_shared_info *shinfo; + void *buf_end = RTE_PTR_ADD(buf_addr, *buf_len); + void *addr; + + addr = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)), + sizeof(uintptr_t)); + if (addr <= buf_addr) + return NULL; + + shinfo = (struct rte_mbuf_ext_shared_info *)addr; + shinfo->free_cb = free_cb; + shinfo->fcb_opaque = fcb_opaque; + rte_mbuf_ext_refcnt_set(shinfo, 1); + + *buf_len = (uint16_t)RTE_PTR_DIFF(shinfo, buf_addr); + return shinfo; +} + +/** + * Attach an external buffer to a mbuf. + * + * User-managed anonymous buffer can be attached to an mbuf. When attaching + * it, corresponding free callback function and its argument should be + * provided via shinfo. This callback function will be called once all the + * mbufs are detached from the buffer (refcnt becomes zero). + * + * The headroom for the attaching mbuf will be set to zero and this can be + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()`` + * or ``rte_pktmbuf_reset_headroom()`` might be used. + * + * More mbufs can be attached to the same external buffer by + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by + * this API. + * + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or + * ``rte_pktmbuf_detach()``. + * + * Memory for shared data must be provided and user must initialize all of + * the content properly, escpecially free callback and refcnt. The pointer + * of shared data will be stored in m->shinfo. + * ``rte_pktmbuf_ext_shinfo_init_helper`` can help to simply spare a few + * bytes at the end of buffer for the shared data, store free callback and + * its argument and set the refcnt to 1. The following is an example: + * + * struct rte_mbuf_ext_shared_info *shinfo = + * rte_pktmbuf_ext_shinfo_init_helper(buf_addr, &buf_len, + * free_cb, fcb_arg); + * rte_pktmbuf_attach_extbuf(m, buf_addr, buf_iova, buf_len, shinfo); + * rte_pktmbuf_reset_headroom(m); + * rte_pktmbuf_adj(m, data_len); + * + * Attaching an external buffer is quite similar to mbuf indirection in + * replacing buffer addresses and length of a mbuf, but a few differences: + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be + * 2 as long as the direct mbuf itself isn't freed after the attachment. + * In such cases, the buffer area of a direct mbuf must be read-only. But + * external buffer has its own refcnt and it starts from 1. Unless + * multiple mbufs are attached to a mbuf having an external buffer, the + * external buffer is writable. + * - There's no need to allocate buffer from a mempool. Any buffer can be + * attached with appropriate free callback and its IO address. + * - Smaller metadata is required to maintain shared data such as refcnt. 
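/*
 * Editor's note: illustrative sketch, not part of the upstream patch. It
 * walks through the external-buffer flow described above: reserve room for
 * the shared info at the tail of a user buffer, attach the buffer to an
 * mbuf, then restore the headroom. Using rte_malloc()/rte_malloc_virt2iova()
 * is only one way to obtain a buffer and its IO address, and the free
 * callback here simply releases that buffer. rte_pktmbuf_attach_extbuf() is
 * experimental, so the translation unit is assumed to be built with
 * ALLOW_EXPERIMENTAL_API.
 */
#include <stdint.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>

static void
example_extbuf_free_cb(void *addr, void *opaque __rte_unused)
{
        rte_free(addr); /* invoked once the last mbuf detaches */
}

static int
example_attach_extbuf(struct rte_mbuf *m)
{
        uint16_t buf_len = 2048;
        void *buf = rte_malloc(NULL, buf_len, 0 /* default alignment */);
        struct rte_mbuf_ext_shared_info *shinfo;

        if (buf == NULL)
                return -1;

        /* Carve the shared info out of the tail of the buffer; buf_len is
         * shrunk to the space left in front of it. */
        shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
                        example_extbuf_free_cb, NULL);
        if (shinfo == NULL) {
                rte_free(buf);
                return -1;
        }

        rte_pktmbuf_attach_extbuf(m, buf, rte_malloc_virt2iova(buf),
                        buf_len, shinfo);
        /* Headroom is zero right after attachment; restore the default. */
        rte_pktmbuf_reset_headroom(m);
        return 0;
}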
+ * + * @warning + * @b EXPERIMENTAL: This API may change without prior notice. + * Once external buffer is enabled by allowing experimental API, + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer + * exclusive. A mbuf can be considered direct if it is neither indirect nor + * having external buffer. + * + * @param m + * The pointer to the mbuf. + * @param buf_addr + * The pointer to the external buffer. + * @param buf_iova + * IO address of the external buffer. + * @param buf_len + * The size of the external buffer. + * @param shinfo + * User-provided memory for shared data of the external buffer. + */ +static inline void __rte_experimental +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr, + rte_iova_t buf_iova, uint16_t buf_len, + struct rte_mbuf_ext_shared_info *shinfo) +{ + /* mbuf should not be read-only */ + RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1); + RTE_ASSERT(shinfo->free_cb != NULL); + + m->buf_addr = buf_addr; + m->buf_iova = buf_iova; + m->buf_len = buf_len; + + m->data_len = 0; + m->data_off = 0; + + m->ol_flags |= EXT_ATTACHED_MBUF; + m->shinfo = shinfo; +} + +/** + * Detach the external buffer attached to a mbuf, same as + * ``rte_pktmbuf_detach()`` + * + * @param m + * The mbuf having external buffer. + */ +#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m) + +/** + * Attach packet mbuf to another packet mbuf. + * + * If the mbuf we are attaching to isn't a direct buffer and is attached to + * an external buffer, the mbuf being attached will be attached to the + * external buffer instead of mbuf indirection. + * + * Otherwise, the mbuf will be indirectly attached. After attachment we + * refer the mbuf we attached as 'indirect', while mbuf we attached to as + * 'direct'. The direct mbuf's reference counter is incremented. + * + * Right now, not supported: + * - attachment for already indirect mbuf (e.g. - mi has to be direct). + * - mbuf we trying to attach (mi) is used by someone else + * e.g. it's reference counter is greater then 1. + * + * @param mi + * The indirect packet mbuf. + * @param m + * The packet mbuf we're attaching to. + */ +static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m) +{ + RTE_ASSERT(RTE_MBUF_DIRECT(mi) && + rte_mbuf_refcnt_read(mi) == 1); + + if (RTE_MBUF_HAS_EXTBUF(m)) { + rte_mbuf_ext_refcnt_update(m->shinfo, 1); + mi->ol_flags = m->ol_flags; + mi->shinfo = m->shinfo; + } else { + /* if m is not direct, get the mbuf that embeds the data */ + rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1); + mi->priv_size = m->priv_size; + mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; + } + + mi->buf_iova = m->buf_iova; + mi->buf_addr = m->buf_addr; + mi->buf_len = m->buf_len; + + mi->data_off = m->data_off; + mi->data_len = m->data_len; + mi->port = m->port; + mi->vlan_tci = m->vlan_tci; + mi->vlan_tci_outer = m->vlan_tci_outer; + mi->tx_offload = m->tx_offload; + mi->hash = m->hash; + + mi->next = NULL; + mi->pkt_len = mi->data_len; + mi->nb_segs = 1; + mi->packet_type = m->packet_type; + mi->timestamp = m->timestamp; + + __rte_mbuf_sanity_check(mi, 1); + __rte_mbuf_sanity_check(m, 0); +} + +/** + * @internal used by rte_pktmbuf_detach(). + * + * Decrement the reference counter of the external buffer. When the + * reference counter becomes 0, the buffer is freed by pre-registered + * callback. 
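/*
 * Editor's note: illustrative sketch, not part of the upstream patch. It
 * shows the indirect-attachment flow documented above: 'mi' comes to share
 * the data of the direct mbuf 'md', whose reference counter is bumped, and
 * freeing 'mi' detaches it and drops that reference again. Both mbufs are
 * assumed to come from pools created with rte_pktmbuf_pool_create().
 */
#include <rte_mbuf.h>

static void
example_share_data(struct rte_mbuf *md, struct rte_mempool *mp)
{
        struct rte_mbuf *mi = rte_pktmbuf_alloc(mp);

        if (mi == NULL)
                return;

        rte_pktmbuf_attach(mi, md);     /* md refcnt becomes 2 */

        /* ... read-only use of the shared data through mi ... */

        rte_pktmbuf_free(mi);           /* detaches and drops the reference */
}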
+ */ +static inline void +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m) +{ + RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m)); + RTE_ASSERT(m->shinfo != NULL); + + if (rte_mbuf_ext_refcnt_update(m->shinfo, -1) == 0) + m->shinfo->free_cb(m->buf_addr, m->shinfo->fcb_opaque); +} + +/** + * @internal used by rte_pktmbuf_detach(). + * + * Decrement the direct mbuf's reference counter. When the reference + * counter becomes 0, the direct mbuf is freed. + */ +static inline void +__rte_pktmbuf_free_direct(struct rte_mbuf *m) +{ + struct rte_mbuf *md; + + RTE_ASSERT(RTE_MBUF_INDIRECT(m)); + + md = rte_mbuf_from_indirect(m); + + if (rte_mbuf_refcnt_update(md, -1) == 0) { + md->next = NULL; + md->nb_segs = 1; + rte_mbuf_refcnt_set(md, 1); + rte_mbuf_raw_free(md); + } +} + +/** + * Detach a packet mbuf from external buffer or direct buffer. + * + * - decrement refcnt and free the external/direct buffer if refcnt + * becomes zero. + * - restore original mbuf address and length values. + * - reset pktmbuf data and data_len to their default values. + * + * All other fields of the given packet mbuf will be left intact. + * + * @param m + * The indirect attached packet mbuf. + */ +static inline void rte_pktmbuf_detach(struct rte_mbuf *m) +{ + struct rte_mempool *mp = m->pool; + uint32_t mbuf_size, buf_len; + uint16_t priv_size; + + if (RTE_MBUF_HAS_EXTBUF(m)) + __rte_pktmbuf_free_extbuf(m); + else + __rte_pktmbuf_free_direct(m); + + priv_size = rte_pktmbuf_priv_size(mp); + mbuf_size = (uint32_t)(sizeof(struct rte_mbuf) + priv_size); + buf_len = rte_pktmbuf_data_room_size(mp); + + m->priv_size = priv_size; + m->buf_addr = (char *)m + mbuf_size; + m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size; + m->buf_len = (uint16_t)buf_len; + rte_pktmbuf_reset_headroom(m); + m->data_len = 0; + m->ol_flags = 0; +} + +/** + * Decrease reference counter and unlink a mbuf segment + * + * This function does the same than a free, except that it does not + * return the segment to its pool. + * It decreases the reference counter, and if it reaches 0, it is + * detached from its parent for an indirect mbuf. + * + * @param m + * The mbuf to be unlinked + * @return + * - (m) if it is the last reference. It can be recycled or freed. + * - (NULL) if the mbuf still has remaining references on it. + */ +static __rte_always_inline struct rte_mbuf * +rte_pktmbuf_prefree_seg(struct rte_mbuf *m) +{ + __rte_mbuf_sanity_check(m, 0); + + if (likely(rte_mbuf_refcnt_read(m) == 1)) { + + if (!RTE_MBUF_DIRECT(m)) + rte_pktmbuf_detach(m); + + if (m->next != NULL) { + m->next = NULL; + m->nb_segs = 1; + } + + return m; + + } else if (__rte_mbuf_refcnt_update(m, -1) == 0) { + + if (!RTE_MBUF_DIRECT(m)) + rte_pktmbuf_detach(m); + + if (m->next != NULL) { + m->next = NULL; + m->nb_segs = 1; + } + rte_mbuf_refcnt_set(m, 1); + + return m; + } + return NULL; +} + +/* deprecated, replaced by rte_pktmbuf_prefree_seg() */ +__rte_deprecated +static inline struct rte_mbuf * +__rte_pktmbuf_prefree_seg(struct rte_mbuf *m) +{ + return rte_pktmbuf_prefree_seg(m); +} + +/** + * Free a segment of a packet mbuf into its original mempool. + * + * Free an mbuf, without parsing other segments in case of chained + * buffers. + * + * @param m + * The packet mbuf segment to be freed. + */ +static __rte_always_inline void +rte_pktmbuf_free_seg(struct rte_mbuf *m) +{ + m = rte_pktmbuf_prefree_seg(m); + if (likely(m != NULL)) + rte_mbuf_raw_free(m); +} + +/** + * Free a packet mbuf back into its original mempool. + * + * Free an mbuf, and all its segments in case of chained buffers. 
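/*
 * Editor's note: illustrative sketch, not part of the upstream patch. It
 * shows the recycle pattern that rte_pktmbuf_prefree_seg() enables and that
 * transmit completion paths commonly use: if the call returns the segment,
 * the caller owns it again and may return it to its pool with
 * rte_mbuf_raw_free(); if it returns NULL, other references still exist and
 * nothing more must be done with the mbuf.
 */
#include <rte_mbuf.h>

static void
example_free_tx_segment(struct rte_mbuf *m)
{
        struct rte_mbuf *owned = rte_pktmbuf_prefree_seg(m);

        if (owned != NULL)      /* last reference: safe to recycle */
                rte_mbuf_raw_free(owned);
        /* else: still referenced elsewhere, leave it alone */
}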
Each + * segment is added back into its original mempool. + * + * @param m + * The packet mbuf to be freed. If NULL, the function does nothing. + */ +static inline void rte_pktmbuf_free(struct rte_mbuf *m) +{ + struct rte_mbuf *m_next; + + if (m != NULL) + __rte_mbuf_sanity_check(m, 1); + + while (m != NULL) { + m_next = m->next; + rte_pktmbuf_free_seg(m); + m = m_next; + } +} + +/** + * Creates a "clone" of the given packet mbuf. + * + * Walks through all segments of the given packet mbuf, and for each of them: + * - Creates a new packet mbuf from the given pool. + * - Attaches newly created mbuf to the segment. + * Then updates pkt_len and nb_segs of the "clone" packet mbuf to match values + * from the original packet mbuf. + * + * @param md + * The packet mbuf to be cloned. + * @param mp + * The mempool from which the "clone" mbufs are allocated. + * @return + * - The pointer to the new "clone" mbuf on success. + * - NULL if allocation fails. + */ +static inline struct rte_mbuf *rte_pktmbuf_clone(struct rte_mbuf *md, + struct rte_mempool *mp) +{ + struct rte_mbuf *mc, *mi, **prev; + uint32_t pktlen; + uint16_t nseg; + + if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) + return NULL; + + mi = mc; + prev = &mi->next; + pktlen = md->pkt_len; + nseg = 0; + + do { + nseg++; + rte_pktmbuf_attach(mi, md); + *prev = mi; + prev = &mi->next; + } while ((md = md->next) != NULL && + (mi = rte_pktmbuf_alloc(mp)) != NULL); + + *prev = NULL; + mc->nb_segs = nseg; + mc->pkt_len = pktlen; + + /* Allocation of new indirect segment failed */ + if (unlikely (mi == NULL)) { + rte_pktmbuf_free(mc); + return NULL; + } + + __rte_mbuf_sanity_check(mc, 1); + return mc; +} + +/** + * Adds given value to the refcnt of all packet mbuf segments. + * + * Walks through all segments of given packet mbuf and for each of them + * invokes rte_mbuf_refcnt_update(). + * + * @param m + * The packet mbuf whose refcnt to be updated. + * @param v + * The value to add to the mbuf's segments refcnt. + */ +static inline void rte_pktmbuf_refcnt_update(struct rte_mbuf *m, int16_t v) +{ + __rte_mbuf_sanity_check(m, 1); + + do { + rte_mbuf_refcnt_update(m, v); + } while ((m = m->next) != NULL); +} + +/** + * Get the headroom in a packet mbuf. + * + * @param m + * The packet mbuf. + * @return + * The length of the headroom. + */ +static inline uint16_t rte_pktmbuf_headroom(const struct rte_mbuf *m) +{ + __rte_mbuf_sanity_check(m, 0); + return m->data_off; +} + +/** + * Get the tailroom of a packet mbuf. + * + * @param m + * The packet mbuf. + * @return + * The length of the tailroom. + */ +static inline uint16_t rte_pktmbuf_tailroom(const struct rte_mbuf *m) +{ + __rte_mbuf_sanity_check(m, 0); + return (uint16_t)(m->buf_len - rte_pktmbuf_headroom(m) - + m->data_len); +} + +/** + * Get the last segment of the packet. + * + * @param m + * The packet mbuf. + * @return + * The last segment of the given mbuf. + */ +static inline struct rte_mbuf *rte_pktmbuf_lastseg(struct rte_mbuf *m) +{ + __rte_mbuf_sanity_check(m, 1); + while (m->next != NULL) + m = m->next; + return m; +} + +/** + * A macro that points to an offset into the data in the mbuf. + * + * The returned pointer is cast to type t. Before using this + * function, the user must ensure that the first segment is large + * enough to accommodate its data. + * + * @param m + * The packet mbuf. + * @param o + * The offset into the mbuf data. + * @param t + * The type to cast the result into. 
+ */ +#define rte_pktmbuf_mtod_offset(m, t, o) \ + ((t)((char *)(m)->buf_addr + (m)->data_off + (o))) + +/** + * A macro that points to the start of the data in the mbuf. + * + * The returned pointer is cast to type t. Before using this + * function, the user must ensure that the first segment is large + * enough to accommodate its data. + * + * @param m + * The packet mbuf. + * @param t + * The type to cast the result into. + */ +#define rte_pktmbuf_mtod(m, t) rte_pktmbuf_mtod_offset(m, t, 0) + +/** + * A macro that returns the IO address that points to an offset of the + * start of the data in the mbuf + * + * @param m + * The packet mbuf. + * @param o + * The offset into the data to calculate address from. + */ +#define rte_pktmbuf_iova_offset(m, o) \ + (rte_iova_t)((m)->buf_iova + (m)->data_off + (o)) + +/* deprecated */ +#define rte_pktmbuf_mtophys_offset(m, o) \ + rte_pktmbuf_iova_offset(m, o) + +/** + * A macro that returns the IO address that points to the start of the + * data in the mbuf + * + * @param m + * The packet mbuf. + */ +#define rte_pktmbuf_iova(m) rte_pktmbuf_iova_offset(m, 0) + +/* deprecated */ +#define rte_pktmbuf_mtophys(m) rte_pktmbuf_iova(m) + +/** + * A macro that returns the length of the packet. + * + * The value can be read or assigned. + * + * @param m + * The packet mbuf. + */ +#define rte_pktmbuf_pkt_len(m) ((m)->pkt_len) + +/** + * A macro that returns the length of the segment. + * + * The value can be read or assigned. + * + * @param m + * The packet mbuf. + */ +#define rte_pktmbuf_data_len(m) ((m)->data_len) + +/** + * Prepend len bytes to an mbuf data area. + * + * Returns a pointer to the new + * data start address. If there is not enough headroom in the first + * segment, the function will return NULL, without modifying the mbuf. + * + * @param m + * The pkt mbuf. + * @param len + * The amount of data to prepend (in bytes). + * @return + * A pointer to the start of the newly prepended data, or + * NULL if there is not enough headroom space in the first segment + */ +static inline char *rte_pktmbuf_prepend(struct rte_mbuf *m, + uint16_t len) +{ + __rte_mbuf_sanity_check(m, 1); + + if (unlikely(len > rte_pktmbuf_headroom(m))) + return NULL; + + /* NB: elaborating the subtraction like this instead of using + * -= allows us to ensure the result type is uint16_t + * avoiding compiler warnings on gcc 8.1 at least */ + m->data_off = (uint16_t)(m->data_off - len); + m->data_len = (uint16_t)(m->data_len + len); + m->pkt_len = (m->pkt_len + len); + + return (char *)m->buf_addr + m->data_off; +} + +/** + * Append len bytes to an mbuf. + * + * Append len bytes to an mbuf and return a pointer to the start address + * of the added data. If there is not enough tailroom in the last + * segment, the function will return NULL, without modifying the mbuf. + * + * @param m + * The packet mbuf. + * @param len + * The amount of data to append (in bytes). 
+ * @return + * A pointer to the start of the newly appended data, or + * NULL if there is not enough tailroom space in the last segment + */ +static inline char *rte_pktmbuf_append(struct rte_mbuf *m, uint16_t len) +{ + void *tail; + struct rte_mbuf *m_last; + + __rte_mbuf_sanity_check(m, 1); + + m_last = rte_pktmbuf_lastseg(m); + if (unlikely(len > rte_pktmbuf_tailroom(m_last))) + return NULL; + + tail = (char *)m_last->buf_addr + m_last->data_off + m_last->data_len; + m_last->data_len = (uint16_t)(m_last->data_len + len); + m->pkt_len = (m->pkt_len + len); + return (char*) tail; +} + +/** + * Remove len bytes at the beginning of an mbuf. + * + * Returns a pointer to the start address of the new data area. If the + * length is greater than the length of the first segment, then the + * function will fail and return NULL, without modifying the mbuf. + * + * @param m + * The packet mbuf. + * @param len + * The amount of data to remove (in bytes). + * @return + * A pointer to the new start of the data. + */ +static inline char *rte_pktmbuf_adj(struct rte_mbuf *m, uint16_t len) +{ + __rte_mbuf_sanity_check(m, 1); + + if (unlikely(len > m->data_len)) + return NULL; + + /* NB: elaborating the addition like this instead of using + * += allows us to ensure the result type is uint16_t + * avoiding compiler warnings on gcc 8.1 at least */ + m->data_len = (uint16_t)(m->data_len - len); + m->data_off = (uint16_t)(m->data_off + len); + m->pkt_len = (m->pkt_len - len); + return (char *)m->buf_addr + m->data_off; +} + +/** + * Remove len bytes of data at the end of the mbuf. + * + * If the length is greater than the length of the last segment, the + * function will fail and return -1 without modifying the mbuf. + * + * @param m + * The packet mbuf. + * @param len + * The amount of data to remove (in bytes). + * @return + * - 0: On success. + * - -1: On error. + */ +static inline int rte_pktmbuf_trim(struct rte_mbuf *m, uint16_t len) +{ + struct rte_mbuf *m_last; + + __rte_mbuf_sanity_check(m, 1); + + m_last = rte_pktmbuf_lastseg(m); + if (unlikely(len > m_last->data_len)) + return -1; + + m_last->data_len = (uint16_t)(m_last->data_len - len); + m->pkt_len = (m->pkt_len - len); + return 0; +} + +/** + * Test if mbuf data is contiguous. + * + * @param m + * The packet mbuf. + * @return + * - 1, if all data is contiguous (one segment). + * - 0, if there is several segments. + */ +static inline int rte_pktmbuf_is_contiguous(const struct rte_mbuf *m) +{ + __rte_mbuf_sanity_check(m, 1); + return !!(m->nb_segs == 1); +} + +/** + * @internal used by rte_pktmbuf_read(). + */ +const void *__rte_pktmbuf_read(const struct rte_mbuf *m, uint32_t off, + uint32_t len, void *buf); + +/** + * Read len data bytes in a mbuf at specified offset. + * + * If the data is contiguous, return the pointer in the mbuf data, else + * copy the data in the buffer provided by the user and return its + * pointer. + * + * @param m + * The pointer to the mbuf. + * @param off + * The offset of the data in the mbuf. + * @param len + * The amount of bytes to read. + * @param buf + * The buffer where data is copied if it is not contiguous in mbuf + * data. Its length should be at least equal to the len parameter. + * @return + * The pointer to the data, either in the mbuf if it is contiguous, + * or in the user buffer. If mbuf is too small, NULL is returned. 
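/*
 * Editor's note: illustrative sketch, not part of the upstream patch. It
 * exercises the headroom/tailroom helpers documented above on a freshly
 * allocated mbuf: append a payload, prepend room for a 14-byte header, then
 * read the payload back. The sizes are arbitrary examples, and with a
 * single segment rte_pktmbuf_read() returns a pointer into the mbuf rather
 * than copying into the local buffer.
 */
#include <string.h>
#include <rte_mbuf.h>

static int
example_build_frame(struct rte_mempool *mp)
{
        struct rte_mbuf *m = rte_pktmbuf_alloc(mp);
        char local[64];
        const void *p;
        char *payload, *hdr;

        if (m == NULL)
                return -1;

        payload = rte_pktmbuf_append(m, 64);    /* reserve 64 payload bytes */
        hdr = rte_pktmbuf_prepend(m, 14);       /* room for an L2 header */
        if (payload == NULL || hdr == NULL) {
                rte_pktmbuf_free(m);
                return -1;
        }
        memset(payload, 0xab, 64);
        memset(hdr, 0, 14);

        /* Offset 14, length 64 is fully inside the first segment here. */
        p = rte_pktmbuf_read(m, 14, 64, local);

        rte_pktmbuf_free(m);
        return p != NULL ? 0 : -1;
}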
+ */ +static inline const void *rte_pktmbuf_read(const struct rte_mbuf *m, + uint32_t off, uint32_t len, void *buf) +{ + if (likely(off + len <= rte_pktmbuf_data_len(m))) + return rte_pktmbuf_mtod_offset(m, char *, off); + else + return __rte_pktmbuf_read(m, off, len, buf); +} + +/** + * Chain an mbuf to another, thereby creating a segmented packet. + * + * Note: The implementation will do a linear walk over the segments to find + * the tail entry. For cases when there are many segments, it's better to + * chain the entries manually. + * + * @param head + * The head of the mbuf chain (the first packet) + * @param tail + * The mbuf to put last in the chain + * + * @return + * - 0, on success. + * - -EOVERFLOW, if the chain segment limit exceeded + */ +static inline int rte_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *tail) +{ + struct rte_mbuf *cur_tail; + + /* Check for number-of-segments-overflow */ + if (head->nb_segs + tail->nb_segs > RTE_MBUF_MAX_NB_SEGS) + return -EOVERFLOW; + + /* Chain 'tail' onto the old tail */ + cur_tail = rte_pktmbuf_lastseg(head); + cur_tail->next = tail; + + /* accumulate number of segments and total length. + * NB: elaborating the addition like this instead of using + * -= allows us to ensure the result type is uint16_t + * avoiding compiler warnings on gcc 8.1 at least */ + head->nb_segs = (uint16_t)(head->nb_segs + tail->nb_segs); + head->pkt_len += tail->pkt_len; + + /* pkt_len is only set in the head */ + tail->pkt_len = tail->data_len; + + return 0; +} + +/** + * Validate general requirements for Tx offload in mbuf. + * + * This function checks correctness and completeness of Tx offload settings. + * + * @param m + * The packet mbuf to be validated. + * @return + * 0 if packet is valid + */ +static inline int +rte_validate_tx_offload(const struct rte_mbuf *m) +{ + uint64_t ol_flags = m->ol_flags; + uint64_t inner_l3_offset = m->l2_len; + + /* Does packet set any of available offloads? */ + if (!(ol_flags & PKT_TX_OFFLOAD_MASK)) + return 0; + + if (ol_flags & PKT_TX_OUTER_IP_CKSUM) + /* NB: elaborating the addition like this instead of using + * += gives the result uint64_t type instead of int, + * avoiding compiler warnings on gcc 8.1 at least */ + inner_l3_offset = inner_l3_offset + m->outer_l2_len + + m->outer_l3_len; + + /* Headers are fragmented */ + if (rte_pktmbuf_data_len(m) < inner_l3_offset + m->l3_len + m->l4_len) + return -ENOTSUP; + + /* IP checksum can be counted only for IPv4 packet */ + if ((ol_flags & PKT_TX_IP_CKSUM) && (ol_flags & PKT_TX_IPV6)) + return -EINVAL; + + /* IP type not set when required */ + if (ol_flags & (PKT_TX_L4_MASK | PKT_TX_TCP_SEG)) + if (!(ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6))) + return -EINVAL; + + /* Check requirements for TSO packet */ + if (ol_flags & PKT_TX_TCP_SEG) + if ((m->tso_segsz == 0) || + ((ol_flags & PKT_TX_IPV4) && + !(ol_flags & PKT_TX_IP_CKSUM))) + return -EINVAL; + + /* PKT_TX_OUTER_IP_CKSUM set for non outer IPv4 packet. */ + if ((ol_flags & PKT_TX_OUTER_IP_CKSUM) && + !(ol_flags & PKT_TX_OUTER_IPV4)) + return -EINVAL; + + return 0; +} + +/** + * Linearize data in mbuf. + * + * This function moves the mbuf data in the first segment if there is enough + * tailroom. The subsequent segments are unchained and freed. 
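/*
 * Editor's note: illustrative sketch, not part of the upstream patch. It
 * chains two single-segment mbufs with rte_pktmbuf_chain() and then, when a
 * contiguous view is needed, flattens the result with
 * rte_pktmbuf_linearize(); both calls are the ones documented in this
 * header, and the error handling shown is just one plausible policy.
 */
#include <rte_mbuf.h>

static struct rte_mbuf *
example_chain_and_flatten(struct rte_mbuf *head, struct rte_mbuf *tail)
{
        if (rte_pktmbuf_chain(head, tail) != 0) {
                /* Would exceed RTE_MBUF_MAX_NB_SEGS; caller keeps both. */
                return NULL;
        }

        /* Optional: merge the segments back into the first one when the
         * consumer cannot handle a segmented packet. */
        if (rte_pktmbuf_linearize(head) != 0)
                return NULL;    /* not enough tailroom in the first segment */

        return head;
}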
+ * + * @param mbuf + * mbuf to linearize + * @return + * - 0, on success + * - -1, on error + */ +static inline int +rte_pktmbuf_linearize(struct rte_mbuf *mbuf) +{ + size_t seg_len, copy_len; + struct rte_mbuf *m; + struct rte_mbuf *m_next; + char *buffer; + + if (rte_pktmbuf_is_contiguous(mbuf)) + return 0; + + /* Extend first segment to the total packet length */ + copy_len = rte_pktmbuf_pkt_len(mbuf) - rte_pktmbuf_data_len(mbuf); + + if (unlikely(copy_len > rte_pktmbuf_tailroom(mbuf))) + return -1; + + buffer = rte_pktmbuf_mtod_offset(mbuf, char *, mbuf->data_len); + mbuf->data_len = (uint16_t)(mbuf->pkt_len); + + /* Append data from next segments to the first one */ + m = mbuf->next; + while (m != NULL) { + m_next = m->next; + + seg_len = rte_pktmbuf_data_len(m); + rte_memcpy(buffer, rte_pktmbuf_mtod(m, char *), seg_len); + buffer += seg_len; + + rte_pktmbuf_free_seg(m); + m = m_next; + } + + mbuf->next = NULL; + mbuf->nb_segs = 1; + + return 0; +} + +/** + * Dump an mbuf structure to a file. + * + * Dump all fields for the given packet mbuf and all its associated + * segments (in the case of a chained buffer). + * + * @param f + * A pointer to a file for output + * @param m + * The packet mbuf. + * @param dump_len + * If dump_len != 0, also dump the "dump_len" first data bytes of + * the packet. + */ +void rte_pktmbuf_dump(FILE *f, const struct rte_mbuf *m, unsigned dump_len); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MBUF_H_ */ diff --git a/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.c b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.c new file mode 100644 index 00000000..5722976f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.c @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 NXP + */ + +#include +#include +#include +#include +#include +#include + +int +rte_mbuf_set_platform_mempool_ops(const char *ops_name) +{ + const struct rte_memzone *mz; + + if (strlen(ops_name) >= RTE_MEMPOOL_OPS_NAMESIZE) + return -ENAMETOOLONG; + + mz = rte_memzone_lookup("mbuf_platform_pool_ops"); + if (mz == NULL) { + mz = rte_memzone_reserve("mbuf_platform_pool_ops", + RTE_MEMPOOL_OPS_NAMESIZE, SOCKET_ID_ANY, 0); + if (mz == NULL) + return -rte_errno; + strcpy(mz->addr, ops_name); + return 0; + } else if (strcmp(mz->addr, ops_name) == 0) { + return 0; + } + + RTE_LOG(ERR, MBUF, + "%s is already registered as platform mbuf pool ops\n", + (char *)mz->addr); + return -EEXIST; +} + +const char * +rte_mbuf_platform_mempool_ops(void) +{ + const struct rte_memzone *mz; + + mz = rte_memzone_lookup("mbuf_platform_pool_ops"); + if (mz == NULL) + return NULL; + return mz->addr; +} + +int +rte_mbuf_set_user_mempool_ops(const char *ops_name) +{ + const struct rte_memzone *mz; + + if (strlen(ops_name) >= RTE_MEMPOOL_OPS_NAMESIZE) + return -ENAMETOOLONG; + + mz = rte_memzone_lookup("mbuf_user_pool_ops"); + if (mz == NULL) { + mz = rte_memzone_reserve("mbuf_user_pool_ops", + RTE_MEMPOOL_OPS_NAMESIZE, SOCKET_ID_ANY, 0); + if (mz == NULL) + return -rte_errno; + } + + strcpy(mz->addr, ops_name); + return 0; + +} + +const char * +rte_mbuf_user_mempool_ops(void) +{ + const struct rte_memzone *mz; + + mz = rte_memzone_lookup("mbuf_user_pool_ops"); + if (mz == NULL) + return rte_eal_mbuf_user_pool_ops(); + return mz->addr; +} + +/* Return mbuf pool ops name */ +const char * +rte_mbuf_best_mempool_ops(void) +{ + /* User defined mempool ops takes the priority */ + const char *best_ops = rte_mbuf_user_mempool_ops(); + if (best_ops) + return best_ops; + + /* Next 
choice is platform configured mempool ops */ + best_ops = rte_mbuf_platform_mempool_ops(); + if (best_ops) + return best_ops; + + /* Last choice is to use the compile time config pool */ + return RTE_MBUF_DEFAULT_MEMPOOL_OPS; +} diff --git a/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.h b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.h new file mode 100644 index 00000000..7ed95a49 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_pool_ops.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 NXP + */ + +#ifndef _RTE_MBUF_POOL_OPS_H_ +#define _RTE_MBUF_POOL_OPS_H_ + +/** + * @file + * RTE Mbuf Pool Ops + * + * These APIs are for configuring the mbuf pool ops names to be largely used by + * rte_pktmbuf_pool_create(). However, this can also be used to set and inquire + * the best mempool ops available. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Set the platform supported pktmbuf HW mempool ops name + * + * This function allow the HW to register the actively supported HW mempool + * ops_name. Only one HW mempool ops can be registered at any point of time. + * + * @param ops_name + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_mbuf_set_platform_mempool_ops(const char *ops_name); + +/** + * Get configured platform supported pktmbuf HW mempool ops name + * + * This function returns the platform supported mempool ops name. + * + * @return + * - On success, platform pool ops name. + * - On failure, NULL. + */ +const char * +rte_mbuf_platform_mempool_ops(void); + +/** + * Set the user preferred pktmbuf mempool ops name + * + * This function can be used by the user to configure user preferred + * mempool ops name. + * + * @param ops_name + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_mbuf_set_user_mempool_ops(const char *ops_name); + +/** + * Get user preferred pool ops name for mbuf + * + * This function returns the user configured mempool ops name. + * + * @return + * - On success, user pool ops name.. + * - On failure, NULL. + */ +const char * +rte_mbuf_user_mempool_ops(void); + +/** + * Get the best mempool ops name for pktmbuf. + * + * This function is used to determine the best options for mempool ops for + * pktmbuf allocations. Following are the priority order: + * 1. User defined, 2. Platform HW supported, 3. Compile time configured. + * This function is also used by the rte_pktmbuf_pool_create to get the best + * mempool ops name. + * + * @return + * returns preferred mbuf pool ops name + */ +const char * +rte_mbuf_best_mempool_ops(void); + + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MBUF_POOL_OPS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.c b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.c new file mode 100644 index 00000000..d7835e28 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.c @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. 
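/*
 * Editor's note: illustrative sketch, not part of the upstream patch. It
 * shows how the pool-ops helpers above are meant to combine: an application
 * may record a preferred ops name, and rte_mbuf_best_mempool_ops() then
 * resolves the user choice, the platform choice or the compile-time default,
 * which can be passed to rte_pktmbuf_pool_create_by_ops(). The "ring_mp_mc"
 * name and the pool parameters are arbitrary examples.
 */
#include <rte_mbuf.h>
#include <rte_mbuf_pool_ops.h>

static struct rte_mempool *
example_pool_with_best_ops(void)
{
        /* Optional: express an application preference first. */
        (void)rte_mbuf_set_user_mempool_ops("ring_mp_mc");

        return rte_pktmbuf_pool_create_by_ops("example_ops_pool", 8191, 256,
                        0, RTE_MBUF_DEFAULT_BUF_SIZE, SOCKET_ID_ANY,
                        rte_mbuf_best_mempool_ops());
}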
+ */ + +#include + +#include +#include + +/* get the name of the l2 packet type */ +const char *rte_get_ptype_l2_name(uint32_t ptype) +{ + switch (ptype & RTE_PTYPE_L2_MASK) { + case RTE_PTYPE_L2_ETHER: return "L2_ETHER"; + case RTE_PTYPE_L2_ETHER_TIMESYNC: return "L2_ETHER_TIMESYNC"; + case RTE_PTYPE_L2_ETHER_ARP: return "L2_ETHER_ARP"; + case RTE_PTYPE_L2_ETHER_LLDP: return "L2_ETHER_LLDP"; + case RTE_PTYPE_L2_ETHER_NSH: return "L2_ETHER_NSH"; + case RTE_PTYPE_L2_ETHER_VLAN: return "L2_ETHER_VLAN"; + case RTE_PTYPE_L2_ETHER_QINQ: return "L2_ETHER_QINQ"; + case RTE_PTYPE_L2_ETHER_PPPOE: return "L2_ETHER_PPPOE"; + default: return "L2_UNKNOWN"; + } +} + +/* get the name of the l3 packet type */ +const char *rte_get_ptype_l3_name(uint32_t ptype) +{ + switch (ptype & RTE_PTYPE_L3_MASK) { + case RTE_PTYPE_L3_IPV4: return "L3_IPV4"; + case RTE_PTYPE_L3_IPV4_EXT: return "L3_IPV4_EXT"; + case RTE_PTYPE_L3_IPV6: return "L3_IPV6"; + case RTE_PTYPE_L3_IPV4_EXT_UNKNOWN: return "L3_IPV4_EXT_UNKNOWN"; + case RTE_PTYPE_L3_IPV6_EXT: return "L3_IPV6_EXT"; + case RTE_PTYPE_L3_IPV6_EXT_UNKNOWN: return "L3_IPV6_EXT_UNKNOWN"; + default: return "L3_UNKNOWN"; + } +} + +/* get the name of the l4 packet type */ +const char *rte_get_ptype_l4_name(uint32_t ptype) +{ + switch (ptype & RTE_PTYPE_L4_MASK) { + case RTE_PTYPE_L4_TCP: return "L4_TCP"; + case RTE_PTYPE_L4_UDP: return "L4_UDP"; + case RTE_PTYPE_L4_FRAG: return "L4_FRAG"; + case RTE_PTYPE_L4_SCTP: return "L4_SCTP"; + case RTE_PTYPE_L4_ICMP: return "L4_ICMP"; + case RTE_PTYPE_L4_NONFRAG: return "L4_NONFRAG"; + default: return "L4_UNKNOWN"; + } +} + +/* get the name of the tunnel packet type */ +const char *rte_get_ptype_tunnel_name(uint32_t ptype) +{ + switch (ptype & RTE_PTYPE_TUNNEL_MASK) { + case RTE_PTYPE_TUNNEL_IP: return "TUNNEL_IP"; + case RTE_PTYPE_TUNNEL_GRE: return "TUNNEL_GRE"; + case RTE_PTYPE_TUNNEL_VXLAN: return "TUNNEL_VXLAN"; + case RTE_PTYPE_TUNNEL_NVGRE: return "TUNNEL_NVGRE"; + case RTE_PTYPE_TUNNEL_GENEVE: return "TUNNEL_GENEVE"; + case RTE_PTYPE_TUNNEL_GRENAT: return "TUNNEL_GRENAT"; + case RTE_PTYPE_TUNNEL_GTPC: return "TUNNEL_GTPC"; + case RTE_PTYPE_TUNNEL_GTPU: return "TUNNEL_GTPU"; + case RTE_PTYPE_TUNNEL_ESP: return "TUNNEL_ESP"; + case RTE_PTYPE_TUNNEL_L2TP: return "TUNNEL_L2TP"; + case RTE_PTYPE_TUNNEL_VXLAN_GPE: return "TUNNEL_VXLAN_GPE"; + case RTE_PTYPE_TUNNEL_MPLS_IN_UDP: return "TUNNEL_MPLS_IN_UDP"; + case RTE_PTYPE_TUNNEL_MPLS_IN_GRE: return "TUNNEL_MPLS_IN_GRE"; + default: return "TUNNEL_UNKNOWN"; + } +} + +/* get the name of the inner_l2 packet type */ +const char *rte_get_ptype_inner_l2_name(uint32_t ptype) +{ + switch (ptype & RTE_PTYPE_INNER_L2_MASK) { + case RTE_PTYPE_INNER_L2_ETHER: return "INNER_L2_ETHER"; + case RTE_PTYPE_INNER_L2_ETHER_VLAN: return "INNER_L2_ETHER_VLAN"; + case RTE_PTYPE_INNER_L2_ETHER_QINQ: return "INNER_L2_ETHER_QINQ"; + default: return "INNER_L2_UNKNOWN"; + } +} + +/* get the name of the inner_l3 packet type */ +const char *rte_get_ptype_inner_l3_name(uint32_t ptype) +{ + switch (ptype & RTE_PTYPE_INNER_L3_MASK) { + case RTE_PTYPE_INNER_L3_IPV4: return "INNER_L3_IPV4"; + case RTE_PTYPE_INNER_L3_IPV4_EXT: return "INNER_L3_IPV4_EXT"; + case RTE_PTYPE_INNER_L3_IPV6: return "INNER_L3_IPV6"; + case RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN: + return "INNER_L3_IPV4_EXT_UNKNOWN"; + case RTE_PTYPE_INNER_L3_IPV6_EXT: return "INNER_L3_IPV6_EXT"; + case RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN: + return "INNER_L3_IPV6_EXT_UNKNOWN"; + default: return "INNER_L3_UNKNOWN"; + } +} + +/* get the name of the inner_l4 packet 
type */ +const char *rte_get_ptype_inner_l4_name(uint32_t ptype) +{ + switch (ptype & RTE_PTYPE_INNER_L4_MASK) { + case RTE_PTYPE_INNER_L4_TCP: return "INNER_L4_TCP"; + case RTE_PTYPE_INNER_L4_UDP: return "INNER_L4_UDP"; + case RTE_PTYPE_INNER_L4_FRAG: return "INNER_L4_FRAG"; + case RTE_PTYPE_INNER_L4_SCTP: return "INNER_L4_SCTP"; + case RTE_PTYPE_INNER_L4_ICMP: return "INNER_L4_ICMP"; + case RTE_PTYPE_INNER_L4_NONFRAG: return "INNER_L4_NONFRAG"; + default: return "INNER_L4_UNKNOWN"; + } +} + +/* write the packet type name into the buffer */ +int rte_get_ptype_name(uint32_t ptype, char *buf, size_t buflen) +{ + int ret; + + if (buflen == 0) + return -1; + + buf[0] = '\0'; + if ((ptype & RTE_PTYPE_ALL_MASK) == RTE_PTYPE_UNKNOWN) { + ret = snprintf(buf, buflen, "UNKNOWN"); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + return 0; + } + + if ((ptype & RTE_PTYPE_L2_MASK) != 0) { + ret = snprintf(buf, buflen, "%s ", + rte_get_ptype_l2_name(ptype)); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + if ((ptype & RTE_PTYPE_L3_MASK) != 0) { + ret = snprintf(buf, buflen, "%s ", + rte_get_ptype_l3_name(ptype)); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + if ((ptype & RTE_PTYPE_L4_MASK) != 0) { + ret = snprintf(buf, buflen, "%s ", + rte_get_ptype_l4_name(ptype)); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + if ((ptype & RTE_PTYPE_TUNNEL_MASK) != 0) { + ret = snprintf(buf, buflen, "%s ", + rte_get_ptype_tunnel_name(ptype)); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + if ((ptype & RTE_PTYPE_INNER_L2_MASK) != 0) { + ret = snprintf(buf, buflen, "%s ", + rte_get_ptype_inner_l2_name(ptype)); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + if ((ptype & RTE_PTYPE_INNER_L3_MASK) != 0) { + ret = snprintf(buf, buflen, "%s ", + rte_get_ptype_inner_l3_name(ptype)); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + if ((ptype & RTE_PTYPE_INNER_L4_MASK) != 0) { + ret = snprintf(buf, buflen, "%s ", + rte_get_ptype_inner_l4_name(ptype)); + if (ret < 0) + return -1; + if ((size_t)ret >= buflen) + return -1; + buf += ret; + buflen -= ret; + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.h b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.h new file mode 100644 index 00000000..01acc66e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_ptype.h @@ -0,0 +1,758 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2016 Intel Corporation. + * Copyright 2014-2016 6WIND S.A. + */ + +#ifndef _RTE_MBUF_PTYPE_H_ +#define _RTE_MBUF_PTYPE_H_ + +/** + * @file + * RTE Mbuf Packet Types + * + * This file contains declarations for features related to mbuf packet + * types. The packet type gives information about the data carried by the + * mbuf, and is stored in the mbuf in a 32 bits field. + * + * The 32 bits are divided into several fields to mark packet types. Note that + * each field is indexical. + * - Bit 3:0 is for L2 types. + * - Bit 7:4 is for L3 or outer L3 (for tunneling case) types. + * - Bit 11:8 is for L4 or outer L4 (for tunneling case) types. + * - Bit 15:12 is for tunnel types. + * - Bit 19:16 is for inner L2 types. + * - Bit 23:20 is for inner L3 types. + * - Bit 27:24 is for inner L4 types. 
+ * - Bit 31:28 is reserved. + * + * To be compatible with Vector PMD, RTE_PTYPE_L3_IPV4, RTE_PTYPE_L3_IPV4_EXT, + * RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV6_EXT, RTE_PTYPE_L4_TCP, RTE_PTYPE_L4_UDP + * and RTE_PTYPE_L4_SCTP should be kept as below in a contiguous 7 bits. + * + * Note that L3 types values are selected for checking IPV4/IPV6 header from + * performance point of view. Reading annotations of RTE_ETH_IS_IPV4_HDR and + * RTE_ETH_IS_IPV6_HDR is needed for any future changes of L3 type values. + * + * Note that the packet types of the same packet recognized by different + * hardware may be different, as different hardware may have different + * capability of packet type recognition. + * + * examples: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=0x29 + * | 'version'=6, 'next header'=0x3A + * | 'ICMPv6 header'> + * will be recognized on i40e hardware as packet type combination of, + * RTE_PTYPE_L2_ETHER | + * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + * RTE_PTYPE_TUNNEL_IP | + * RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + * RTE_PTYPE_INNER_L4_ICMP. + * + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=0x2F + * | 'GRE header' + * | 'version'=6, 'next header'=0x11 + * | 'UDP header'> + * will be recognized on i40e hardware as packet type combination of, + * RTE_PTYPE_L2_ETHER | + * RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + * RTE_PTYPE_TUNNEL_GRENAT | + * RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + * RTE_PTYPE_INNER_L4_UDP. + */ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * No packet type information. + */ +#define RTE_PTYPE_UNKNOWN 0x00000000 +/** + * Ethernet packet type. + * It is used for outer packet for tunneling cases. + * + * Packet format: + * <'ether type'=[0x0800|0x86DD]> + */ +#define RTE_PTYPE_L2_ETHER 0x00000001 +/** + * Ethernet packet type for time sync. + * + * Packet format: + * <'ether type'=0x88F7> + */ +#define RTE_PTYPE_L2_ETHER_TIMESYNC 0x00000002 +/** + * ARP (Address Resolution Protocol) packet type. + * + * Packet format: + * <'ether type'=0x0806> + */ +#define RTE_PTYPE_L2_ETHER_ARP 0x00000003 +/** + * LLDP (Link Layer Discovery Protocol) packet type. + * + * Packet format: + * <'ether type'=0x88CC> + */ +#define RTE_PTYPE_L2_ETHER_LLDP 0x00000004 +/** + * NSH (Network Service Header) packet type. + * + * Packet format: + * <'ether type'=0x894F> + */ +#define RTE_PTYPE_L2_ETHER_NSH 0x00000005 +/** + * VLAN packet type. + * + * Packet format: + * <'ether type'=[0x8100]> + */ +#define RTE_PTYPE_L2_ETHER_VLAN 0x00000006 +/** + * QinQ packet type. + * + * Packet format: + * <'ether type'=[0x88A8]> + */ +#define RTE_PTYPE_L2_ETHER_QINQ 0x00000007 +/** + * PPPOE packet type. + * + * Packet format: + * <'ether type'=[0x8863|0x8864]> + */ +#define RTE_PTYPE_L2_ETHER_PPPOE 0x00000008 +/** + * Mask of layer 2 packet types. + * It is used for outer packet for tunneling cases. + */ +#define RTE_PTYPE_L2_MASK 0x0000000f +/** + * IP (Internet Protocol) version 4 packet type. + * It is used for outer packet for tunneling cases, and does not contain any + * header option. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'ihl'=5> + */ +#define RTE_PTYPE_L3_IPV4 0x00000010 +/** + * IP (Internet Protocol) version 4 packet type. + * It is used for outer packet for tunneling cases, and contains header + * options. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'ihl'=[6-15], 'options'> + */ +#define RTE_PTYPE_L3_IPV4_EXT 0x00000030 +/** + * IP (Internet Protocol) version 6 packet type. 
+ * It is used for outer packet for tunneling cases, and does not contain any + * extension header. + * + * Packet format: + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=0x3B> + */ +#define RTE_PTYPE_L3_IPV6 0x00000040 +/** + * IP (Internet Protocol) version 4 packet type. + * It is used for outer packet for tunneling cases, and may or maynot contain + * header options. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'ihl'=[5-15], <'options'>> + */ +#define RTE_PTYPE_L3_IPV4_EXT_UNKNOWN 0x00000090 +/** + * IP (Internet Protocol) version 6 packet type. + * It is used for outer packet for tunneling cases, and contains extension + * headers. + * + * Packet format: + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=[0x0|0x2B|0x2C|0x32|0x33|0x3C|0x87], + * 'extension headers'> + */ +#define RTE_PTYPE_L3_IPV6_EXT 0x000000c0 +/** + * IP (Internet Protocol) version 6 packet type. + * It is used for outer packet for tunneling cases, and may or maynot contain + * extension headers. + * + * Packet format: + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=[0x3B|0x0|0x2B|0x2C|0x32|0x33|0x3C|0x87], + * <'extension headers'>> + */ +#define RTE_PTYPE_L3_IPV6_EXT_UNKNOWN 0x000000e0 +/** + * Mask of layer 3 packet types. + * It is used for outer packet for tunneling cases. + */ +#define RTE_PTYPE_L3_MASK 0x000000f0 +/** + * TCP (Transmission Control Protocol) packet type. + * It is used for outer packet for tunneling cases. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=6, 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=6> + */ +#define RTE_PTYPE_L4_TCP 0x00000100 +/** + * UDP (User Datagram Protocol) packet type. + * It is used for outer packet for tunneling cases. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17, 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17> + */ +#define RTE_PTYPE_L4_UDP 0x00000200 +/** + * Fragmented IP (Internet Protocol) packet type. + * It is used for outer packet for tunneling cases. + * + * It refers to those packets of any IP types, which can be recognized as + * fragmented. A fragmented packet cannot be recognized as any other L4 types + * (RTE_PTYPE_L4_TCP, RTE_PTYPE_L4_UDP, RTE_PTYPE_L4_SCTP, RTE_PTYPE_L4_ICMP, + * RTE_PTYPE_L4_NONFRAG). + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'MF'=1> + * or, + * <'ether type'=0x0800 + * | 'version'=4, 'frag_offset'!=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=44> + */ +#define RTE_PTYPE_L4_FRAG 0x00000300 +/** + * SCTP (Stream Control Transmission Protocol) packet type. + * It is used for outer packet for tunneling cases. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=132, 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=132> + */ +#define RTE_PTYPE_L4_SCTP 0x00000400 +/** + * ICMP (Internet Control Message Protocol) packet type. + * It is used for outer packet for tunneling cases. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=1, 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=1> + */ +#define RTE_PTYPE_L4_ICMP 0x00000500 +/** + * Non-fragmented IP (Internet Protocol) packet type. + * It is used for outer packet for tunneling cases. 
+ * + * It refers to those packets of any IP types, while cannot be recognized as + * any of above L4 types (RTE_PTYPE_L4_TCP, RTE_PTYPE_L4_UDP, + * RTE_PTYPE_L4_FRAG, RTE_PTYPE_L4_SCTP, RTE_PTYPE_L4_ICMP). + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'!=[6|17|132|1], 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'!=[6|17|44|132|1]> + */ +#define RTE_PTYPE_L4_NONFRAG 0x00000600 +/** + * Mask of layer 4 packet types. + * It is used for outer packet for tunneling cases. + */ +#define RTE_PTYPE_L4_MASK 0x00000f00 +/** + * IP (Internet Protocol) in IP (Internet Protocol) tunneling packet type. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=[4|41]> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=[4|41]> + */ +#define RTE_PTYPE_TUNNEL_IP 0x00001000 +/** + * GRE (Generic Routing Encapsulation) tunneling packet type. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=47> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=47> + */ +#define RTE_PTYPE_TUNNEL_GRE 0x00002000 +/** + * VXLAN (Virtual eXtensible Local Area Network) tunneling packet type. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17 + * | 'destination port'=4789> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17 + * | 'destination port'=4789> + */ +#define RTE_PTYPE_TUNNEL_VXLAN 0x00003000 +/** + * NVGRE (Network Virtualization using Generic Routing Encapsulation) tunneling + * packet type. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=47 + * | 'protocol type'=0x6558> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=47 + * | 'protocol type'=0x6558'> + */ +#define RTE_PTYPE_TUNNEL_NVGRE 0x00004000 +/** + * GENEVE (Generic Network Virtualization Encapsulation) tunneling packet type. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17 + * | 'destination port'=6081> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17 + * | 'destination port'=6081> + */ +#define RTE_PTYPE_TUNNEL_GENEVE 0x00005000 +/** + * Tunneling packet type of Teredo, VXLAN (Virtual eXtensible Local Area + * Network) or GRE (Generic Routing Encapsulation) could be recognized as this + * packet type, if they can not be recognized independently as of hardware + * capability. + */ +#define RTE_PTYPE_TUNNEL_GRENAT 0x00006000 +/** + * GTP-C (GPRS Tunnelling Protocol) control tunneling packet type. + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17 + * | 'destination port'=2123> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17 + * | 'destination port'=2123> + * or, + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17 + * | 'source port'=2123> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17 + * | 'source port'=2123> + */ +#define RTE_PTYPE_TUNNEL_GTPC 0x00007000 +/** + * GTP-U (GPRS Tunnelling Protocol) user data tunneling packet type. + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17 + * | 'destination port'=2152> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17 + * | 'destination port'=2152> + */ +#define RTE_PTYPE_TUNNEL_GTPU 0x00008000 +/** + * ESP (IP Encapsulating Security Payload) tunneling packet type. 
+ * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=51> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=51> + */ +#define RTE_PTYPE_TUNNEL_ESP 0x00009000 +/** + * L2TP (Layer 2 Tunneling Protocol) tunnleing packet type. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17> + * | 'destination port'=1701> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17 + * | 'destination port'=1701> + * or, + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=115> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'protocol'=115> + */ +#define RTE_PTYPE_TUNNEL_L2TP 0x0000a000 +/** + * VXLAN-GPE (VXLAN Generic Protocol Extension) tunneling packet type. + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17 + * | 'destination port'=4790> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17 + * | 'destination port'=4790> + */ +#define RTE_PTYPE_TUNNEL_VXLAN_GPE 0x0000b000 +/** + * MPLS-in-GRE tunneling packet type (RFC 4023). + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=47 + * | 'protocol'=0x8847> + * or, + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=47 + * | 'protocol'=0x8848> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'protocol'=47 + * | 'protocol'=0x8847> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=47 + * | 'protocol'=0x8848> + */ +#define RTE_PTYPE_TUNNEL_MPLS_IN_GRE 0x0000c000 +/** + * MPLS-in-UDP tunneling packet type (RFC 7510). + * + * Packet format: + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17 + * | 'destination port'=6635> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17 + * | 'destination port'=6635> + */ +#define RTE_PTYPE_TUNNEL_MPLS_IN_UDP 0x0000d000 +/** + * Mask of tunneling packet types. + */ +#define RTE_PTYPE_TUNNEL_MASK 0x0000f000 +/** + * Ethernet packet type. + * It is used for inner packet type only. + * + * Packet format (inner only): + * <'ether type'=[0x800|0x86DD]> + */ +#define RTE_PTYPE_INNER_L2_ETHER 0x00010000 +/** + * Ethernet packet type with VLAN (Virtual Local Area Network) tag. + * + * Packet format (inner only): + * <'ether type'=[0x800|0x86DD], vlan=[1-4095]> + */ +#define RTE_PTYPE_INNER_L2_ETHER_VLAN 0x00020000 +/** + * QinQ packet type. + * + * Packet format: + * <'ether type'=[0x88A8]> + */ +#define RTE_PTYPE_INNER_L2_ETHER_QINQ 0x00030000 +/** + * Mask of inner layer 2 packet types. + */ +#define RTE_PTYPE_INNER_L2_MASK 0x000f0000 +/** + * IP (Internet Protocol) version 4 packet type. + * It is used for inner packet only, and does not contain any header option. + * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'ihl'=5> + */ +#define RTE_PTYPE_INNER_L3_IPV4 0x00100000 +/** + * IP (Internet Protocol) version 4 packet type. + * It is used for inner packet only, and contains header options. + * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'ihl'=[6-15], 'options'> + */ +#define RTE_PTYPE_INNER_L3_IPV4_EXT 0x00200000 +/** + * IP (Internet Protocol) version 6 packet type. + * It is used for inner packet only, and does not contain any extension header. + * + * Packet format (inner only): + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=0x3B> + */ +#define RTE_PTYPE_INNER_L3_IPV6 0x00300000 +/** + * IP (Internet Protocol) version 4 packet type. + * It is used for inner packet only, and may or maynot contain header options. 
+ * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'ihl'=[5-15], <'options'>> + */ +#define RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN 0x00400000 +/** + * IP (Internet Protocol) version 6 packet type. + * It is used for inner packet only, and contains extension headers. + * + * Packet format (inner only): + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=[0x0|0x2B|0x2C|0x32|0x33|0x3C|0x87], + * 'extension headers'> + */ +#define RTE_PTYPE_INNER_L3_IPV6_EXT 0x00500000 +/** + * IP (Internet Protocol) version 6 packet type. + * It is used for inner packet only, and may or maynot contain extension + * headers. + * + * Packet format (inner only): + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=[0x3B|0x0|0x2B|0x2C|0x32|0x33|0x3C|0x87], + * <'extension headers'>> + */ +#define RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN 0x00600000 +/** + * Mask of inner layer 3 packet types. + */ +#define RTE_PTYPE_INNER_L3_MASK 0x00f00000 +/** + * TCP (Transmission Control Protocol) packet type. + * It is used for inner packet only. + * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=6, 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=6> + */ +#define RTE_PTYPE_INNER_L4_TCP 0x01000000 +/** + * UDP (User Datagram Protocol) packet type. + * It is used for inner packet only. + * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=17, 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=17> + */ +#define RTE_PTYPE_INNER_L4_UDP 0x02000000 +/** + * Fragmented IP (Internet Protocol) packet type. + * It is used for inner packet only, and may or maynot have layer 4 packet. + * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'MF'=1> + * or, + * <'ether type'=0x0800 + * | 'version'=4, 'frag_offset'!=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=44> + */ +#define RTE_PTYPE_INNER_L4_FRAG 0x03000000 +/** + * SCTP (Stream Control Transmission Protocol) packet type. + * It is used for inner packet only. + * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=132, 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=132> + */ +#define RTE_PTYPE_INNER_L4_SCTP 0x04000000 +/** + * ICMP (Internet Control Message Protocol) packet type. + * It is used for inner packet only. + * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'=1, 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'=1> + */ +#define RTE_PTYPE_INNER_L4_ICMP 0x05000000 +/** + * Non-fragmented IP (Internet Protocol) packet type. + * It is used for inner packet only, and may or maynot have other unknown layer + * 4 packet types. + * + * Packet format (inner only): + * <'ether type'=0x0800 + * | 'version'=4, 'protocol'!=[6|17|132|1], 'MF'=0, 'frag_offset'=0> + * or, + * <'ether type'=0x86DD + * | 'version'=6, 'next header'!=[6|17|44|132|1]> + */ +#define RTE_PTYPE_INNER_L4_NONFRAG 0x06000000 +/** + * Mask of inner layer 4 packet types. + */ +#define RTE_PTYPE_INNER_L4_MASK 0x0f000000 +/** + * All valid layer masks. + */ +#define RTE_PTYPE_ALL_MASK 0x0fffffff + +/** + * Check if the (outer) L3 header is IPv4. To avoid comparing IPv4 types one by + * one, bit 4 is selected to be used for IPv4 only. Then checking bit 4 can + * determine if it is an IPV4 packet. 
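/*
 * Editor's note: illustrative sketch, not part of the upstream patch. It
 * shows the intended use of the packet-type constants and helpers in this
 * header: mask out a layer, compare against the RTE_PTYPE_* values, and
 * render the whole field as a string with rte_get_ptype_name(). The ptype
 * value would normally come from m->packet_type as filled in by a PMD.
 */
#include <stdint.h>
#include <stdio.h>
#include <rte_mbuf_ptype.h>

static void
example_classify(uint32_t ptype)
{
        char name[128];

        if (RTE_ETH_IS_IPV4_HDR(ptype) &&
            (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP)
                printf("IPv4/TCP packet\n");

        if (rte_get_ptype_name(ptype, name, sizeof(name)) == 0)
                printf("ptype: %s\n", name);
}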
+ */ +#define RTE_ETH_IS_IPV4_HDR(ptype) ((ptype) & RTE_PTYPE_L3_IPV4) + +/** + * Check if the (outer) L3 header is IPv6. To avoid comparing IPv6 types one by + * one, bit 6 is selected to be used for IPv6 only. Then checking bit 6 can + * determine if it is an IPV6 packet. + */ +#define RTE_ETH_IS_IPV6_HDR(ptype) ((ptype) & RTE_PTYPE_L3_IPV6) + +/* Check if it is a tunneling packet */ +#define RTE_ETH_IS_TUNNEL_PKT(ptype) ((ptype) & \ + (RTE_PTYPE_TUNNEL_MASK | \ + RTE_PTYPE_INNER_L2_MASK | \ + RTE_PTYPE_INNER_L3_MASK | \ + RTE_PTYPE_INNER_L4_MASK)) + +/** + * Get the name of the l2 packet type + * + * @param ptype + * The packet type value. + * @return + * A non-null string describing the packet type. + */ +const char *rte_get_ptype_l2_name(uint32_t ptype); + +/** + * Get the name of the l3 packet type + * + * @param ptype + * The packet type value. + * @return + * A non-null string describing the packet type. + */ +const char *rte_get_ptype_l3_name(uint32_t ptype); + +/** + * Get the name of the l4 packet type + * + * @param ptype + * The packet type value. + * @return + * A non-null string describing the packet type. + */ +const char *rte_get_ptype_l4_name(uint32_t ptype); + +/** + * Get the name of the tunnel packet type + * + * @param ptype + * The packet type value. + * @return + * A non-null string describing the packet type. + */ +const char *rte_get_ptype_tunnel_name(uint32_t ptype); + +/** + * Get the name of the inner_l2 packet type + * + * @param ptype + * The packet type value. + * @return + * A non-null string describing the packet type. + */ +const char *rte_get_ptype_inner_l2_name(uint32_t ptype); + +/** + * Get the name of the inner_l3 packet type + * + * @param ptype + * The packet type value. + * @return + * A non-null string describing the packet type. + */ +const char *rte_get_ptype_inner_l3_name(uint32_t ptype); + +/** + * Get the name of the inner_l4 packet type + * + * @param ptype + * The packet type value. + * @return + * A non-null string describing the packet type. + */ +const char *rte_get_ptype_inner_l4_name(uint32_t ptype); + +/** + * Write the packet type name into the buffer + * + * @param ptype + * The packet type value. + * @param buf + * The buffer where the string is written. + * @param buflen + * The length of the buffer. 
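+ *
+ *   A small illustrative call (the buffer size is arbitrary and 'm' is a
+ *   hypothetical mbuf pointer, not defined by this header):
+ *
+ *   @code
+ *   char name[128];
+ *
+ *   if (rte_get_ptype_name(m->packet_type, name, sizeof(name)) == 0)
+ *	printf("ptype: %s\n", name);
+ *   @endcode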
+ * @return + * - 0 on success + * - (-1) if the buffer is too small + */ +int rte_get_ptype_name(uint32_t ptype, char *buf, size_t buflen); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MBUF_PTYPE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_version.map b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_version.map new file mode 100644 index 00000000..cae68db8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mbuf/rte_mbuf_version.map @@ -0,0 +1,47 @@ +DPDK_2.0 { + global: + + rte_get_rx_ol_flag_name; + rte_get_tx_ol_flag_name; + rte_mbuf_sanity_check; + rte_pktmbuf_dump; + rte_pktmbuf_init; + rte_pktmbuf_pool_init; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_pktmbuf_pool_create; + +} DPDK_2.0; + +DPDK_16.11 { + global: + + __rte_pktmbuf_read; + rte_get_ptype_inner_l2_name; + rte_get_ptype_inner_l3_name; + rte_get_ptype_inner_l4_name; + rte_get_ptype_l2_name; + rte_get_ptype_l3_name; + rte_get_ptype_l4_name; + rte_get_ptype_name; + rte_get_ptype_tunnel_name; + rte_get_rx_ol_flag_list; + rte_get_tx_ol_flag_list; + +} DPDK_2.1; + +DPDK_18.08 { + global: + + rte_mbuf_best_mempool_ops; + rte_mbuf_platform_mempool_ops; + rte_mbuf_set_platform_mempool_ops; + rte_mbuf_set_user_mempool_ops; + rte_mbuf_user_mempool_ops; + rte_pktmbuf_pool_create_by_ops; +} DPDK_16.11; diff --git a/src/spdk/dpdk/lib/librte_member/Makefile b/src/spdk/dpdk/lib/librte_member/Makefile new file mode 100644 index 00000000..e9580c96 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_member.a + +CFLAGS := -I$(SRCDIR) $(CFLAGS) +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -lm +LDLIBS += -lrte_eal -lrte_hash + +EXPORT_MAP := rte_member_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_MEMBER) += rte_member.c rte_member_ht.c rte_member_vbf.c +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_MEMBER)-include := rte_member.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_member/meson.build b/src/spdk/dpdk/lib/librte_member/meson.build new file mode 100644 index 00000000..0e6b34c4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c') +headers = files('rte_member.h') +deps += ['hash'] diff --git a/src/spdk/dpdk/lib/librte_member/rte_member.c b/src/spdk/dpdk/lib/librte_member/rte_member.c new file mode 100644 index 00000000..702c01d3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/rte_member.c @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include + +#include +#include +#include +#include +#include + +#include "rte_member.h" +#include "rte_member_ht.h" +#include "rte_member_vbf.h" + +int librte_member_logtype; + +TAILQ_HEAD(rte_member_list, rte_tailq_entry); +static struct rte_tailq_elem rte_member_tailq = { + .name = "RTE_MEMBER", +}; +EAL_REGISTER_TAILQ(rte_member_tailq) + +struct rte_member_setsum * +rte_member_find_existing(const char *name) +{ + struct rte_member_setsum *setsum = NULL; + struct rte_tailq_entry *te; + struct rte_member_list *member_list; + + member_list = RTE_TAILQ_CAST(rte_member_tailq.head, rte_member_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, member_list, next) { + setsum = (struct 
rte_member_setsum *) te->data; + if (strncmp(name, setsum->name, RTE_MEMBER_NAMESIZE) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + return setsum; +} + +void +rte_member_free(struct rte_member_setsum *setsum) +{ + struct rte_member_list *member_list; + struct rte_tailq_entry *te; + + if (setsum == NULL) + return; + member_list = RTE_TAILQ_CAST(rte_member_tailq.head, rte_member_list); + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, member_list, next) { + if (te->data == (void *)setsum) + break; + } + if (te == NULL) { + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return; + } + TAILQ_REMOVE(member_list, te, next); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + rte_member_free_ht(setsum); + break; + case RTE_MEMBER_TYPE_VBF: + rte_member_free_vbf(setsum); + break; + default: + break; + } + rte_free(setsum); + rte_free(te); +} + +struct rte_member_setsum * +rte_member_create(const struct rte_member_parameters *params) +{ + struct rte_tailq_entry *te; + struct rte_member_list *member_list; + struct rte_member_setsum *setsum; + int ret; + + if (params == NULL) { + rte_errno = EINVAL; + return NULL; + } + + if (params->key_len == 0 || + params->prim_hash_seed == params->sec_hash_seed) { + rte_errno = EINVAL; + RTE_MEMBER_LOG(ERR, "Create setsummary with " + "invalid parameters\n"); + return NULL; + } + + member_list = RTE_TAILQ_CAST(rte_member_tailq.head, rte_member_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + TAILQ_FOREACH(te, member_list, next) { + setsum = te->data; + if (strncmp(params->name, setsum->name, + RTE_MEMBER_NAMESIZE) == 0) + break; + } + setsum = NULL; + if (te != NULL) { + rte_errno = EEXIST; + te = NULL; + goto error_unlock_exit; + } + te = rte_zmalloc("MEMBER_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_MEMBER_LOG(ERR, "tailq entry allocation failed\n"); + goto error_unlock_exit; + } + + /* Create a new setsum structure */ + setsum = rte_zmalloc_socket(params->name, + sizeof(struct rte_member_setsum), RTE_CACHE_LINE_SIZE, + params->socket_id); + if (setsum == NULL) { + RTE_MEMBER_LOG(ERR, "Create setsummary failed\n"); + goto error_unlock_exit; + } + snprintf(setsum->name, sizeof(setsum->name), "%s", params->name); + setsum->type = params->type; + setsum->socket_id = params->socket_id; + setsum->key_len = params->key_len; + setsum->num_set = params->num_set; + setsum->prim_hash_seed = params->prim_hash_seed; + setsum->sec_hash_seed = params->sec_hash_seed; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + ret = rte_member_create_ht(setsum, params); + break; + case RTE_MEMBER_TYPE_VBF: + ret = rte_member_create_vbf(setsum, params); + break; + default: + goto error_unlock_exit; + } + if (ret < 0) + goto error_unlock_exit; + + RTE_MEMBER_LOG(DEBUG, "Creating a setsummary table with " + "mode %u\n", setsum->type); + + te->data = (void *)setsum; + TAILQ_INSERT_TAIL(member_list, te, next); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return setsum; + +error_unlock_exit: + rte_free(te); + rte_free(setsum); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return NULL; +} + +int +rte_member_add(const struct rte_member_setsum *setsum, const void *key, + member_set_t set_id) +{ + if (setsum == NULL || key == NULL) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + return rte_member_add_ht(setsum, key, set_id); + case RTE_MEMBER_TYPE_VBF: + return 
rte_member_add_vbf(setsum, key, set_id); + default: + return -EINVAL; + } +} + +int +rte_member_lookup(const struct rte_member_setsum *setsum, const void *key, + member_set_t *set_id) +{ + if (setsum == NULL || key == NULL || set_id == NULL) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + return rte_member_lookup_ht(setsum, key, set_id); + case RTE_MEMBER_TYPE_VBF: + return rte_member_lookup_vbf(setsum, key, set_id); + default: + return -EINVAL; + } +} + +int +rte_member_lookup_bulk(const struct rte_member_setsum *setsum, + const void **keys, uint32_t num_keys, + member_set_t *set_ids) +{ + if (setsum == NULL || keys == NULL || set_ids == NULL) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + return rte_member_lookup_bulk_ht(setsum, keys, num_keys, + set_ids); + case RTE_MEMBER_TYPE_VBF: + return rte_member_lookup_bulk_vbf(setsum, keys, num_keys, + set_ids); + default: + return -EINVAL; + } +} + +int +rte_member_lookup_multi(const struct rte_member_setsum *setsum, const void *key, + uint32_t match_per_key, member_set_t *set_id) +{ + if (setsum == NULL || key == NULL || set_id == NULL) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + return rte_member_lookup_multi_ht(setsum, key, match_per_key, + set_id); + case RTE_MEMBER_TYPE_VBF: + return rte_member_lookup_multi_vbf(setsum, key, match_per_key, + set_id); + default: + return -EINVAL; + } +} + +int +rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum, + const void **keys, uint32_t num_keys, + uint32_t max_match_per_key, uint32_t *match_count, + member_set_t *set_ids) +{ + if (setsum == NULL || keys == NULL || set_ids == NULL || + match_count == NULL) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + return rte_member_lookup_multi_bulk_ht(setsum, keys, num_keys, + max_match_per_key, match_count, set_ids); + case RTE_MEMBER_TYPE_VBF: + return rte_member_lookup_multi_bulk_vbf(setsum, keys, num_keys, + max_match_per_key, match_count, set_ids); + default: + return -EINVAL; + } +} + +int +rte_member_delete(const struct rte_member_setsum *setsum, const void *key, + member_set_t set_id) +{ + if (setsum == NULL || key == NULL) + return -EINVAL; + + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + return rte_member_delete_ht(setsum, key, set_id); + /* current vBF implementation does not support delete function */ + case RTE_MEMBER_TYPE_VBF: + default: + return -EINVAL; + } +} + +void +rte_member_reset(const struct rte_member_setsum *setsum) +{ + if (setsum == NULL) + return; + switch (setsum->type) { + case RTE_MEMBER_TYPE_HT: + rte_member_reset_ht(setsum); + return; + case RTE_MEMBER_TYPE_VBF: + rte_member_reset_vbf(setsum); + return; + default: + return; + } +} + +RTE_INIT(librte_member_init_log) +{ + librte_member_logtype = rte_log_register("lib.member"); + if (librte_member_logtype >= 0) + rte_log_set_level(librte_member_logtype, RTE_LOG_DEBUG); +} diff --git a/src/spdk/dpdk/lib/librte_member/rte_member.h b/src/spdk/dpdk/lib/librte_member/rte_member.h new file mode 100644 index 00000000..ab2b2321 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/rte_member.h @@ -0,0 +1,490 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +/** + * @file + * + * RTE Membership Library + * + * The Membership Library is an extension and generalization of a traditional + * filter (for example Bloom Filter and cuckoo filter) structure that has + * multiple usages in a variety of workloads and 
applications. The library is + * used to test if a key belongs to certain sets. Two types of such + * "set-summary" structures are implemented: hash-table based (HT) and vector + * bloom filter (vBF). For HT setsummary, two subtypes or modes are available, + * cache and non-cache modes. The table below summarize some properties of + * the different implementations. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +/** + * + */ + +#ifndef _RTE_MEMBER_H_ +#define _RTE_MEMBER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include + +/** The set ID type that stored internally in hash table based set summary. */ +typedef uint16_t member_set_t; +/** Invalid set ID used to mean no match found. */ +#define RTE_MEMBER_NO_MATCH 0 +/** Maximum size of hash table that can be created. */ +#define RTE_MEMBER_ENTRIES_MAX (1 << 30) +/** Maximum number of keys that can be searched as a bulk */ +#define RTE_MEMBER_LOOKUP_BULK_MAX 64 +/** Entry count per bucket in hash table based mode. */ +#define RTE_MEMBER_BUCKET_ENTRIES 16 +/** Maximum number of characters in setsum name. */ +#define RTE_MEMBER_NAMESIZE 32 + +/** @internal Hash function used by membership library. */ +#if defined(RTE_ARCH_X86) || defined(RTE_MACHINE_CPUFLAG_CRC32) +#include +#define MEMBER_HASH_FUNC rte_hash_crc +#else +#include +#define MEMBER_HASH_FUNC rte_jhash +#endif + +extern int librte_member_logtype; + +#define RTE_MEMBER_LOG(level, ...) \ + rte_log(RTE_LOG_ ## level, \ + librte_member_logtype, \ + RTE_FMT("%s(): " RTE_FMT_HEAD(__VA_ARGS__,), \ + __func__, \ + RTE_FMT_TAIL(__VA_ARGS__,))) + +/** @internal setsummary structure. */ +struct rte_member_setsum; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Parameter struct used to create set summary + */ +struct rte_member_parameters; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Define different set summary types + */ +enum rte_member_setsum_type { + RTE_MEMBER_TYPE_HT = 0, /**< Hash table based set summary. */ + RTE_MEMBER_TYPE_VBF, /**< Vector of bloom filters. */ + RTE_MEMBER_NUM_TYPE +}; + +/** @internal compare function for different arch. */ +enum rte_member_sig_compare_function { + RTE_MEMBER_COMPARE_SCALAR = 0, + RTE_MEMBER_COMPARE_AVX2, + RTE_MEMBER_COMPARE_NUM +}; + +/** @internal setsummary structure. */ +struct rte_member_setsum { + enum rte_member_setsum_type type; /* Type of the set summary. */ + uint32_t key_len; /* Length of key. */ + uint32_t prim_hash_seed; /* Primary hash function seed. */ + uint32_t sec_hash_seed; /* Secondary hash function seed. */ + + /* Hash table based. */ + uint32_t bucket_cnt; /* Number of buckets. */ + uint32_t bucket_mask; /* Bit mask to get bucket index. */ + /* For runtime selecting AVX, scalar, etc for signature comparison. */ + enum rte_member_sig_compare_function sig_cmp_fn; + uint8_t cache; /* If it is cache mode for ht based. */ + + /* Vector bloom filter. */ + uint32_t num_set; /* Number of set (bf) in vbf. */ + uint32_t bits; /* Number of bits in each bf. */ + uint32_t bit_mask; /* Bit mask to get bit location in bf. */ + uint32_t num_hashes; /* Number of hash values to index bf. */ + + uint32_t mul_shift; /* vbf internal variable used during bit test. */ + uint32_t div_shift; /* vbf internal variable used during bit test. */ + + void *table; /* This is the handler of hash table or vBF array. */ + + + /* Second cache line should start here. 
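+	 * The fields above are the ones read on every lookup/add; socket_id
+	 * and name below are configuration data that is not touched on the
+	 * data path, so they are deliberately kept past this boundary.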
*/ + uint32_t socket_id; /* NUMA Socket ID for memory. */ + char name[RTE_MEMBER_NAMESIZE]; /* Name of this set summary. */ +} __rte_cache_aligned; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Parameters used when create the set summary table. Currently user can + * specify two types of setsummary: HT based and vBF. For HT based, user can + * specify cache or non-cache mode. Here is a table to describe some differences + * + */ +struct rte_member_parameters { + const char *name; /**< Name of the hash. */ + + /** + * User to specify the type of the setsummary from one of + * rte_member_setsum_type. + * + * HT based setsummary is implemented like a hash table. User should use + * this type when there are many sets. + * + * vBF setsummary is a vector of bloom filters. It is used when number + * of sets is not big (less than 32 for current implementation). + */ + enum rte_member_setsum_type type; + + /** + * is_cache is only used for HT based setsummary. + * + * If it is HT based setsummary, user to specify the subtype or mode + * of the setsummary. It could be cache, or non-cache mode. + * Set is_cache to be 1 if to use as cache mode. + * + * For cache mode, keys can be evicted out of the HT setsummary. Keys + * with the same signature and map to the same bucket + * will overwrite each other in the setsummary table. + * This mode is useful for the case that the set-summary only + * needs to keep record of the recently inserted keys. Both + * false-negative and false-positive could happen. + * + * For non-cache mode, keys cannot be evicted out of the cache. So for + * this mode the setsummary will become full eventually. Keys with the + * same signature but map to the same bucket will still occupy multiple + * entries. This mode does not give false-negative result. + */ + uint8_t is_cache; + + /** + * For HT setsummary, num_keys equals to the number of entries of the + * table. When the number of keys inserted in the HT setsummary + * approaches this number, eviction could happen. For cache mode, + * keys could be evicted out of the table. For non-cache mode, keys will + * be evicted to other buckets like cuckoo hash. The table will also + * likely to become full before the number of inserted keys equal to the + * total number of entries. + * + * For vBF, num_keys equal to the expected number of keys that will + * be inserted into the vBF. The implementation assumes the keys are + * evenly distributed to each BF in vBF. This is used to calculate the + * number of bits we need for each BF. User does not specify the size of + * each BF directly because the optimal size depends on the num_keys + * and false positive rate. + */ + uint32_t num_keys; + + /** + * The length of key is used for hash calculation. Since key is not + * stored in set-summary, large key does not require more memory space. + */ + uint32_t key_len; + + /** + * num_set is only used for vBF, but not used for HT setsummary. + * + * num_set is equal to the number of BFs in vBF. For current + * implementation, it only supports 1,2,4,8,16,32 BFs in one vBF set + * summary. If other number of sets are needed, for example 5, the user + * should allocate the minimum available value that larger than 5, + * which is 8. + */ + uint32_t num_set; + + /** + * false_positive_rate is only used for vBF, but not used for HT + * setsummary. + * + * For vBF, false_positive_rate is the user-defined false positive rate + * given expected number of inserted keys (num_keys). 
It is used to + * calculate the total number of bits for each BF, and the number of + * hash values used during lookup and insertion. For details please + * refer to vBF implementation and membership library documentation. + * + * For HT, This parameter is not directly set by users. + * HT setsummary's false positive rate is in the order of: + * false_pos = (1/bucket_count)*(1/2^16), since we use 16-bit signature. + * This is because two keys needs to map to same bucket and same + * signature to have a collision (false positive). bucket_count is equal + * to number of entries (num_keys) divided by entry count per bucket + * (RTE_MEMBER_BUCKET_ENTRIES). Thus, the false_positive_rate is not + * directly set by users for HT mode. + */ + float false_positive_rate; + + /** + * We use two seeds to calculate two independent hashes for each key. + * + * For HT type, one hash is used as signature, and the other is used + * for bucket location. + * For vBF type, these two hashes and their combinations are used as + * hash locations to index the bit array. + */ + uint32_t prim_hash_seed; + + /** + * The secondary seed should be a different value from the primary seed. + */ + uint32_t sec_hash_seed; + + int socket_id; /**< NUMA Socket ID for memory. */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Find an existing set-summary and return a pointer to it. + * + * @param name + * Name of the set-summary. + * @return + * Pointer to the set-summary or NULL if object not found + * with rte_errno set appropriately. Possible rte_errno values include: + * - ENOENT - value not available for return + */ +struct rte_member_setsum * +rte_member_find_existing(const char *name); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Create set-summary (SS). + * + * @param params + * Parameters to initialize the setsummary. + * @return + * Return the pointer to the setsummary. + * Return value is NULL if the creation failed. + */ +struct rte_member_setsum * +rte_member_create(const struct rte_member_parameters *params); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Lookup key in set-summary (SS). + * Single key lookup and return as soon as the first match found + * + * @param setsum + * Pointer of a setsummary. + * @param key + * Pointer of the key to be looked up. + * @param set_id + * Output the set id matches the key. + * @return + * Return 1 for found a match and 0 for not found a match. + */ +int +rte_member_lookup(const struct rte_member_setsum *setsum, const void *key, + member_set_t *set_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Lookup bulk of keys in set-summary (SS). + * Each key lookup returns as soon as the first match found + * + * @param setsum + * Pointer of a setsummary. + * @param keys + * Pointer of the bulk of keys to be looked up. + * @param num_keys + * Number of keys that will be lookup. + * @param set_ids + * Output set ids for all the keys to this array. + * User should preallocate array that can contain all results, which size is + * the num_keys. + * @return + * The number of keys that found a match. + */ +int +rte_member_lookup_bulk(const struct rte_member_setsum *setsum, + const void **keys, uint32_t num_keys, + member_set_t *set_ids); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Lookup a key in set-summary (SS) for multiple matches. 
+ * The key lookup will find all matched entries (multiple match). + * Note that for cache mode of HT, each key can have at most one match. This is + * because keys with same signature that maps to same bucket will overwrite + * each other. So multi-match lookup should be used for vBF and non-cache HT. + * + * @param setsum + * Pointer of a set-summary. + * @param key + * Pointer of the key that to be looked up. + * @param max_match_per_key + * User specified maximum number of matches for each key. The function returns + * as soon as this number of matches found for the key. + * @param set_id + * Output set ids for all the matches of the key. User needs to preallocate + * the array that can contain max_match_per_key number of results. + * @return + * The number of matches that found for the key. + * For cache mode HT set-summary, the number should be at most 1. + */ +int +rte_member_lookup_multi(const struct rte_member_setsum *setsum, + const void *key, uint32_t max_match_per_key, + member_set_t *set_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Lookup a bulk of keys in set-summary (SS) for multiple matches each key. + * Each key lookup will find all matched entries (multiple match). + * Note that for cache mode HT, each key can have at most one match. So + * multi-match function is mainly used for vBF and non-cache mode HT. + * + * @param setsum + * Pointer of a setsummary. + * @param keys + * Pointer of the keys to be looked up. + * @param num_keys + * The number of keys that will be lookup. + * @param max_match_per_key + * The possible maximum number of matches for each key. + * @param match_count + * Output the number of matches for each key in an array. + * @param set_ids + * Return set ids for all the matches of all keys. Users pass in a + * preallocated 2D array with first dimension as key index and second + * dimension as match index. For example set_ids[bulk_size][max_match_per_key] + * @return + * The number of keys that found one or more matches in the set-summary. + */ +int +rte_member_lookup_multi_bulk(const struct rte_member_setsum *setsum, + const void **keys, uint32_t num_keys, + uint32_t max_match_per_key, + uint32_t *match_count, + member_set_t *set_ids); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Insert key into set-summary (SS). + * + * @param setsum + * Pointer of a set-summary. + * @param key + * Pointer of the key to be added. + * @param set_id + * The set id associated with the key that needs to be added. Different mode + * supports different set_id ranges. 0 cannot be used as set_id since + * RTE_MEMBER_NO_MATCH by default is set as 0. + * For HT mode, the set_id has range as [1, 0x7FFF], MSB is reserved. + * For vBF mode the set id is limited by the num_set parameter when create + * the set-summary. + * @return + * HT (cache mode) and vBF should never fail unless the set_id is not in the + * valid range. In such case -EINVAL is returned. + * For HT (non-cache mode) it could fail with -ENOSPC error code when table is + * full. + * For success it returns different values for different modes to provide + * extra information for users. + * Return 0 for HT (cache mode) if the add does not cause + * eviction, return 1 otherwise. Return 0 for non-cache mode if success, + * -ENOSPC for full, and 1 if cuckoo eviction happens. + * Always returns 0 for vBF mode. 
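+ *
+ * A minimal usage sketch (illustrative only; 'ss' is a setsummary created
+ * earlier with rte_member_create(), 'key' points to key_len bytes of key
+ * data, and use_set() is a placeholder):
+ *
+ * @code
+ * member_set_t set;
+ *
+ * if (rte_member_add(ss, key, 1) >= 0 &&
+ *		rte_member_lookup(ss, key, &set) == 1)
+ *	use_set(set);
+ * @endcode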
+ */ +int +rte_member_add(const struct rte_member_setsum *setsum, const void *key, + member_set_t set_id); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * De-allocate memory used by set-summary. + * + * @param setsum + * Pointer to the set summary. + */ +void +rte_member_free(struct rte_member_setsum *setsum); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Reset the set-summary tables. E.g. reset bits to be 0 in BF, + * reset set_id in each entry to be RTE_MEMBER_NO_MATCH in HT based SS. + * + * @param setsum + * Pointer to the set-summary. + */ +void +rte_member_reset(const struct rte_member_setsum *setsum); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Delete items from the set-summary. Note that vBF does not support deletion + * in current implementation. For vBF, error code of -EINVAL will be returned. + * + * @param setsum + * Pointer to the set-summary. + * @param key + * Pointer of the key to be deleted. + * @param set_id + * For HT mode, we need both key and its corresponding set_id to + * properly delete the key. Without set_id, we may delete other keys with the + * same signature. + * @return + * If no entry found to delete, an error code of -ENOENT could be returned. + */ +int +rte_member_delete(const struct rte_member_setsum *setsum, const void *key, + member_set_t set_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMBER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_member/rte_member_ht.c b/src/spdk/dpdk/lib/librte_member/rte_member_ht.c new file mode 100644 index 00000000..cbcd0d44 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/rte_member_ht.c @@ -0,0 +1,557 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include "rte_member.h" +#include "rte_member_ht.h" + +#if defined(RTE_ARCH_X86) +#include "rte_member_x86.h" +#endif + +/* Search bucket for entry with tmp_sig and update set_id */ +static inline int +update_entry_search(uint32_t bucket_id, member_sig_t tmp_sig, + struct member_ht_bucket *buckets, + member_set_t set_id) +{ + uint32_t i; + + for (i = 0; i < RTE_MEMBER_BUCKET_ENTRIES; i++) { + if (buckets[bucket_id].sigs[i] == tmp_sig) { + buckets[bucket_id].sets[i] = set_id; + return 1; + } + } + return 0; +} + +static inline int +search_bucket_single(uint32_t bucket_id, member_sig_t tmp_sig, + struct member_ht_bucket *buckets, + member_set_t *set_id) +{ + uint32_t iter; + + for (iter = 0; iter < RTE_MEMBER_BUCKET_ENTRIES; iter++) { + if (tmp_sig == buckets[bucket_id].sigs[iter] && + buckets[bucket_id].sets[iter] != + RTE_MEMBER_NO_MATCH) { + *set_id = buckets[bucket_id].sets[iter]; + return 1; + } + } + return 0; +} + +static inline void +search_bucket_multi(uint32_t bucket_id, member_sig_t tmp_sig, + struct member_ht_bucket *buckets, + uint32_t *counter, + uint32_t matches_per_key, + member_set_t *set_id) +{ + uint32_t iter; + + for (iter = 0; iter < RTE_MEMBER_BUCKET_ENTRIES; iter++) { + if (tmp_sig == buckets[bucket_id].sigs[iter] && + buckets[bucket_id].sets[iter] != + RTE_MEMBER_NO_MATCH) { + set_id[*counter] = buckets[bucket_id].sets[iter]; + (*counter)++; + if (*counter >= matches_per_key) + return; + } + } +} + +int +rte_member_create_ht(struct rte_member_setsum *ss, + const struct rte_member_parameters *params) +{ + uint32_t i, j; + uint32_t size_bucket_t; + uint32_t num_entries = rte_align32pow2(params->num_keys); + + if 
((num_entries > RTE_MEMBER_ENTRIES_MAX) || + !rte_is_power_of_2(RTE_MEMBER_BUCKET_ENTRIES) || + num_entries < RTE_MEMBER_BUCKET_ENTRIES) { + rte_errno = EINVAL; + RTE_MEMBER_LOG(ERR, + "Membership HT create with invalid parameters\n"); + return -EINVAL; + } + + uint32_t num_buckets = num_entries / RTE_MEMBER_BUCKET_ENTRIES; + + size_bucket_t = sizeof(struct member_ht_bucket); + + struct member_ht_bucket *buckets = rte_zmalloc_socket(NULL, + num_buckets * size_bucket_t, + RTE_CACHE_LINE_SIZE, ss->socket_id); + + if (buckets == NULL) { + RTE_MEMBER_LOG(ERR, "memory allocation failed for HT " + "setsummary\n"); + return -ENOMEM; + } + + ss->table = buckets; + ss->bucket_cnt = num_buckets; + ss->bucket_mask = num_buckets - 1; + ss->cache = params->is_cache; + + for (i = 0; i < num_buckets; i++) { + for (j = 0; j < RTE_MEMBER_BUCKET_ENTRIES; j++) + buckets[i].sets[j] = RTE_MEMBER_NO_MATCH; + } +#if defined(RTE_ARCH_X86) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) && + RTE_MEMBER_BUCKET_ENTRIES == 16) + ss->sig_cmp_fn = RTE_MEMBER_COMPARE_AVX2; + else +#endif + ss->sig_cmp_fn = RTE_MEMBER_COMPARE_SCALAR; + + RTE_MEMBER_LOG(DEBUG, "Hash table based filter created, " + "the table has %u entries, %u buckets\n", + num_entries, num_buckets); + return 0; +} + +static inline void +get_buckets_index(const struct rte_member_setsum *ss, const void *key, + uint32_t *prim_bkt, uint32_t *sec_bkt, member_sig_t *sig) +{ + uint32_t first_hash = MEMBER_HASH_FUNC(key, ss->key_len, + ss->prim_hash_seed); + uint32_t sec_hash = MEMBER_HASH_FUNC(&first_hash, sizeof(uint32_t), + ss->sec_hash_seed); + /* + * We use the first hash value for the signature, and the second hash + * value to derive the primary and secondary bucket locations. + * + * For non-cache mode, we use the lower bits for the primary bucket + * location. Then we xor primary bucket location and the signature + * to get the secondary bucket location. This is called "partial-key + * cuckoo hashing" proposed by B. Fan, et al's paper + * "Cuckoo Filter: Practically Better Than Bloom". The benefit to use + * xor is that one could derive the alternative bucket location + * by only using the current bucket location and the signature. This is + * generally required by non-cache mode's eviction and deletion + * process without the need to store alternative hash value nor the full + * key. + * + * For cache mode, we use the lower bits for the primary bucket + * location and the higher bits for the secondary bucket location. In + * cache mode, keys are simply overwritten if bucket is full. We do not + * use xor since lower/higher bits are more independent hash values thus + * should provide slightly better table load. 
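+	 *
+	 * Concretely, with the values computed below:
+	 *   non-cache: prim = sec_hash & bucket_mask,
+	 *              sec  = (prim ^ sig) & bucket_mask,
+	 *              so prim is always recoverable as (sec ^ sig) & bucket_mask.
+	 *   cache:     prim = sec_hash & bucket_mask,
+	 *              sec  = (sec_hash >> 16) & bucket_mask.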
+ */ + *sig = first_hash; + if (ss->cache) { + *prim_bkt = sec_hash & ss->bucket_mask; + *sec_bkt = (sec_hash >> 16) & ss->bucket_mask; + } else { + *prim_bkt = sec_hash & ss->bucket_mask; + *sec_bkt = (*prim_bkt ^ *sig) & ss->bucket_mask; + } +} + +int +rte_member_lookup_ht(const struct rte_member_setsum *ss, + const void *key, member_set_t *set_id) +{ + uint32_t prim_bucket, sec_bucket; + member_sig_t tmp_sig; + struct member_ht_bucket *buckets = ss->table; + + *set_id = RTE_MEMBER_NO_MATCH; + get_buckets_index(ss, key, &prim_bucket, &sec_bucket, &tmp_sig); + + switch (ss->sig_cmp_fn) { +#if defined(RTE_ARCH_X86) && defined(RTE_MACHINE_CPUFLAG_AVX2) + case RTE_MEMBER_COMPARE_AVX2: + if (search_bucket_single_avx(prim_bucket, tmp_sig, buckets, + set_id) || + search_bucket_single_avx(sec_bucket, tmp_sig, + buckets, set_id)) + return 1; + break; +#endif + default: + if (search_bucket_single(prim_bucket, tmp_sig, buckets, + set_id) || + search_bucket_single(sec_bucket, tmp_sig, + buckets, set_id)) + return 1; + } + + return 0; +} + +uint32_t +rte_member_lookup_bulk_ht(const struct rte_member_setsum *ss, + const void **keys, uint32_t num_keys, member_set_t *set_id) +{ + uint32_t i; + uint32_t num_matches = 0; + struct member_ht_bucket *buckets = ss->table; + member_sig_t tmp_sig[RTE_MEMBER_LOOKUP_BULK_MAX]; + uint32_t prim_buckets[RTE_MEMBER_LOOKUP_BULK_MAX]; + uint32_t sec_buckets[RTE_MEMBER_LOOKUP_BULK_MAX]; + + for (i = 0; i < num_keys; i++) { + get_buckets_index(ss, keys[i], &prim_buckets[i], + &sec_buckets[i], &tmp_sig[i]); + rte_prefetch0(&buckets[prim_buckets[i]]); + rte_prefetch0(&buckets[sec_buckets[i]]); + } + + for (i = 0; i < num_keys; i++) { + switch (ss->sig_cmp_fn) { +#if defined(RTE_ARCH_X86) && defined(RTE_MACHINE_CPUFLAG_AVX2) + case RTE_MEMBER_COMPARE_AVX2: + if (search_bucket_single_avx(prim_buckets[i], + tmp_sig[i], buckets, &set_id[i]) || + search_bucket_single_avx(sec_buckets[i], + tmp_sig[i], buckets, &set_id[i])) + num_matches++; + else + set_id[i] = RTE_MEMBER_NO_MATCH; + break; +#endif + default: + if (search_bucket_single(prim_buckets[i], tmp_sig[i], + buckets, &set_id[i]) || + search_bucket_single(sec_buckets[i], + tmp_sig[i], buckets, &set_id[i])) + num_matches++; + else + set_id[i] = RTE_MEMBER_NO_MATCH; + } + } + return num_matches; +} + +uint32_t +rte_member_lookup_multi_ht(const struct rte_member_setsum *ss, + const void *key, uint32_t match_per_key, + member_set_t *set_id) +{ + uint32_t num_matches = 0; + uint32_t prim_bucket, sec_bucket; + member_sig_t tmp_sig; + struct member_ht_bucket *buckets = ss->table; + + get_buckets_index(ss, key, &prim_bucket, &sec_bucket, &tmp_sig); + + switch (ss->sig_cmp_fn) { +#if defined(RTE_ARCH_X86) && defined(RTE_MACHINE_CPUFLAG_AVX2) + case RTE_MEMBER_COMPARE_AVX2: + search_bucket_multi_avx(prim_bucket, tmp_sig, buckets, + &num_matches, match_per_key, set_id); + if (num_matches < match_per_key) + search_bucket_multi_avx(sec_bucket, tmp_sig, + buckets, &num_matches, match_per_key, set_id); + return num_matches; +#endif + default: + search_bucket_multi(prim_bucket, tmp_sig, buckets, &num_matches, + match_per_key, set_id); + if (num_matches < match_per_key) + search_bucket_multi(sec_bucket, tmp_sig, + buckets, &num_matches, match_per_key, set_id); + return num_matches; + } +} + +uint32_t +rte_member_lookup_multi_bulk_ht(const struct rte_member_setsum *ss, + const void **keys, uint32_t num_keys, uint32_t match_per_key, + uint32_t *match_count, + member_set_t *set_ids) +{ + uint32_t i; + uint32_t num_matches = 0; + struct 
member_ht_bucket *buckets = ss->table; + uint32_t match_cnt_tmp; + member_sig_t tmp_sig[RTE_MEMBER_LOOKUP_BULK_MAX]; + uint32_t prim_buckets[RTE_MEMBER_LOOKUP_BULK_MAX]; + uint32_t sec_buckets[RTE_MEMBER_LOOKUP_BULK_MAX]; + + for (i = 0; i < num_keys; i++) { + get_buckets_index(ss, keys[i], &prim_buckets[i], + &sec_buckets[i], &tmp_sig[i]); + rte_prefetch0(&buckets[prim_buckets[i]]); + rte_prefetch0(&buckets[sec_buckets[i]]); + } + for (i = 0; i < num_keys; i++) { + match_cnt_tmp = 0; + + switch (ss->sig_cmp_fn) { +#if defined(RTE_ARCH_X86) && defined(RTE_MACHINE_CPUFLAG_AVX2) + case RTE_MEMBER_COMPARE_AVX2: + search_bucket_multi_avx(prim_buckets[i], tmp_sig[i], + buckets, &match_cnt_tmp, match_per_key, + &set_ids[i*match_per_key]); + if (match_cnt_tmp < match_per_key) + search_bucket_multi_avx(sec_buckets[i], + tmp_sig[i], buckets, &match_cnt_tmp, + match_per_key, + &set_ids[i*match_per_key]); + match_count[i] = match_cnt_tmp; + if (match_cnt_tmp != 0) + num_matches++; + break; +#endif + default: + search_bucket_multi(prim_buckets[i], tmp_sig[i], + buckets, &match_cnt_tmp, match_per_key, + &set_ids[i*match_per_key]); + if (match_cnt_tmp < match_per_key) + search_bucket_multi(sec_buckets[i], tmp_sig[i], + buckets, &match_cnt_tmp, match_per_key, + &set_ids[i*match_per_key]); + match_count[i] = match_cnt_tmp; + if (match_cnt_tmp != 0) + num_matches++; + } + } + return num_matches; +} + +static inline int +try_insert(struct member_ht_bucket *buckets, uint32_t prim, uint32_t sec, + member_sig_t sig, member_set_t set_id) +{ + int i; + /* If not full then insert into one slot */ + for (i = 0; i < RTE_MEMBER_BUCKET_ENTRIES; i++) { + if (buckets[prim].sets[i] == RTE_MEMBER_NO_MATCH) { + buckets[prim].sigs[i] = sig; + buckets[prim].sets[i] = set_id; + return 0; + } + } + /* If prim failed, we need to access second bucket */ + for (i = 0; i < RTE_MEMBER_BUCKET_ENTRIES; i++) { + if (buckets[sec].sets[i] == RTE_MEMBER_NO_MATCH) { + buckets[sec].sigs[i] = sig; + buckets[sec].sets[i] = set_id; + return 0; + } + } + return -1; +} + +static inline int +try_update(struct member_ht_bucket *buckets, uint32_t prim, uint32_t sec, + member_sig_t sig, member_set_t set_id, + enum rte_member_sig_compare_function cmp_fn) +{ + switch (cmp_fn) { +#if defined(RTE_ARCH_X86) && defined(RTE_MACHINE_CPUFLAG_AVX2) + case RTE_MEMBER_COMPARE_AVX2: + if (update_entry_search_avx(prim, sig, buckets, set_id) || + update_entry_search_avx(sec, sig, buckets, + set_id)) + return 0; + break; +#endif + default: + if (update_entry_search(prim, sig, buckets, set_id) || + update_entry_search(sec, sig, buckets, + set_id)) + return 0; + } + return -1; +} + +static inline int +evict_from_bucket(void) +{ + /* For now, we randomly pick one entry to evict */ + return rte_rand() & (RTE_MEMBER_BUCKET_ENTRIES - 1); +} + +/* + * This function is similar to the cuckoo hash make_space function in hash + * library + */ +static inline int +make_space_bucket(const struct rte_member_setsum *ss, uint32_t bkt_idx, + unsigned int *nr_pushes) +{ + unsigned int i, j; + int ret; + struct member_ht_bucket *buckets = ss->table; + uint32_t next_bucket_idx; + struct member_ht_bucket *next_bkt[RTE_MEMBER_BUCKET_ENTRIES]; + struct member_ht_bucket *bkt = &buckets[bkt_idx]; + /* MSB is set to indicate if an entry has been already pushed */ + member_set_t flag_mask = 1U << (sizeof(member_set_t) * 8 - 1); + + /* + * Push existing item (search for bucket with space in + * alternative locations) to its alternative location + */ + for (i = 0; i < 
RTE_MEMBER_BUCKET_ENTRIES; i++) { + /* Search for space in alternative locations */ + next_bucket_idx = (bkt->sigs[i] ^ bkt_idx) & ss->bucket_mask; + next_bkt[i] = &buckets[next_bucket_idx]; + for (j = 0; j < RTE_MEMBER_BUCKET_ENTRIES; j++) { + if (next_bkt[i]->sets[j] == RTE_MEMBER_NO_MATCH) + break; + } + + if (j != RTE_MEMBER_BUCKET_ENTRIES) + break; + } + + /* Alternative location has spare room (end of recursive function) */ + if (i != RTE_MEMBER_BUCKET_ENTRIES) { + next_bkt[i]->sigs[j] = bkt->sigs[i]; + next_bkt[i]->sets[j] = bkt->sets[i]; + return i; + } + + /* Pick entry that has not been pushed yet */ + for (i = 0; i < RTE_MEMBER_BUCKET_ENTRIES; i++) + if ((bkt->sets[i] & flag_mask) == 0) + break; + + /* All entries have been pushed, so entry cannot be added */ + if (i == RTE_MEMBER_BUCKET_ENTRIES || + ++(*nr_pushes) > RTE_MEMBER_MAX_PUSHES) + return -ENOSPC; + + next_bucket_idx = (bkt->sigs[i] ^ bkt_idx) & ss->bucket_mask; + /* Set flag to indicate that this entry is going to be pushed */ + bkt->sets[i] |= flag_mask; + + /* Need room in alternative bucket to insert the pushed entry */ + ret = make_space_bucket(ss, next_bucket_idx, nr_pushes); + /* + * After recursive function. + * Clear flags and insert the pushed entry + * in its alternative location if successful, + * or return error + */ + bkt->sets[i] &= ~flag_mask; + if (ret >= 0) { + next_bkt[i]->sigs[ret] = bkt->sigs[i]; + next_bkt[i]->sets[ret] = bkt->sets[i]; + return i; + } else + return ret; +} + +int +rte_member_add_ht(const struct rte_member_setsum *ss, + const void *key, member_set_t set_id) +{ + int ret; + unsigned int nr_pushes = 0; + uint32_t prim_bucket, sec_bucket; + member_sig_t tmp_sig; + struct member_ht_bucket *buckets = ss->table; + member_set_t flag_mask = 1U << (sizeof(member_set_t) * 8 - 1); + + if (set_id == RTE_MEMBER_NO_MATCH || (set_id & flag_mask) != 0) + return -EINVAL; + + get_buckets_index(ss, key, &prim_bucket, &sec_bucket, &tmp_sig); + + /* + * If it is cache based setsummary, we try overwriting (updating) + * existing entry with the same signature first. In cache mode, we allow + * false negatives and only cache the most recent keys. + * + * For non-cache mode, we do not update existing entry with the same + * signature. This is because if two keys with same signature update + * each other, false negative may happen, which is not the expected + * behavior for non-cache setsummary. + */ + if (ss->cache) { + ret = try_update(buckets, prim_bucket, sec_bucket, tmp_sig, + set_id, ss->sig_cmp_fn); + if (ret != -1) + return ret; + } + /* If not full then insert into one slot */ + ret = try_insert(buckets, prim_bucket, sec_bucket, tmp_sig, set_id); + if (ret != -1) + return ret; + + /* Random pick prim or sec for recursive displacement */ + uint32_t select_bucket = (tmp_sig && 1U) ? 
prim_bucket : sec_bucket; + if (ss->cache) { + ret = evict_from_bucket(); + buckets[select_bucket].sigs[ret] = tmp_sig; + buckets[select_bucket].sets[ret] = set_id; + return 1; + } + + ret = make_space_bucket(ss, select_bucket, &nr_pushes); + if (ret >= 0) { + buckets[select_bucket].sigs[ret] = tmp_sig; + buckets[select_bucket].sets[ret] = set_id; + ret = 1; + } + + return ret; +} + +void +rte_member_free_ht(struct rte_member_setsum *ss) +{ + rte_free(ss->table); +} + +int +rte_member_delete_ht(const struct rte_member_setsum *ss, const void *key, + member_set_t set_id) +{ + int i; + uint32_t prim_bucket, sec_bucket; + member_sig_t tmp_sig; + struct member_ht_bucket *buckets = ss->table; + + get_buckets_index(ss, key, &prim_bucket, &sec_bucket, &tmp_sig); + + for (i = 0; i < RTE_MEMBER_BUCKET_ENTRIES; i++) { + if (tmp_sig == buckets[prim_bucket].sigs[i] && + set_id == buckets[prim_bucket].sets[i]) { + buckets[prim_bucket].sets[i] = RTE_MEMBER_NO_MATCH; + return 0; + } + } + + for (i = 0; i < RTE_MEMBER_BUCKET_ENTRIES; i++) { + if (tmp_sig == buckets[sec_bucket].sigs[i] && + set_id == buckets[sec_bucket].sets[i]) { + buckets[sec_bucket].sets[i] = RTE_MEMBER_NO_MATCH; + return 0; + } + } + return -ENOENT; +} + +void +rte_member_reset_ht(const struct rte_member_setsum *ss) +{ + uint32_t i, j; + struct member_ht_bucket *buckets = ss->table; + + for (i = 0; i < ss->bucket_cnt; i++) { + for (j = 0; j < RTE_MEMBER_BUCKET_ENTRIES; j++) + buckets[i].sets[j] = RTE_MEMBER_NO_MATCH; + } +} diff --git a/src/spdk/dpdk/lib/librte_member/rte_member_ht.h b/src/spdk/dpdk/lib/librte_member/rte_member_ht.h new file mode 100644 index 00000000..9e24ccdc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/rte_member_ht.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_MEMBER_HT_H_ +#define _RTE_MEMBER_HT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Maximum number of pushes for cuckoo path in HT mode. 
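+ * This bounds the recursion depth of the displacement path
+ * (make_space_bucket() in rte_member_ht.c), so an insertion that cannot
+ * free a slot fails with -ENOSPC instead of being pushed around endlessly.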
*/ +#define RTE_MEMBER_MAX_PUSHES 50 + +typedef uint16_t member_sig_t; /* signature size is 16 bit */ + +/* The bucket struct for ht setsum */ +struct member_ht_bucket { + member_sig_t sigs[RTE_MEMBER_BUCKET_ENTRIES]; /* 2-byte signature */ + member_set_t sets[RTE_MEMBER_BUCKET_ENTRIES]; /* 2-byte set */ +} __rte_cache_aligned; + +int +rte_member_create_ht(struct rte_member_setsum *ss, + const struct rte_member_parameters *params); + +int +rte_member_lookup_ht(const struct rte_member_setsum *setsum, + const void *key, member_set_t *set_id); + +uint32_t +rte_member_lookup_bulk_ht(const struct rte_member_setsum *setsum, + const void **keys, uint32_t num_keys, + member_set_t *set_ids); + +uint32_t +rte_member_lookup_multi_ht(const struct rte_member_setsum *setsum, + const void *key, uint32_t match_per_key, + member_set_t *set_id); + +uint32_t +rte_member_lookup_multi_bulk_ht(const struct rte_member_setsum *setsum, + const void **keys, uint32_t num_keys, uint32_t match_per_key, + uint32_t *match_count, + member_set_t *set_ids); + +int +rte_member_add_ht(const struct rte_member_setsum *setsum, + const void *key, member_set_t set_id); + +void +rte_member_free_ht(struct rte_member_setsum *setsum); + +int +rte_member_delete_ht(const struct rte_member_setsum *ss, const void *key, + member_set_t set_id); + +void +rte_member_reset_ht(const struct rte_member_setsum *setsum); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMBER_HT_H_ */ diff --git a/src/spdk/dpdk/lib/librte_member/rte_member_vbf.c b/src/spdk/dpdk/lib/librte_member/rte_member_vbf.c new file mode 100644 index 00000000..8a232bae --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/rte_member_vbf.c @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include + +#include "rte_member.h" +#include "rte_member_vbf.h" + +/* + * vBF currently implemented as a big array. + * The BFs have a vertical layout. Bits in same location of all bfs will stay + * in the same cache line. + * For example, if we have 32 bloom filters, we use a uint32_t array to + * represent all of them. array[0] represent the first location of all the + * bloom filters, array[1] represents the second location of all the + * bloom filters, etc. The advantage of this layout is to minimize the average + * number of memory accesses to test all bloom filters. + * + * Currently the implementation supports vBF containing 1,2,4,8,16,32 BFs. + */ +int +rte_member_create_vbf(struct rte_member_setsum *ss, + const struct rte_member_parameters *params) +{ + + if (params->num_set > RTE_MEMBER_MAX_BF || + !rte_is_power_of_2(params->num_set) || + params->num_keys == 0 || + params->false_positive_rate == 0 || + params->false_positive_rate > 1) { + rte_errno = EINVAL; + RTE_MEMBER_LOG(ERR, "Membership vBF create with invalid parameters\n"); + return -EINVAL; + } + + /* We assume expected keys evenly distribute to all BFs */ + uint32_t num_keys_per_bf = 1 + (params->num_keys - 1) / ss->num_set; + + /* + * Note that the false positive rate is for all BFs in the vBF + * such that the single BF's false positive rate needs to be + * calculated. + * Assume each BF's False positive rate is fp_one_bf. The total false + * positive rate is fp = 1-(1-fp_one_bf)^n. 
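+	 * Solving for the per-BF rate (as an illustration, a target fp of 1%
+	 * spread over n = 32 filters gives fp_one_bf of roughly 3.1e-4):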
+ * => fp_one_bf = 1 - (1-fp)^(1/n) + */ + + float fp_one_bf = 1 - pow((1 - params->false_positive_rate), + 1.0 / ss->num_set); + + if (fp_one_bf == 0) { + rte_errno = EINVAL; + RTE_MEMBER_LOG(ERR, "Membership BF false positive rate is too small\n"); + return -EINVAL; + } + + uint32_t bits = ceil((num_keys_per_bf * + log(fp_one_bf)) / + log(1.0 / (pow(2.0, log(2.0))))); + + /* We round to power of 2 for performance during lookup */ + ss->bits = rte_align32pow2(bits); + + ss->num_hashes = (uint32_t)(log(2.0) * bits / num_keys_per_bf); + ss->bit_mask = ss->bits - 1; + + /* + * Since we round the bits to power of 2, the final false positive + * rate will probably not be same as the user specified. We log the + * new value as debug message. + */ + float new_fp = pow((1 - pow((1 - 1.0 / ss->bits), num_keys_per_bf * + ss->num_hashes)), ss->num_hashes); + new_fp = 1 - pow((1 - new_fp), ss->num_set); + + /* + * Reduce hash function count, until we approach the user specified + * false-positive rate. Otherwise it is too conservative + */ + int tmp_num_hash = ss->num_hashes; + + while (tmp_num_hash > 1) { + float tmp_fp = new_fp; + + tmp_num_hash--; + new_fp = pow((1 - pow((1 - 1.0 / ss->bits), num_keys_per_bf * + tmp_num_hash)), tmp_num_hash); + new_fp = 1 - pow((1 - new_fp), ss->num_set); + + if (new_fp > params->false_positive_rate) { + new_fp = tmp_fp; + tmp_num_hash++; + break; + } + } + + ss->num_hashes = tmp_num_hash; + + /* + * To avoid multiplication and division: + * mul_shift is used for multiplication shift during bit test + * div_shift is used for division shift, to be divided by number of bits + * represented by a uint32_t variable + */ + ss->mul_shift = __builtin_ctzl(ss->num_set); + ss->div_shift = __builtin_ctzl(32 >> ss->mul_shift); + + RTE_MEMBER_LOG(DEBUG, "vector bloom filter created, " + "each bloom filter expects %u keys, needs %u bits, %u hashes, " + "with false positive rate set as %.5f, " + "The new calculated vBF false positive rate is %.5f\n", + num_keys_per_bf, ss->bits, ss->num_hashes, fp_one_bf, new_fp); + + ss->table = rte_zmalloc_socket(NULL, ss->num_set * (ss->bits >> 3), + RTE_CACHE_LINE_SIZE, ss->socket_id); + if (ss->table == NULL) + return -ENOMEM; + + return 0; +} + +static inline uint32_t +test_bit(uint32_t bit_loc, const struct rte_member_setsum *ss) +{ + uint32_t *vbf = ss->table; + uint32_t n = ss->num_set; + uint32_t div_shift = ss->div_shift; + uint32_t mul_shift = ss->mul_shift; + /* + * a is how many bits in one BF are represented by one 32bit + * variable. 
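+	 * For example, with num_set = 8 BFs: mul_shift = 3, so a = 32 >> 3 = 4
+	 * bit locations share one uint32_t and div_shift = 2; bit_loc >> 2
+	 * then selects the word and (bit_loc & 3) << 3 selects the 8-bit lane
+	 * holding that location's bit for all 8 filters.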
+ */ + uint32_t a = 32 >> mul_shift; + /* + * x>>b is the divide, x & (a-1) is the mod, & (1<> div_shift] >> + ((bit_loc & (a - 1)) << mul_shift)) & ((1ULL << n) - 1); +} + +static inline void +set_bit(uint32_t bit_loc, const struct rte_member_setsum *ss, int32_t set) +{ + uint32_t *vbf = ss->table; + uint32_t div_shift = ss->div_shift; + uint32_t mul_shift = ss->mul_shift; + uint32_t a = 32 >> mul_shift; + + vbf[bit_loc >> div_shift] |= + 1UL << (((bit_loc & (a - 1)) << mul_shift) + set - 1); +} + +int +rte_member_lookup_vbf(const struct rte_member_setsum *ss, const void *key, + member_set_t *set_id) +{ + uint32_t j; + uint32_t h1 = MEMBER_HASH_FUNC(key, ss->key_len, ss->prim_hash_seed); + uint32_t h2 = MEMBER_HASH_FUNC(&h1, sizeof(uint32_t), + ss->sec_hash_seed); + uint32_t mask = ~0; + uint32_t bit_loc; + + for (j = 0; j < ss->num_hashes; j++) { + bit_loc = (h1 + j * h2) & ss->bit_mask; + mask &= test_bit(bit_loc, ss); + } + + if (mask) { + *set_id = __builtin_ctzl(mask) + 1; + return 1; + } + + *set_id = RTE_MEMBER_NO_MATCH; + return 0; +} + +uint32_t +rte_member_lookup_bulk_vbf(const struct rte_member_setsum *ss, + const void **keys, uint32_t num_keys, member_set_t *set_ids) +{ + uint32_t i, k; + uint32_t num_matches = 0; + uint32_t mask[RTE_MEMBER_LOOKUP_BULK_MAX]; + uint32_t h1[RTE_MEMBER_LOOKUP_BULK_MAX], h2[RTE_MEMBER_LOOKUP_BULK_MAX]; + uint32_t bit_loc; + + for (i = 0; i < num_keys; i++) + h1[i] = MEMBER_HASH_FUNC(keys[i], ss->key_len, + ss->prim_hash_seed); + for (i = 0; i < num_keys; i++) + h2[i] = MEMBER_HASH_FUNC(&h1[i], sizeof(uint32_t), + ss->sec_hash_seed); + for (i = 0; i < num_keys; i++) { + mask[i] = ~0; + for (k = 0; k < ss->num_hashes; k++) { + bit_loc = (h1[i] + k * h2[i]) & ss->bit_mask; + mask[i] &= test_bit(bit_loc, ss); + } + } + for (i = 0; i < num_keys; i++) { + if (mask[i]) { + set_ids[i] = __builtin_ctzl(mask[i]) + 1; + num_matches++; + } else + set_ids[i] = RTE_MEMBER_NO_MATCH; + } + return num_matches; +} + +uint32_t +rte_member_lookup_multi_vbf(const struct rte_member_setsum *ss, + const void *key, uint32_t match_per_key, + member_set_t *set_id) +{ + uint32_t num_matches = 0; + uint32_t j; + uint32_t h1 = MEMBER_HASH_FUNC(key, ss->key_len, ss->prim_hash_seed); + uint32_t h2 = MEMBER_HASH_FUNC(&h1, sizeof(uint32_t), + ss->sec_hash_seed); + uint32_t mask = ~0; + uint32_t bit_loc; + + for (j = 0; j < ss->num_hashes; j++) { + bit_loc = (h1 + j * h2) & ss->bit_mask; + mask &= test_bit(bit_loc, ss); + } + while (mask) { + uint32_t loc = __builtin_ctzl(mask); + set_id[num_matches] = loc + 1; + num_matches++; + if (num_matches >= match_per_key) + return num_matches; + mask &= ~(1UL << loc); + } + return num_matches; +} + +uint32_t +rte_member_lookup_multi_bulk_vbf(const struct rte_member_setsum *ss, + const void **keys, uint32_t num_keys, uint32_t match_per_key, + uint32_t *match_count, + member_set_t *set_ids) +{ + uint32_t i, k; + uint32_t num_matches = 0; + uint32_t match_cnt_t; + uint32_t mask[RTE_MEMBER_LOOKUP_BULK_MAX]; + uint32_t h1[RTE_MEMBER_LOOKUP_BULK_MAX], h2[RTE_MEMBER_LOOKUP_BULK_MAX]; + uint32_t bit_loc; + + for (i = 0; i < num_keys; i++) + h1[i] = MEMBER_HASH_FUNC(keys[i], ss->key_len, + ss->prim_hash_seed); + for (i = 0; i < num_keys; i++) + h2[i] = MEMBER_HASH_FUNC(&h1[i], sizeof(uint32_t), + ss->sec_hash_seed); + for (i = 0; i < num_keys; i++) { + mask[i] = ~0; + for (k = 0; k < ss->num_hashes; k++) { + bit_loc = (h1[i] + k * h2[i]) & ss->bit_mask; + mask[i] &= test_bit(bit_loc, ss); + } + } + for (i = 0; i < num_keys; i++) { + match_cnt_t = 0; 
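+		/*
+		 * mask[i] now has one bit set per BF (i.e. per set) that
+		 * contains keys[i]; peel matches off lowest set id first
+		 * with ctz until the mask is empty or match_per_key results
+		 * have been collected.
+		 */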
+ while (mask[i]) { + uint32_t loc = __builtin_ctzl(mask[i]); + set_ids[i * match_per_key + match_cnt_t] = loc + 1; + match_cnt_t++; + if (match_cnt_t >= match_per_key) + break; + mask[i] &= ~(1UL << loc); + } + match_count[i] = match_cnt_t; + if (match_cnt_t != 0) + num_matches++; + } + return num_matches; +} + +int +rte_member_add_vbf(const struct rte_member_setsum *ss, + const void *key, member_set_t set_id) +{ + uint32_t i, h1, h2; + uint32_t bit_loc; + + if (set_id > ss->num_set || set_id == RTE_MEMBER_NO_MATCH) + return -EINVAL; + + h1 = MEMBER_HASH_FUNC(key, ss->key_len, ss->prim_hash_seed); + h2 = MEMBER_HASH_FUNC(&h1, sizeof(uint32_t), ss->sec_hash_seed); + + for (i = 0; i < ss->num_hashes; i++) { + bit_loc = (h1 + i * h2) & ss->bit_mask; + set_bit(bit_loc, ss, set_id); + } + return 0; +} + +void +rte_member_free_vbf(struct rte_member_setsum *ss) +{ + rte_free(ss->table); +} + +void +rte_member_reset_vbf(const struct rte_member_setsum *ss) +{ + uint32_t *vbf = ss->table; + memset(vbf, 0, (ss->num_set * ss->bits) >> 3); +} diff --git a/src/spdk/dpdk/lib/librte_member/rte_member_vbf.h b/src/spdk/dpdk/lib/librte_member/rte_member_vbf.h new file mode 100644 index 00000000..d49525d5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/rte_member_vbf.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_MEMBER_VBF_H_ +#define _RTE_MEMBER_VBF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Currently we only support up to 32 sets in vBF */ +#define RTE_MEMBER_MAX_BF 32 + +int +rte_member_create_vbf(struct rte_member_setsum *ss, + const struct rte_member_parameters *params); + +int +rte_member_lookup_vbf(const struct rte_member_setsum *setsum, + const void *key, member_set_t *set_id); + +uint32_t +rte_member_lookup_bulk_vbf(const struct rte_member_setsum *setsum, + const void **keys, uint32_t num_keys, + member_set_t *set_ids); + +uint32_t +rte_member_lookup_multi_vbf(const struct rte_member_setsum *setsum, + const void *key, uint32_t match_per_key, + member_set_t *set_id); + +uint32_t +rte_member_lookup_multi_bulk_vbf(const struct rte_member_setsum *setsum, + const void **keys, uint32_t num_keys, uint32_t match_per_key, + uint32_t *match_count, + member_set_t *set_ids); + +int +rte_member_add_vbf(const struct rte_member_setsum *setsum, + const void *key, member_set_t set_id); + +void +rte_member_free_vbf(struct rte_member_setsum *ss); + +void +rte_member_reset_vbf(const struct rte_member_setsum *setsum); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMBER_VBF_H_ */ diff --git a/src/spdk/dpdk/lib/librte_member/rte_member_version.map b/src/spdk/dpdk/lib/librte_member/rte_member_version.map new file mode 100644 index 00000000..019e4cd9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/rte_member_version.map @@ -0,0 +1,16 @@ +DPDK_17.11 { + global: + + rte_member_add; + rte_member_create; + rte_member_delete; + rte_member_find_existing; + rte_member_free; + rte_member_lookup; + rte_member_lookup_bulk; + rte_member_lookup_multi; + rte_member_lookup_multi_bulk; + rte_member_reset; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_member/rte_member_x86.h b/src/spdk/dpdk/lib/librte_member/rte_member_x86.h new file mode 100644 index 00000000..21a498ef --- /dev/null +++ b/src/spdk/dpdk/lib/librte_member/rte_member_x86.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_MEMBER_X86_H_ +#define _RTE_MEMBER_X86_H_ + +#ifdef __cplusplus +extern "C" { 
+#endif + +#include + +#if defined(RTE_MACHINE_CPUFLAG_AVX2) + +static inline int +update_entry_search_avx(uint32_t bucket_id, member_sig_t tmp_sig, + struct member_ht_bucket *buckets, + member_set_t set_id) +{ + uint32_t hitmask = _mm256_movemask_epi8((__m256i)_mm256_cmpeq_epi16( + _mm256_load_si256((__m256i const *)buckets[bucket_id].sigs), + _mm256_set1_epi16(tmp_sig))); + if (hitmask) { + uint32_t hit_idx = __builtin_ctzl(hitmask) >> 1; + buckets[bucket_id].sets[hit_idx] = set_id; + return 1; + } + return 0; +} + +static inline int +search_bucket_single_avx(uint32_t bucket_id, member_sig_t tmp_sig, + struct member_ht_bucket *buckets, + member_set_t *set_id) +{ + uint32_t hitmask = _mm256_movemask_epi8((__m256i)_mm256_cmpeq_epi16( + _mm256_load_si256((__m256i const *)buckets[bucket_id].sigs), + _mm256_set1_epi16(tmp_sig))); + while (hitmask) { + uint32_t hit_idx = __builtin_ctzl(hitmask) >> 1; + if (buckets[bucket_id].sets[hit_idx] != RTE_MEMBER_NO_MATCH) { + *set_id = buckets[bucket_id].sets[hit_idx]; + return 1; + } + hitmask &= ~(3U << ((hit_idx) << 1)); + } + return 0; +} + +static inline void +search_bucket_multi_avx(uint32_t bucket_id, member_sig_t tmp_sig, + struct member_ht_bucket *buckets, + uint32_t *counter, + uint32_t match_per_key, + member_set_t *set_id) +{ + uint32_t hitmask = _mm256_movemask_epi8((__m256i)_mm256_cmpeq_epi16( + _mm256_load_si256((__m256i const *)buckets[bucket_id].sigs), + _mm256_set1_epi16(tmp_sig))); + while (hitmask) { + uint32_t hit_idx = __builtin_ctzl(hitmask) >> 1; + if (buckets[bucket_id].sets[hit_idx] != RTE_MEMBER_NO_MATCH) { + set_id[*counter] = buckets[bucket_id].sets[hit_idx]; + (*counter)++; + if (*counter >= match_per_key) + return; + } + hitmask &= ~(3U << ((hit_idx) << 1)); + } +} +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMBER_X86_H_ */ diff --git a/src/spdk/dpdk/lib/librte_mempool/Makefile b/src/spdk/dpdk/lib/librte_mempool/Makefile new file mode 100644 index 00000000..20bf63fb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mempool/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_mempool.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +CFLAGS += -DALLOW_EXPERIMENTAL_API +LDLIBS += -lrte_eal -lrte_ring + +EXPORT_MAP := rte_mempool_version.map + +LIBABIVER := 5 + +# memseg walk is not yet part of stable API +CFLAGS += -DALLOW_EXPERIMENTAL_API + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c +SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c +SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops_default.c +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_mempool/meson.build b/src/spdk/dpdk/lib/librte_mempool/meson.build new file mode 100644 index 00000000..38d7ae89 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mempool/meson.build @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +allow_experimental_apis = true + +extra_flags = [] + +foreach flag: extra_flags + if cc.has_argument(flag) + cflags += flag + endif +endforeach + +version = 5 +sources = files('rte_mempool.c', 'rte_mempool_ops.c', + 'rte_mempool_ops_default.c') +headers = files('rte_mempool.h') +deps += ['ring'] + +# memseg walk is not yet part of stable API +allow_experimental_apis = true diff --git 
a/src/spdk/dpdk/lib/librte_mempool/rte_mempool.c b/src/spdk/dpdk/lib/librte_mempool/rte_mempool.c new file mode 100644 index 00000000..03e6b5f7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mempool/rte_mempool.c @@ -0,0 +1,1290 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2016 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_mempool.h" + +TAILQ_HEAD(rte_mempool_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_mempool_tailq = { + .name = "RTE_MEMPOOL", +}; +EAL_REGISTER_TAILQ(rte_mempool_tailq) + +#define CACHE_FLUSHTHRESH_MULTIPLIER 1.5 +#define CALC_CACHE_FLUSHTHRESH(c) \ + ((typeof(c))((c) * CACHE_FLUSHTHRESH_MULTIPLIER)) + +/* + * return the greatest common divisor between a and b (fast algorithm) + * + */ +static unsigned get_gcd(unsigned a, unsigned b) +{ + unsigned c; + + if (0 == a) + return b; + if (0 == b) + return a; + + if (a < b) { + c = a; + a = b; + b = c; + } + + while (b != 0) { + c = a % b; + a = b; + b = c; + } + + return a; +} + +/* + * Depending on memory configuration, objects addresses are spread + * between channels and ranks in RAM: the pool allocator will add + * padding between objects. This function return the new size of the + * object. + */ +static unsigned optimize_object_size(unsigned obj_size) +{ + unsigned nrank, nchan; + unsigned new_obj_size; + + /* get number of channels */ + nchan = rte_memory_get_nchannel(); + if (nchan == 0) + nchan = 4; + + nrank = rte_memory_get_nrank(); + if (nrank == 0) + nrank = 1; + + /* process new object size */ + new_obj_size = (obj_size + RTE_MEMPOOL_ALIGN_MASK) / RTE_MEMPOOL_ALIGN; + while (get_gcd(new_obj_size, nrank * nchan) != 1) + new_obj_size++; + return new_obj_size * RTE_MEMPOOL_ALIGN; +} + +static int +find_min_pagesz(const struct rte_memseg_list *msl, void *arg) +{ + size_t *min = arg; + + if (msl->page_sz < *min) + *min = msl->page_sz; + + return 0; +} + +static size_t +get_min_page_size(void) +{ + size_t min_pagesz = SIZE_MAX; + + rte_memseg_list_walk(find_min_pagesz, &min_pagesz); + + return min_pagesz == SIZE_MAX ? 
(size_t) getpagesize() : min_pagesz; +} + + +static void +mempool_add_elem(struct rte_mempool *mp, __rte_unused void *opaque, + void *obj, rte_iova_t iova) +{ + struct rte_mempool_objhdr *hdr; + struct rte_mempool_objtlr *tlr __rte_unused; + + /* set mempool ptr in header */ + hdr = RTE_PTR_SUB(obj, sizeof(*hdr)); + hdr->mp = mp; + hdr->iova = iova; + STAILQ_INSERT_TAIL(&mp->elt_list, hdr, next); + mp->populated_size++; + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE2; + tlr = __mempool_get_trailer(obj); + tlr->cookie = RTE_MEMPOOL_TRAILER_COOKIE; +#endif +} + +/* call obj_cb() for each mempool element */ +uint32_t +rte_mempool_obj_iter(struct rte_mempool *mp, + rte_mempool_obj_cb_t *obj_cb, void *obj_cb_arg) +{ + struct rte_mempool_objhdr *hdr; + void *obj; + unsigned n = 0; + + STAILQ_FOREACH(hdr, &mp->elt_list, next) { + obj = (char *)hdr + sizeof(*hdr); + obj_cb(mp, obj_cb_arg, obj, n); + n++; + } + + return n; +} + +/* call mem_cb() for each mempool memory chunk */ +uint32_t +rte_mempool_mem_iter(struct rte_mempool *mp, + rte_mempool_mem_cb_t *mem_cb, void *mem_cb_arg) +{ + struct rte_mempool_memhdr *hdr; + unsigned n = 0; + + STAILQ_FOREACH(hdr, &mp->mem_list, next) { + mem_cb(mp, mem_cb_arg, hdr, n); + n++; + } + + return n; +} + +/* get the header, trailer and total size of a mempool element. */ +uint32_t +rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, + struct rte_mempool_objsz *sz) +{ + struct rte_mempool_objsz lsz; + + sz = (sz != NULL) ? sz : &lsz; + + sz->header_size = sizeof(struct rte_mempool_objhdr); + if ((flags & MEMPOOL_F_NO_CACHE_ALIGN) == 0) + sz->header_size = RTE_ALIGN_CEIL(sz->header_size, + RTE_MEMPOOL_ALIGN); + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + sz->trailer_size = sizeof(struct rte_mempool_objtlr); +#else + sz->trailer_size = 0; +#endif + + /* element size is 8 bytes-aligned at least */ + sz->elt_size = RTE_ALIGN_CEIL(elt_size, sizeof(uint64_t)); + + /* expand trailer to next cache line */ + if ((flags & MEMPOOL_F_NO_CACHE_ALIGN) == 0) { + sz->total_size = sz->header_size + sz->elt_size + + sz->trailer_size; + sz->trailer_size += ((RTE_MEMPOOL_ALIGN - + (sz->total_size & RTE_MEMPOOL_ALIGN_MASK)) & + RTE_MEMPOOL_ALIGN_MASK); + } + + /* + * increase trailer to add padding between objects in order to + * spread them across memory channels/ranks + */ + if ((flags & MEMPOOL_F_NO_SPREAD) == 0) { + unsigned new_size; + new_size = optimize_object_size(sz->header_size + sz->elt_size + + sz->trailer_size); + sz->trailer_size = new_size - sz->header_size - sz->elt_size; + } + + /* this is the size of an object, including header and trailer */ + sz->total_size = sz->header_size + sz->elt_size + sz->trailer_size; + + return sz->total_size; +} + +/* free a memchunk allocated with rte_memzone_reserve() */ +static void +rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr, + void *opaque) +{ + const struct rte_memzone *mz = opaque; + rte_memzone_free(mz); +} + +/* Free memory chunks used by a mempool. 
Objects must be in pool */ +static void +rte_mempool_free_memchunks(struct rte_mempool *mp) +{ + struct rte_mempool_memhdr *memhdr; + void *elt; + + while (!STAILQ_EMPTY(&mp->elt_list)) { + rte_mempool_ops_dequeue_bulk(mp, &elt, 1); + (void)elt; + STAILQ_REMOVE_HEAD(&mp->elt_list, next); + mp->populated_size--; + } + + while (!STAILQ_EMPTY(&mp->mem_list)) { + memhdr = STAILQ_FIRST(&mp->mem_list); + STAILQ_REMOVE_HEAD(&mp->mem_list, next); + if (memhdr->free_cb != NULL) + memhdr->free_cb(memhdr, memhdr->opaque); + rte_free(memhdr); + mp->nb_mem_chunks--; + } +} + +static int +mempool_ops_alloc_once(struct rte_mempool *mp) +{ + int ret; + + /* create the internal ring if not already done */ + if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) { + ret = rte_mempool_ops_alloc(mp); + if (ret != 0) + return ret; + mp->flags |= MEMPOOL_F_POOL_CREATED; + } + return 0; +} + +/* Add objects in the pool, using a physically contiguous memory + * zone. Return the number of objects added, or a negative value + * on error. + */ +int +rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, + rte_iova_t iova, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque) +{ + unsigned i = 0; + size_t off; + struct rte_mempool_memhdr *memhdr; + int ret; + + ret = mempool_ops_alloc_once(mp); + if (ret != 0) + return ret; + + /* mempool is already populated */ + if (mp->populated_size >= mp->size) + return -ENOSPC; + + memhdr = rte_zmalloc("MEMPOOL_MEMHDR", sizeof(*memhdr), 0); + if (memhdr == NULL) + return -ENOMEM; + + memhdr->mp = mp; + memhdr->addr = vaddr; + memhdr->iova = iova; + memhdr->len = len; + memhdr->free_cb = free_cb; + memhdr->opaque = opaque; + + if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN) + off = RTE_PTR_ALIGN_CEIL(vaddr, 8) - vaddr; + else + off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr; + + if (off > len) { + ret = -EINVAL; + goto fail; + } + + i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size, + (char *)vaddr + off, + (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off), + len - off, mempool_add_elem, NULL); + + /* not enough room to store one object */ + if (i == 0) { + ret = -EINVAL; + goto fail; + } + + STAILQ_INSERT_TAIL(&mp->mem_list, memhdr, next); + mp->nb_mem_chunks++; + return i; + +fail: + rte_free(memhdr); + return ret; +} + +/* Populate the mempool with a virtual area. Return the number of + * objects added, or a negative value on error. 
+ */ +int +rte_mempool_populate_virt(struct rte_mempool *mp, char *addr, + size_t len, size_t pg_sz, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque) +{ + rte_iova_t iova; + size_t off, phys_len; + int ret, cnt = 0; + + /* address and len must be page-aligned */ + if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr) + return -EINVAL; + if (RTE_ALIGN_CEIL(len, pg_sz) != len) + return -EINVAL; + + if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG) + return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA, + len, free_cb, opaque); + + for (off = 0; off + pg_sz <= len && + mp->populated_size < mp->size; off += phys_len) { + + iova = rte_mem_virt2iova(addr + off); + + if (iova == RTE_BAD_IOVA && rte_eal_has_hugepages()) { + ret = -EINVAL; + goto fail; + } + + /* populate with the largest group of contiguous pages */ + for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) { + rte_iova_t iova_tmp; + + iova_tmp = rte_mem_virt2iova(addr + off + phys_len); + + if (iova_tmp != iova + phys_len) + break; + } + + ret = rte_mempool_populate_iova(mp, addr + off, iova, + phys_len, free_cb, opaque); + if (ret < 0) + goto fail; + /* no need to call the free callback for next chunks */ + free_cb = NULL; + cnt += ret; + } + + return cnt; + + fail: + rte_mempool_free_memchunks(mp); + return ret; +} + +/* Default function to populate the mempool: allocate memory in memzones, + * and populate them. Return the number of objects added, or a negative + * value on error. + */ +int +rte_mempool_populate_default(struct rte_mempool *mp) +{ + unsigned int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY; + char mz_name[RTE_MEMZONE_NAMESIZE]; + const struct rte_memzone *mz; + ssize_t mem_size; + size_t align, pg_sz, pg_shift; + rte_iova_t iova; + unsigned mz_id, n; + int ret; + bool no_contig, try_contig, no_pageshift; + + ret = mempool_ops_alloc_once(mp); + if (ret != 0) + return ret; + + /* mempool must not be populated */ + if (mp->nb_mem_chunks != 0) + return -EEXIST; + + no_contig = mp->flags & MEMPOOL_F_NO_IOVA_CONTIG; + + /* + * the following section calculates page shift and page size values. + * + * these values impact the result of calc_mem_size operation, which + * returns the amount of memory that should be allocated to store the + * desired number of objects. when not zero, it allocates more memory + * for the padding between objects, to ensure that an object does not + * cross a page boundary. in other words, page size/shift are to be set + * to zero if mempool elements won't care about page boundaries. + * there are several considerations for page size and page shift here. + * + * if we don't need our mempools to have physically contiguous objects, + * then just set page shift and page size to 0, because the user has + * indicated that there's no need to care about anything. + * + * if we do need contiguous objects, there is also an option to reserve + * the entire mempool memory as one contiguous block of memory, in + * which case the page shift and alignment wouldn't matter as well. + * + * if we require contiguous objects, but not necessarily the entire + * mempool reserved space to be contiguous, then there are two options. + * + * if our IO addresses are virtual, not actual physical (IOVA as VA + * case), then no page shift needed - our memory allocation will give us + * contiguous IO memory as far as the hardware is concerned, so + * act as if we're getting contiguous memory. 
+ * + * if our IO addresses are physical, we may get memory from bigger + * pages, or we might get memory from smaller pages, and how much of it + * we require depends on whether we want bigger or smaller pages. + * However, requesting each and every memory size is too much work, so + * what we'll do instead is walk through the page sizes available, pick + * the smallest one and set up page shift to match that one. We will be + * wasting some space this way, but it's much nicer than looping around + * trying to reserve each and every page size. + * + * However, since size calculation will produce page-aligned sizes, it + * makes sense to first try and see if we can reserve the entire memzone + * in one contiguous chunk as well (otherwise we might end up wasting a + * 1G page on a 10MB memzone). If we fail to get enough contiguous + * memory, then we'll go and reserve space page-by-page. + */ + no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA; + try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages(); + + if (no_pageshift) { + pg_sz = 0; + pg_shift = 0; + } else if (try_contig) { + pg_sz = get_min_page_size(); + pg_shift = rte_bsf32(pg_sz); + } else { + pg_sz = getpagesize(); + pg_shift = rte_bsf32(pg_sz); + } + + for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) { + size_t min_chunk_size; + unsigned int flags; + + if (try_contig || no_pageshift) + mem_size = rte_mempool_ops_calc_mem_size(mp, n, + 0, &min_chunk_size, &align); + else + mem_size = rte_mempool_ops_calc_mem_size(mp, n, + pg_shift, &min_chunk_size, &align); + + if (mem_size < 0) { + ret = mem_size; + goto fail; + } + + ret = snprintf(mz_name, sizeof(mz_name), + RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id); + if (ret < 0 || ret >= (int)sizeof(mz_name)) { + ret = -ENAMETOOLONG; + goto fail; + } + + flags = mz_flags; + + /* if we're trying to reserve contiguous memory, add appropriate + * memzone flag. + */ + if (try_contig) + flags |= RTE_MEMZONE_IOVA_CONTIG; + + mz = rte_memzone_reserve_aligned(mz_name, mem_size, + mp->socket_id, flags, align); + + /* if we were trying to allocate contiguous memory, failed and + * minimum required contiguous chunk fits minimum page, adjust + * memzone size to the page size, and try again. + */ + if (mz == NULL && try_contig && min_chunk_size <= pg_sz) { + try_contig = false; + flags &= ~RTE_MEMZONE_IOVA_CONTIG; + + mem_size = rte_mempool_ops_calc_mem_size(mp, n, + pg_shift, &min_chunk_size, &align); + if (mem_size < 0) { + ret = mem_size; + goto fail; + } + + mz = rte_memzone_reserve_aligned(mz_name, mem_size, + mp->socket_id, flags, align); + } + /* don't try reserving with 0 size if we were asked to reserve + * IOVA-contiguous memory. 
+ */ + if (min_chunk_size < (size_t)mem_size && mz == NULL) { + /* not enough memory, retry with the biggest zone we + * have + */ + mz = rte_memzone_reserve_aligned(mz_name, 0, + mp->socket_id, flags, + RTE_MAX(pg_sz, align)); + } + if (mz == NULL) { + ret = -rte_errno; + goto fail; + } + + if (mz->len < min_chunk_size) { + rte_memzone_free(mz); + ret = -ENOMEM; + goto fail; + } + + if (no_contig) + iova = RTE_BAD_IOVA; + else + iova = mz->iova; + + if (no_pageshift || try_contig) + ret = rte_mempool_populate_iova(mp, mz->addr, + iova, mz->len, + rte_mempool_memchunk_mz_free, + (void *)(uintptr_t)mz); + else + ret = rte_mempool_populate_virt(mp, mz->addr, + RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz, + rte_mempool_memchunk_mz_free, + (void *)(uintptr_t)mz); + if (ret < 0) { + rte_memzone_free(mz); + goto fail; + } + } + + return mp->size; + + fail: + rte_mempool_free_memchunks(mp); + return ret; +} + +/* return the memory size required for mempool objects in anonymous mem */ +static ssize_t +get_anon_size(const struct rte_mempool *mp) +{ + ssize_t size; + size_t pg_sz, pg_shift; + size_t min_chunk_size; + size_t align; + + pg_sz = getpagesize(); + pg_shift = rte_bsf32(pg_sz); + size = rte_mempool_ops_calc_mem_size(mp, mp->size, pg_shift, + &min_chunk_size, &align); + + return size; +} + +/* unmap a memory zone mapped by rte_mempool_populate_anon() */ +static void +rte_mempool_memchunk_anon_free(struct rte_mempool_memhdr *memhdr, + void *opaque) +{ + ssize_t size; + + /* + * Calculate size since memhdr->len has contiguous chunk length + * which may be smaller if anon map is split into many contiguous + * chunks. Result must be the same as we calculated on populate. + */ + size = get_anon_size(memhdr->mp); + if (size < 0) + return; + + munmap(opaque, size); +} + +/* populate the mempool with an anonymous mapping */ +int +rte_mempool_populate_anon(struct rte_mempool *mp) +{ + ssize_t size; + int ret; + char *addr; + + /* mempool is already populated, error */ + if ((!STAILQ_EMPTY(&mp->mem_list)) || mp->nb_mem_chunks != 0) { + rte_errno = EINVAL; + return 0; + } + + ret = mempool_ops_alloc_once(mp); + if (ret != 0) + return ret; + + size = get_anon_size(mp); + if (size < 0) { + rte_errno = -size; + return 0; + } + + /* get chunk of virtually continuous memory */ + addr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + rte_errno = errno; + return 0; + } + /* can't use MMAP_LOCKED, it does not exist on BSD */ + if (mlock(addr, size) < 0) { + rte_errno = errno; + munmap(addr, size); + return 0; + } + + ret = rte_mempool_populate_virt(mp, addr, size, getpagesize(), + rte_mempool_memchunk_anon_free, addr); + if (ret == 0) + goto fail; + + return mp->populated_size; + + fail: + rte_mempool_free_memchunks(mp); + return 0; +} + +/* free a mempool */ +void +rte_mempool_free(struct rte_mempool *mp) +{ + struct rte_mempool_list *mempool_list = NULL; + struct rte_tailq_entry *te; + + if (mp == NULL) + return; + + mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list); + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + /* find out tailq entry */ + TAILQ_FOREACH(te, mempool_list, next) { + if (te->data == (void *)mp) + break; + } + + if (te != NULL) { + TAILQ_REMOVE(mempool_list, te, next); + rte_free(te); + } + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_mempool_free_memchunks(mp); + rte_mempool_ops_free(mp); + rte_memzone_free(mp->mz); +} + +static void +mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size) +{ 
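+	/*
+	 * Record the requested cache size, derive the flush threshold
+	 * (CALC_CACHE_FLUSHTHRESH, i.e. 1.5x the size) beyond which excess
+	 * objects are spilled back to the common pool, and start with an
+	 * empty cache.
+	 */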
+ cache->size = size; + cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size); + cache->len = 0; +} + +/* + * Create and initialize a cache for objects that are retrieved from and + * returned to an underlying mempool. This structure is identical to the + * local_cache[lcore_id] pointed to by the mempool structure. + */ +struct rte_mempool_cache * +rte_mempool_cache_create(uint32_t size, int socket_id) +{ + struct rte_mempool_cache *cache; + + if (size == 0 || size > RTE_MEMPOOL_CACHE_MAX_SIZE) { + rte_errno = EINVAL; + return NULL; + } + + cache = rte_zmalloc_socket("MEMPOOL_CACHE", sizeof(*cache), + RTE_CACHE_LINE_SIZE, socket_id); + if (cache == NULL) { + RTE_LOG(ERR, MEMPOOL, "Cannot allocate mempool cache.\n"); + rte_errno = ENOMEM; + return NULL; + } + + mempool_cache_init(cache, size); + + return cache; +} + +/* + * Free a cache. It's the responsibility of the user to make sure that any + * remaining objects in the cache are flushed to the corresponding + * mempool. + */ +void +rte_mempool_cache_free(struct rte_mempool_cache *cache) +{ + rte_free(cache); +} + +/* create an empty mempool */ +struct rte_mempool * +rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + int socket_id, unsigned flags) +{ + char mz_name[RTE_MEMZONE_NAMESIZE]; + struct rte_mempool_list *mempool_list; + struct rte_mempool *mp = NULL; + struct rte_tailq_entry *te = NULL; + const struct rte_memzone *mz = NULL; + size_t mempool_size; + unsigned int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY; + struct rte_mempool_objsz objsz; + unsigned lcore_id; + int ret; + + /* compilation-time checks */ + RTE_BUILD_BUG_ON((sizeof(struct rte_mempool) & + RTE_CACHE_LINE_MASK) != 0); + RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) & + RTE_CACHE_LINE_MASK) != 0); +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) & + RTE_CACHE_LINE_MASK) != 0); + RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) & + RTE_CACHE_LINE_MASK) != 0); +#endif + + mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list); + + /* asked for zero items */ + if (n == 0) { + rte_errno = EINVAL; + return NULL; + } + + /* asked cache too big */ + if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE || + CALC_CACHE_FLUSHTHRESH(cache_size) > n) { + rte_errno = EINVAL; + return NULL; + } + + /* "no cache align" imply "no spread" */ + if (flags & MEMPOOL_F_NO_CACHE_ALIGN) + flags |= MEMPOOL_F_NO_SPREAD; + + /* calculate mempool object sizes. 
*/ + if (!rte_mempool_calc_obj_size(elt_size, flags, &objsz)) { + rte_errno = EINVAL; + return NULL; + } + + rte_rwlock_write_lock(RTE_EAL_MEMPOOL_RWLOCK); + + /* + * reserve a memory zone for this mempool: private data is + * cache-aligned + */ + private_data_size = (private_data_size + + RTE_MEMPOOL_ALIGN_MASK) & (~RTE_MEMPOOL_ALIGN_MASK); + + + /* try to allocate tailq entry */ + te = rte_zmalloc("MEMPOOL_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, MEMPOOL, "Cannot allocate tailq entry!\n"); + goto exit_unlock; + } + + mempool_size = MEMPOOL_HEADER_SIZE(mp, cache_size); + mempool_size += private_data_size; + mempool_size = RTE_ALIGN_CEIL(mempool_size, RTE_MEMPOOL_ALIGN); + + ret = snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_MZ_FORMAT, name); + if (ret < 0 || ret >= (int)sizeof(mz_name)) { + rte_errno = ENAMETOOLONG; + goto exit_unlock; + } + + mz = rte_memzone_reserve(mz_name, mempool_size, socket_id, mz_flags); + if (mz == NULL) + goto exit_unlock; + + /* init the mempool structure */ + mp = mz->addr; + memset(mp, 0, MEMPOOL_HEADER_SIZE(mp, cache_size)); + ret = snprintf(mp->name, sizeof(mp->name), "%s", name); + if (ret < 0 || ret >= (int)sizeof(mp->name)) { + rte_errno = ENAMETOOLONG; + goto exit_unlock; + } + mp->mz = mz; + mp->size = n; + mp->flags = flags; + mp->socket_id = socket_id; + mp->elt_size = objsz.elt_size; + mp->header_size = objsz.header_size; + mp->trailer_size = objsz.trailer_size; + /* Size of default caches, zero means disabled. */ + mp->cache_size = cache_size; + mp->private_data_size = private_data_size; + STAILQ_INIT(&mp->elt_list); + STAILQ_INIT(&mp->mem_list); + + /* + * local_cache pointer is set even if cache_size is zero. + * The local_cache points to just past the elt_pa[] array. + */ + mp->local_cache = (struct rte_mempool_cache *) + RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0)); + + /* Init all default caches. */ + if (cache_size != 0) { + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) + mempool_cache_init(&mp->local_cache[lcore_id], + cache_size); + } + + te->data = mp; + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_INSERT_TAIL(mempool_list, te, next); + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + rte_rwlock_write_unlock(RTE_EAL_MEMPOOL_RWLOCK); + + return mp; + +exit_unlock: + rte_rwlock_write_unlock(RTE_EAL_MEMPOOL_RWLOCK); + rte_free(te); + rte_mempool_free(mp); + return NULL; +} + +/* create the mempool */ +struct rte_mempool * +rte_mempool_create(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + rte_mempool_ctor_t *mp_init, void *mp_init_arg, + rte_mempool_obj_cb_t *obj_init, void *obj_init_arg, + int socket_id, unsigned flags) +{ + int ret; + struct rte_mempool *mp; + + mp = rte_mempool_create_empty(name, n, elt_size, cache_size, + private_data_size, socket_id, flags); + if (mp == NULL) + return NULL; + + /* + * Since we have 4 combinations of the SP/SC/MP/MC examine the flags to + * set the correct index into the table of ops structs. 
+ */ + if ((flags & MEMPOOL_F_SP_PUT) && (flags & MEMPOOL_F_SC_GET)) + ret = rte_mempool_set_ops_byname(mp, "ring_sp_sc", NULL); + else if (flags & MEMPOOL_F_SP_PUT) + ret = rte_mempool_set_ops_byname(mp, "ring_sp_mc", NULL); + else if (flags & MEMPOOL_F_SC_GET) + ret = rte_mempool_set_ops_byname(mp, "ring_mp_sc", NULL); + else + ret = rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL); + + if (ret) + goto fail; + + /* call the mempool priv initializer */ + if (mp_init) + mp_init(mp, mp_init_arg); + + if (rte_mempool_populate_default(mp) < 0) + goto fail; + + /* call the object initializers */ + if (obj_init) + rte_mempool_obj_iter(mp, obj_init, obj_init_arg); + + return mp; + + fail: + rte_mempool_free(mp); + return NULL; +} + +/* Return the number of entries in the mempool */ +unsigned int +rte_mempool_avail_count(const struct rte_mempool *mp) +{ + unsigned count; + unsigned lcore_id; + + count = rte_mempool_ops_get_count(mp); + + if (mp->cache_size == 0) + return count; + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) + count += mp->local_cache[lcore_id].len; + + /* + * due to race condition (access to len is not locked), the + * total can be greater than size... so fix the result + */ + if (count > mp->size) + return mp->size; + return count; +} + +/* return the number of entries allocated from the mempool */ +unsigned int +rte_mempool_in_use_count(const struct rte_mempool *mp) +{ + return mp->size - rte_mempool_avail_count(mp); +} + +/* dump the cache status */ +static unsigned +rte_mempool_dump_cache(FILE *f, const struct rte_mempool *mp) +{ + unsigned lcore_id; + unsigned count = 0; + unsigned cache_count; + + fprintf(f, " internal cache infos:\n"); + fprintf(f, " cache_size=%"PRIu32"\n", mp->cache_size); + + if (mp->cache_size == 0) + return count; + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + cache_count = mp->local_cache[lcore_id].len; + fprintf(f, " cache_count[%u]=%"PRIu32"\n", + lcore_id, cache_count); + count += cache_count; + } + fprintf(f, " total_cache_count=%u\n", count); + return count; +} + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + +/* check and update cookies or panic (internal) */ +void rte_mempool_check_cookies(const struct rte_mempool *mp, + void * const *obj_table_const, unsigned n, int free) +{ +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + struct rte_mempool_objhdr *hdr; + struct rte_mempool_objtlr *tlr; + uint64_t cookie; + void *tmp; + void *obj; + void **obj_table; + + /* Force to drop the "const" attribute. 
This is done only when + * DEBUG is enabled */ + tmp = (void *) obj_table_const; + obj_table = tmp; + + while (n--) { + obj = obj_table[n]; + + if (rte_mempool_from_obj(obj) != mp) + rte_panic("MEMPOOL: object is owned by another " + "mempool\n"); + + hdr = __mempool_get_header(obj); + cookie = hdr->cookie; + + if (free == 0) { + if (cookie != RTE_MEMPOOL_HEADER_COOKIE1) { + RTE_LOG(CRIT, MEMPOOL, + "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", + obj, (const void *) mp, cookie); + rte_panic("MEMPOOL: bad header cookie (put)\n"); + } + hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE2; + } else if (free == 1) { + if (cookie != RTE_MEMPOOL_HEADER_COOKIE2) { + RTE_LOG(CRIT, MEMPOOL, + "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", + obj, (const void *) mp, cookie); + rte_panic("MEMPOOL: bad header cookie (get)\n"); + } + hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE1; + } else if (free == 2) { + if (cookie != RTE_MEMPOOL_HEADER_COOKIE1 && + cookie != RTE_MEMPOOL_HEADER_COOKIE2) { + RTE_LOG(CRIT, MEMPOOL, + "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", + obj, (const void *) mp, cookie); + rte_panic("MEMPOOL: bad header cookie (audit)\n"); + } + } + tlr = __mempool_get_trailer(obj); + cookie = tlr->cookie; + if (cookie != RTE_MEMPOOL_TRAILER_COOKIE) { + RTE_LOG(CRIT, MEMPOOL, + "obj=%p, mempool=%p, cookie=%" PRIx64 "\n", + obj, (const void *) mp, cookie); + rte_panic("MEMPOOL: bad trailer cookie\n"); + } + } +#else + RTE_SET_USED(mp); + RTE_SET_USED(obj_table_const); + RTE_SET_USED(n); + RTE_SET_USED(free); +#endif +} + +void +rte_mempool_contig_blocks_check_cookies(const struct rte_mempool *mp, + void * const *first_obj_table_const, unsigned int n, int free) +{ +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + struct rte_mempool_info info; + const size_t total_elt_sz = + mp->header_size + mp->elt_size + mp->trailer_size; + unsigned int i, j; + + rte_mempool_ops_get_info(mp, &info); + + for (i = 0; i < n; ++i) { + void *first_obj = first_obj_table_const[i]; + + for (j = 0; j < info.contig_block_size; ++j) { + void *obj; + + obj = (void *)((uintptr_t)first_obj + j * total_elt_sz); + rte_mempool_check_cookies(mp, &obj, 1, free); + } + } +#else + RTE_SET_USED(mp); + RTE_SET_USED(first_obj_table_const); + RTE_SET_USED(n); + RTE_SET_USED(free); +#endif +} + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG +static void +mempool_obj_audit(struct rte_mempool *mp, __rte_unused void *opaque, + void *obj, __rte_unused unsigned idx) +{ + __mempool_check_cookies(mp, &obj, 1, 2); +} + +static void +mempool_audit_cookies(struct rte_mempool *mp) +{ + unsigned num; + + num = rte_mempool_obj_iter(mp, mempool_obj_audit, NULL); + if (num != mp->size) { + rte_panic("rte_mempool_obj_iter(mempool=%p, size=%u) " + "iterated only over %u elements\n", + mp, mp->size, num); + } +} +#else +#define mempool_audit_cookies(mp) do {} while(0) +#endif + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic error "-Wcast-qual" +#endif + +/* check cookies before and after objects */ +static void +mempool_audit_cache(const struct rte_mempool *mp) +{ + /* check cache size consistency */ + unsigned lcore_id; + + if (mp->cache_size == 0) + return; + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + const struct rte_mempool_cache *cache; + cache = &mp->local_cache[lcore_id]; + if (cache->len > cache->flushthresh) { + RTE_LOG(CRIT, MEMPOOL, "badness on cache[%u]\n", + lcore_id); + rte_panic("MEMPOOL: invalid cache len\n"); + } + } +} + +/* check the consistency of mempool (size, cookies, ...) 
*/ +void +rte_mempool_audit(struct rte_mempool *mp) +{ + mempool_audit_cache(mp); + mempool_audit_cookies(mp); + + /* For case where mempool DEBUG is not set, and cache size is 0 */ + RTE_SET_USED(mp); +} + +/* dump the status of the mempool on the console */ +void +rte_mempool_dump(FILE *f, struct rte_mempool *mp) +{ +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + struct rte_mempool_info info; + struct rte_mempool_debug_stats sum; + unsigned lcore_id; +#endif + struct rte_mempool_memhdr *memhdr; + unsigned common_count; + unsigned cache_count; + size_t mem_len = 0; + + RTE_ASSERT(f != NULL); + RTE_ASSERT(mp != NULL); + + fprintf(f, "mempool <%s>@%p\n", mp->name, mp); + fprintf(f, " flags=%x\n", mp->flags); + fprintf(f, " pool=%p\n", mp->pool_data); + fprintf(f, " iova=0x%" PRIx64 "\n", mp->mz->iova); + fprintf(f, " nb_mem_chunks=%u\n", mp->nb_mem_chunks); + fprintf(f, " size=%"PRIu32"\n", mp->size); + fprintf(f, " populated_size=%"PRIu32"\n", mp->populated_size); + fprintf(f, " header_size=%"PRIu32"\n", mp->header_size); + fprintf(f, " elt_size=%"PRIu32"\n", mp->elt_size); + fprintf(f, " trailer_size=%"PRIu32"\n", mp->trailer_size); + fprintf(f, " total_obj_size=%"PRIu32"\n", + mp->header_size + mp->elt_size + mp->trailer_size); + + fprintf(f, " private_data_size=%"PRIu32"\n", mp->private_data_size); + + STAILQ_FOREACH(memhdr, &mp->mem_list, next) + mem_len += memhdr->len; + if (mem_len != 0) { + fprintf(f, " avg bytes/object=%#Lf\n", + (long double)mem_len / mp->size); + } + + cache_count = rte_mempool_dump_cache(f, mp); + common_count = rte_mempool_ops_get_count(mp); + if ((cache_count + common_count) > mp->size) + common_count = mp->size - cache_count; + fprintf(f, " common_pool_count=%u\n", common_count); + + /* sum and dump statistics */ +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + rte_mempool_ops_get_info(mp, &info); + memset(&sum, 0, sizeof(sum)); + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + sum.put_bulk += mp->stats[lcore_id].put_bulk; + sum.put_objs += mp->stats[lcore_id].put_objs; + sum.get_success_bulk += mp->stats[lcore_id].get_success_bulk; + sum.get_success_objs += mp->stats[lcore_id].get_success_objs; + sum.get_fail_bulk += mp->stats[lcore_id].get_fail_bulk; + sum.get_fail_objs += mp->stats[lcore_id].get_fail_objs; + sum.get_success_blks += mp->stats[lcore_id].get_success_blks; + sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks; + } + fprintf(f, " stats:\n"); + fprintf(f, " put_bulk=%"PRIu64"\n", sum.put_bulk); + fprintf(f, " put_objs=%"PRIu64"\n", sum.put_objs); + fprintf(f, " get_success_bulk=%"PRIu64"\n", sum.get_success_bulk); + fprintf(f, " get_success_objs=%"PRIu64"\n", sum.get_success_objs); + fprintf(f, " get_fail_bulk=%"PRIu64"\n", sum.get_fail_bulk); + fprintf(f, " get_fail_objs=%"PRIu64"\n", sum.get_fail_objs); + if (info.contig_block_size > 0) { + fprintf(f, " get_success_blks=%"PRIu64"\n", + sum.get_success_blks); + fprintf(f, " get_fail_blks=%"PRIu64"\n", sum.get_fail_blks); + } +#else + fprintf(f, " no statistics available\n"); +#endif + + rte_mempool_audit(mp); +} + +/* dump the status of all mempools on the console */ +void +rte_mempool_list_dump(FILE *f) +{ + struct rte_mempool *mp = NULL; + struct rte_tailq_entry *te; + struct rte_mempool_list *mempool_list; + + mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list); + + rte_rwlock_read_lock(RTE_EAL_MEMPOOL_RWLOCK); + + TAILQ_FOREACH(te, mempool_list, next) { + mp = (struct rte_mempool *) te->data; + rte_mempool_dump(f, mp); + } + + rte_rwlock_read_unlock(RTE_EAL_MEMPOOL_RWLOCK); +} + 
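+/*
+ * Minimal usage sketch for the create/lookup API implemented in this file.
+ * It is illustrative only: the element type my_elt, the pool name and the
+ * sizing numbers are hypothetical, error handling is reduced to rte_panic(),
+ * and it assumes an initialized EAL with rte_mempool.h, rte_lcore.h,
+ * rte_errno.h and rte_debug.h included.
+ *
+ *	struct rte_mempool *mp;
+ *	void *obj;
+ *
+ *	mp = rte_mempool_create("my_pool", 8191, sizeof(struct my_elt),
+ *		256, 0, NULL, NULL, NULL, NULL, rte_socket_id(), 0);
+ *	if (mp == NULL)
+ *		rte_panic("cannot create mempool: %s\n",
+ *			rte_strerror(rte_errno));
+ *
+ *	if (rte_mempool_get(mp, &obj) == 0)
+ *		rte_mempool_put(mp, obj);
+ *
+ *	mp = rte_mempool_lookup("my_pool");
+ *
+ * rte_mempool_lookup() (below) resolves a pool by name through the shared
+ * mempool tailq, so a secondary process can attach to a pool created by
+ * the primary process.
+ */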
+/* search a mempool from its name */ +struct rte_mempool * +rte_mempool_lookup(const char *name) +{ + struct rte_mempool *mp = NULL; + struct rte_tailq_entry *te; + struct rte_mempool_list *mempool_list; + + mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list); + + rte_rwlock_read_lock(RTE_EAL_MEMPOOL_RWLOCK); + + TAILQ_FOREACH(te, mempool_list, next) { + mp = (struct rte_mempool *) te->data; + if (strncmp(name, mp->name, RTE_MEMPOOL_NAMESIZE) == 0) + break; + } + + rte_rwlock_read_unlock(RTE_EAL_MEMPOOL_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return mp; +} + +void rte_mempool_walk(void (*func)(struct rte_mempool *, void *), + void *arg) +{ + struct rte_tailq_entry *te = NULL; + struct rte_mempool_list *mempool_list; + void *tmp_te; + + mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list); + + rte_rwlock_read_lock(RTE_EAL_MEMPOOL_RWLOCK); + + TAILQ_FOREACH_SAFE(te, mempool_list, next, tmp_te) { + (*func)((struct rte_mempool *) te->data, arg); + } + + rte_rwlock_read_unlock(RTE_EAL_MEMPOOL_RWLOCK); +} diff --git a/src/spdk/dpdk/lib/librte_mempool/rte_mempool.h b/src/spdk/dpdk/lib/librte_mempool/rte_mempool.h new file mode 100644 index 00000000..7c9cd9a2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mempool/rte_mempool.h @@ -0,0 +1,1700 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2016 6WIND S.A. + */ + +#ifndef _RTE_MEMPOOL_H_ +#define _RTE_MEMPOOL_H_ + +/** + * @file + * RTE Mempool. + * + * A memory pool is an allocator of fixed-size object. It is + * identified by its name, and uses a ring to store free objects. It + * provides some other optional services, like a per-core object + * cache, and an alignment helper to ensure that objects are padded + * to spread them equally on all RAM channels, ranks, and so on. + * + * Objects owned by a mempool should never be added in another + * mempool. When an object is freed using rte_mempool_put() or + * equivalent, the object data is not modified; the user can save some + * meta-data in the object data and retrieve them when allocating a + * new object. + * + * Note: the mempool implementation is not preemptible. An lcore must not be + * interrupted by another task that uses the same mempool (because it uses a + * ring which is not preemptible). Also, usual mempool functions like + * rte_mempool_get() or rte_mempool_put() are designed to be called from an EAL + * thread due to the internal per-lcore cache. Due to the lack of caching, + * rte_mempool_get() or rte_mempool_put() performance will suffer when called + * by non-EAL threads. Instead, non-EAL threads should call + * rte_mempool_generic_get() or rte_mempool_generic_put() with a user cache + * created with rte_mempool_cache_create(). + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_MEMPOOL_HEADER_COOKIE1 0xbadbadbadadd2e55ULL /**< Header cookie. */ +#define RTE_MEMPOOL_HEADER_COOKIE2 0xf2eef2eedadd2e55ULL /**< Header cookie. */ +#define RTE_MEMPOOL_TRAILER_COOKIE 0xadd2e55badbadbadULL /**< Trailer cookie.*/ + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG +/** + * A structure that stores the mempool statistics (per-lcore). + */ +struct rte_mempool_debug_stats { + uint64_t put_bulk; /**< Number of puts. */ + uint64_t put_objs; /**< Number of objects successfully put. 
*/ + uint64_t get_success_bulk; /**< Successful allocation number. */ + uint64_t get_success_objs; /**< Objects successfully allocated. */ + uint64_t get_fail_bulk; /**< Failed allocation number. */ + uint64_t get_fail_objs; /**< Objects that failed to be allocated. */ + /** Successful allocation number of contiguous blocks. */ + uint64_t get_success_blks; + /** Failed allocation number of contiguous blocks. */ + uint64_t get_fail_blks; +} __rte_cache_aligned; +#endif + +/** + * A structure that stores a per-core object cache. + */ +struct rte_mempool_cache { + uint32_t size; /**< Size of the cache */ + uint32_t flushthresh; /**< Threshold before we flush excess elements */ + uint32_t len; /**< Current cache count */ + /* + * Cache is allocated to this size to allow it to overflow in certain + * cases to avoid needless emptying of cache. + */ + void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */ +} __rte_cache_aligned; + +/** + * A structure that stores the size of mempool elements. + */ +struct rte_mempool_objsz { + uint32_t elt_size; /**< Size of an element. */ + uint32_t header_size; /**< Size of header (before elt). */ + uint32_t trailer_size; /**< Size of trailer (after elt). */ + uint32_t total_size; + /**< Total size of an object (header + elt + trailer). */ +}; + +/**< Maximum length of a memory pool's name. */ +#define RTE_MEMPOOL_NAMESIZE (RTE_RING_NAMESIZE - \ + sizeof(RTE_MEMPOOL_MZ_PREFIX) + 1) +#define RTE_MEMPOOL_MZ_PREFIX "MP_" + +/* "MP_" */ +#define RTE_MEMPOOL_MZ_FORMAT RTE_MEMPOOL_MZ_PREFIX "%s" + +#define MEMPOOL_PG_SHIFT_MAX (sizeof(uintptr_t) * CHAR_BIT - 1) + +/** Mempool over one chunk of physically continuous memory */ +#define MEMPOOL_PG_NUM_DEFAULT 1 + +#ifndef RTE_MEMPOOL_ALIGN +#define RTE_MEMPOOL_ALIGN RTE_CACHE_LINE_SIZE +#endif + +#define RTE_MEMPOOL_ALIGN_MASK (RTE_MEMPOOL_ALIGN - 1) + +/** + * Mempool object header structure + * + * Each object stored in mempools are prefixed by this header structure, + * it allows to retrieve the mempool pointer from the object and to + * iterate on all objects attached to a mempool. When debug is enabled, + * a cookie is also added in this structure preventing corruptions and + * double-frees. + */ +struct rte_mempool_objhdr { + STAILQ_ENTRY(rte_mempool_objhdr) next; /**< Next in list. */ + struct rte_mempool *mp; /**< The mempool owning the object. */ + RTE_STD_C11 + union { + rte_iova_t iova; /**< IO address of the object. */ + phys_addr_t physaddr; /**< deprecated - Physical address of the object. */ + }; +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + uint64_t cookie; /**< Debug cookie. */ +#endif +}; + +/** + * A list of object headers type + */ +STAILQ_HEAD(rte_mempool_objhdr_list, rte_mempool_objhdr); + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + +/** + * Mempool object trailer structure + * + * In debug mode, each object stored in mempools are suffixed by this + * trailer structure containing a cookie preventing memory corruptions. + */ +struct rte_mempool_objtlr { + uint64_t cookie; /**< Debug cookie. */ +}; + +#endif + +/** + * A list of memory where objects are stored + */ +STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr); + +/** + * Callback used to free a memory chunk + */ +typedef void (rte_mempool_memchunk_free_cb_t)(struct rte_mempool_memhdr *memhdr, + void *opaque); + +/** + * Mempool objects memory header structure + * + * The memory chunks where objects are stored. Each chunk is virtually + * and physically contiguous. 
+ */ +struct rte_mempool_memhdr { + STAILQ_ENTRY(rte_mempool_memhdr) next; /**< Next in list. */ + struct rte_mempool *mp; /**< The mempool owning the chunk */ + void *addr; /**< Virtual address of the chunk */ + RTE_STD_C11 + union { + rte_iova_t iova; /**< IO address of the chunk */ + phys_addr_t phys_addr; /**< Physical address of the chunk */ + }; + size_t len; /**< length of the chunk */ + rte_mempool_memchunk_free_cb_t *free_cb; /**< Free callback */ + void *opaque; /**< Argument passed to the free callback */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Additional information about the mempool + * + * The structure is cache-line aligned to avoid ABI breakages in + * a number of cases when something small is added. + */ +struct rte_mempool_info { + /** Number of objects in the contiguous block */ + unsigned int contig_block_size; +} __rte_cache_aligned; + +/** + * The RTE mempool structure. + */ +struct rte_mempool { + /* + * Note: this field kept the RTE_MEMZONE_NAMESIZE size due to ABI + * compatibility requirements, it could be changed to + * RTE_MEMPOOL_NAMESIZE next time the ABI changes + */ + char name[RTE_MEMZONE_NAMESIZE]; /**< Name of mempool. */ + RTE_STD_C11 + union { + void *pool_data; /**< Ring or pool to store objects. */ + uint64_t pool_id; /**< External mempool identifier. */ + }; + void *pool_config; /**< optional args for ops alloc. */ + const struct rte_memzone *mz; /**< Memzone where pool is alloc'd. */ + unsigned int flags; /**< Flags of the mempool. */ + int socket_id; /**< Socket id passed at create. */ + uint32_t size; /**< Max size of the mempool. */ + uint32_t cache_size; + /**< Size of per-lcore default local cache. */ + + uint32_t elt_size; /**< Size of an element. */ + uint32_t header_size; /**< Size of header (before elt). */ + uint32_t trailer_size; /**< Size of trailer (after elt). */ + + unsigned private_data_size; /**< Size of private data. */ + /** + * Index into rte_mempool_ops_table array of mempool ops + * structs, which contain callback function pointers. + * We're using an index here rather than pointers to the callbacks + * to facilitate any secondary processes that may want to use + * this mempool. + */ + int32_t ops_index; + + struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */ + + uint32_t populated_size; /**< Number of populated objects. */ + struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */ + uint32_t nb_mem_chunks; /**< Number of memory chunks */ + struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */ + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + /** Per-lcore statistics. */ + struct rte_mempool_debug_stats stats[RTE_MAX_LCORE]; +#endif +} __rte_cache_aligned; + +#define MEMPOOL_F_NO_SPREAD 0x0001 /**< Do not spread among memory channels. */ +#define MEMPOOL_F_NO_CACHE_ALIGN 0x0002 /**< Do not align objs on cache lines.*/ +#define MEMPOOL_F_SP_PUT 0x0004 /**< Default put is "single-producer".*/ +#define MEMPOOL_F_SC_GET 0x0008 /**< Default get is "single-consumer".*/ +#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */ +#define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */ +#define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */ + +/** + * @internal When debug is enabled, store some statistics. + * + * @param mp + * Pointer to the memory pool. + * @param name + * Name of the statistics field to increment in the memory pool. 
+ * @param n + * Number to add to the object-oriented statistics. + */ +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG +#define __MEMPOOL_STAT_ADD(mp, name, n) do { \ + unsigned __lcore_id = rte_lcore_id(); \ + if (__lcore_id < RTE_MAX_LCORE) { \ + mp->stats[__lcore_id].name##_objs += n; \ + mp->stats[__lcore_id].name##_bulk += 1; \ + } \ + } while(0) +#define __MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, name, n) do { \ + unsigned int __lcore_id = rte_lcore_id(); \ + if (__lcore_id < RTE_MAX_LCORE) { \ + mp->stats[__lcore_id].name##_blks += n; \ + mp->stats[__lcore_id].name##_bulk += 1; \ + } \ + } while (0) +#else +#define __MEMPOOL_STAT_ADD(mp, name, n) do {} while(0) +#define __MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, name, n) do {} while (0) +#endif + +/** + * Calculate the size of the mempool header. + * + * @param mp + * Pointer to the memory pool. + * @param cs + * Size of the per-lcore cache. + */ +#define MEMPOOL_HEADER_SIZE(mp, cs) \ + (sizeof(*(mp)) + (((cs) == 0) ? 0 : \ + (sizeof(struct rte_mempool_cache) * RTE_MAX_LCORE))) + +/* return the header of a mempool object (internal) */ +static inline struct rte_mempool_objhdr *__mempool_get_header(void *obj) +{ + return (struct rte_mempool_objhdr *)RTE_PTR_SUB(obj, + sizeof(struct rte_mempool_objhdr)); +} + +/** + * Return a pointer to the mempool owning this object. + * + * @param obj + * An object that is owned by a pool. If this is not the case, + * the behavior is undefined. + * @return + * A pointer to the mempool structure. + */ +static inline struct rte_mempool *rte_mempool_from_obj(void *obj) +{ + struct rte_mempool_objhdr *hdr = __mempool_get_header(obj); + return hdr->mp; +} + +/* return the trailer of a mempool object (internal) */ +static inline struct rte_mempool_objtlr *__mempool_get_trailer(void *obj) +{ + struct rte_mempool *mp = rte_mempool_from_obj(obj); + return (struct rte_mempool_objtlr *)RTE_PTR_ADD(obj, mp->elt_size); +} + +/** + * @internal Check and update cookies or panic. + * + * @param mp + * Pointer to the memory pool. + * @param obj_table_const + * Pointer to a table of void * pointers (objects). + * @param n + * Index of object in object table. + * @param free + * - 0: object is supposed to be allocated, mark it as free + * - 1: object is supposed to be free, mark it as allocated + * - 2: just check that cookie is valid (free or allocated) + */ +void rte_mempool_check_cookies(const struct rte_mempool *mp, + void * const *obj_table_const, unsigned n, int free); + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG +#define __mempool_check_cookies(mp, obj_table_const, n, free) \ + rte_mempool_check_cookies(mp, obj_table_const, n, free) +#else +#define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0) +#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */ + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * @internal Check contiguous object blocks and update cookies or panic. + * + * @param mp + * Pointer to the memory pool. + * @param first_obj_table_const + * Pointer to a table of void * pointers (first object of the contiguous + * object blocks). + * @param n + * Number of contiguous object blocks. 
+ * @param free + * - 0: object is supposed to be allocated, mark it as free + * - 1: object is supposed to be free, mark it as allocated + * - 2: just check that cookie is valid (free or allocated) + */ +void rte_mempool_contig_blocks_check_cookies(const struct rte_mempool *mp, + void * const *first_obj_table_const, unsigned int n, int free); + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG +#define __mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \ + free) \ + rte_mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \ + free) +#else +#define __mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \ + free) \ + do {} while (0) +#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */ + +#define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */ + +/** + * Prototype for implementation specific data provisioning function. + * + * The function should provide the implementation specific memory for + * use by the other mempool ops functions in a given mempool ops struct. + * E.g. the default ops provides an instance of the rte_ring for this purpose. + * it will most likely point to a different type of data structure, and + * will be transparent to the application programmer. + * This function should set mp->pool_data. + */ +typedef int (*rte_mempool_alloc_t)(struct rte_mempool *mp); + +/** + * Free the opaque private data pointed to by mp->pool_data pointer. + */ +typedef void (*rte_mempool_free_t)(struct rte_mempool *mp); + +/** + * Enqueue an object into the external pool. + */ +typedef int (*rte_mempool_enqueue_t)(struct rte_mempool *mp, + void * const *obj_table, unsigned int n); + +/** + * Dequeue an object from the external pool. + */ +typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp, + void **obj_table, unsigned int n); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Dequeue a number of contiquous object blocks from the external pool. + */ +typedef int (*rte_mempool_dequeue_contig_blocks_t)(struct rte_mempool *mp, + void **first_obj_table, unsigned int n); + +/** + * Return the number of available objects in the external pool. + */ +typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp); + +/** + * Calculate memory size required to store given number of objects. + * + * If mempool objects are not required to be IOVA-contiguous + * (the flag MEMPOOL_F_NO_IOVA_CONTIG is set), min_chunk_size defines + * virtually contiguous chunk size. Otherwise, if mempool objects must + * be IOVA-contiguous (the flag MEMPOOL_F_NO_IOVA_CONTIG is clear), + * min_chunk_size defines IOVA-contiguous chunk size. + * + * @param[in] mp + * Pointer to the memory pool. + * @param[in] obj_num + * Number of objects. + * @param[in] pg_shift + * LOG2 of the physical pages size. If set to 0, ignore page boundaries. + * @param[out] min_chunk_size + * Location for minimum size of the memory chunk which may be used to + * store memory pool objects. + * @param[out] align + * Location for required memory chunk alignment. + * @return + * Required memory size aligned at page boundary. + */ +typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp, + uint32_t obj_num, uint32_t pg_shift, + size_t *min_chunk_size, size_t *align); + +/** + * Default way to calculate memory size required to store given number of + * objects. + * + * If page boundaries may be ignored, it is just a product of total + * object size including header and trailer and number of objects. 
+ * Otherwise, it is a number of pages required to store given number of + * objects without crossing page boundary. + * + * Note that if object size is bigger than page size, then it assumes + * that pages are grouped in subsets of physically continuous pages big + * enough to store at least one object. + * + * Minimum size of memory chunk is a maximum of the page size and total + * element size. + * + * Required memory chunk alignment is a maximum of page size and cache + * line size. + */ +ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp, + uint32_t obj_num, uint32_t pg_shift, + size_t *min_chunk_size, size_t *align); + +/** + * Function to be called for each populated object. + * + * @param[in] mp + * A pointer to the mempool structure. + * @param[in] opaque + * An opaque pointer passed to iterator. + * @param[in] vaddr + * Object virtual address. + * @param[in] iova + * Input/output virtual address of the object or RTE_BAD_IOVA. + */ +typedef void (rte_mempool_populate_obj_cb_t)(struct rte_mempool *mp, + void *opaque, void *vaddr, rte_iova_t iova); + +/** + * Populate memory pool objects using provided memory chunk. + * + * Populated objects should be enqueued to the pool, e.g. using + * rte_mempool_ops_enqueue_bulk(). + * + * If the given IO address is unknown (iova = RTE_BAD_IOVA), + * the chunk doesn't need to be physically contiguous (only virtually), + * and allocated objects may span two pages. + * + * @param[in] mp + * A pointer to the mempool structure. + * @param[in] max_objs + * Maximum number of objects to be populated. + * @param[in] vaddr + * The virtual address of memory that should be used to store objects. + * @param[in] iova + * The IO address + * @param[in] len + * The length of memory in bytes. + * @param[in] obj_cb + * Callback function to be executed for each populated object. + * @param[in] obj_cb_arg + * An opaque pointer passed to the callback function. + * @return + * The number of objects added on success. + * On error, no objects are populated and a negative errno is returned. + */ +typedef int (*rte_mempool_populate_t)(struct rte_mempool *mp, + unsigned int max_objs, + void *vaddr, rte_iova_t iova, size_t len, + rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg); + +/** + * Default way to populate memory pool object using provided memory + * chunk: just slice objects one by one. + */ +int rte_mempool_op_populate_default(struct rte_mempool *mp, + unsigned int max_objs, + void *vaddr, rte_iova_t iova, size_t len, + rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Get some additional information about a mempool. + */ +typedef int (*rte_mempool_get_info_t)(const struct rte_mempool *mp, + struct rte_mempool_info *info); + + +/** Structure defining mempool operations structure */ +struct rte_mempool_ops { + char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */ + rte_mempool_alloc_t alloc; /**< Allocate private data. */ + rte_mempool_free_t free; /**< Free the external pool. */ + rte_mempool_enqueue_t enqueue; /**< Enqueue an object. */ + rte_mempool_dequeue_t dequeue; /**< Dequeue an object. */ + rte_mempool_get_count get_count; /**< Get qty of available objs. */ + /** + * Optional callback to calculate memory size required to + * store specified number of objects. + */ + rte_mempool_calc_mem_size_t calc_mem_size; + /** + * Optional callback to populate mempool objects using + * provided memory chunk. 
+ */ + rte_mempool_populate_t populate; + /** + * Get mempool info + */ + rte_mempool_get_info_t get_info; + /** + * Dequeue a number of contiguous object blocks. + */ + rte_mempool_dequeue_contig_blocks_t dequeue_contig_blocks; +} __rte_cache_aligned; + +#define RTE_MEMPOOL_MAX_OPS_IDX 16 /**< Max registered ops structs */ + +/** + * Structure storing the table of registered ops structs, each of which contain + * the function pointers for the mempool ops functions. + * Each process has its own storage for this ops struct array so that + * the mempools can be shared across primary and secondary processes. + * The indices used to access the array are valid across processes, whereas + * any function pointers stored directly in the mempool struct would not be. + * This results in us simply having "ops_index" in the mempool struct. + */ +struct rte_mempool_ops_table { + rte_spinlock_t sl; /**< Spinlock for add/delete. */ + uint32_t num_ops; /**< Number of used ops structs in the table. */ + /** + * Storage for all possible ops structs. + */ + struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX]; +} __rte_cache_aligned; + +/** Array of registered ops structs. */ +extern struct rte_mempool_ops_table rte_mempool_ops_table; + +/** + * @internal Get the mempool ops struct from its index. + * + * @param ops_index + * The index of the ops struct in the ops struct table. It must be a valid + * index: (0 <= idx < num_ops). + * @return + * The pointer to the ops struct in the table. + */ +static inline struct rte_mempool_ops * +rte_mempool_get_ops(int ops_index) +{ + RTE_VERIFY((ops_index >= 0) && (ops_index < RTE_MEMPOOL_MAX_OPS_IDX)); + + return &rte_mempool_ops_table.ops[ops_index]; +} + +/** + * @internal Wrapper for mempool_ops alloc callback. + * + * @param mp + * Pointer to the memory pool. + * @return + * - 0: Success; successfully allocated mempool pool_data. + * - <0: Error; code of alloc function. + */ +int +rte_mempool_ops_alloc(struct rte_mempool *mp); + +/** + * @internal Wrapper for mempool_ops dequeue callback. + * + * @param mp + * Pointer to the memory pool. + * @param obj_table + * Pointer to a table of void * pointers (objects). + * @param n + * Number of objects to get. + * @return + * - 0: Success; got n objects. + * - <0: Error; code of dequeue function. + */ +static inline int +rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp, + void **obj_table, unsigned n) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + return ops->dequeue(mp, obj_table, n); +} + +/** + * @internal Wrapper for mempool_ops dequeue_contig_blocks callback. + * + * @param[in] mp + * Pointer to the memory pool. + * @param[out] first_obj_table + * Pointer to a table of void * pointers (first objects). + * @param[in] n + * Number of blocks to get. + * @return + * - 0: Success; got n objects. + * - <0: Error; code of dequeue function. + */ +static inline int +rte_mempool_ops_dequeue_contig_blocks(struct rte_mempool *mp, + void **first_obj_table, unsigned int n) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + RTE_ASSERT(ops->dequeue_contig_blocks != NULL); + return ops->dequeue_contig_blocks(mp, first_obj_table, n); +} + +/** + * @internal wrapper for mempool_ops enqueue callback. + * + * @param mp + * Pointer to the memory pool. + * @param obj_table + * Pointer to a table of void * pointers (objects). + * @param n + * Number of objects to put. + * @return + * - 0: Success; n objects supplied. + * - <0: Error; code of enqueue function. 
+ */ +static inline int +rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table, + unsigned n) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + return ops->enqueue(mp, obj_table, n); +} + +/** + * @internal wrapper for mempool_ops get_count callback. + * + * @param mp + * Pointer to the memory pool. + * @return + * The number of available objects in the external pool. + */ +unsigned +rte_mempool_ops_get_count(const struct rte_mempool *mp); + +/** + * @internal wrapper for mempool_ops calc_mem_size callback. + * API to calculate size of memory required to store specified number of + * object. + * + * @param[in] mp + * Pointer to the memory pool. + * @param[in] obj_num + * Number of objects. + * @param[in] pg_shift + * LOG2 of the physical pages size. If set to 0, ignore page boundaries. + * @param[out] min_chunk_size + * Location for minimum size of the memory chunk which may be used to + * store memory pool objects. + * @param[out] align + * Location for required memory chunk alignment. + * @return + * Required memory size aligned at page boundary. + */ +ssize_t rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp, + uint32_t obj_num, uint32_t pg_shift, + size_t *min_chunk_size, size_t *align); + +/** + * @internal wrapper for mempool_ops populate callback. + * + * Populate memory pool objects using provided memory chunk. + * + * @param[in] mp + * A pointer to the mempool structure. + * @param[in] max_objs + * Maximum number of objects to be populated. + * @param[in] vaddr + * The virtual address of memory that should be used to store objects. + * @param[in] iova + * The IO address + * @param[in] len + * The length of memory in bytes. + * @param[in] obj_cb + * Callback function to be executed for each populated object. + * @param[in] obj_cb_arg + * An opaque pointer passed to the callback function. + * @return + * The number of objects added on success. + * On error, no objects are populated and a negative errno is returned. + */ +int rte_mempool_ops_populate(struct rte_mempool *mp, unsigned int max_objs, + void *vaddr, rte_iova_t iova, size_t len, + rte_mempool_populate_obj_cb_t *obj_cb, + void *obj_cb_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Wrapper for mempool_ops get_info callback. + * + * @param[in] mp + * Pointer to the memory pool. + * @param[out] info + * Pointer to the rte_mempool_info structure + * @return + * - 0: Success; The mempool driver supports retrieving supplementary + * mempool information + * - -ENOTSUP - doesn't support get_info ops (valid case). + */ +__rte_experimental +int rte_mempool_ops_get_info(const struct rte_mempool *mp, + struct rte_mempool_info *info); + +/** + * @internal wrapper for mempool_ops free callback. + * + * @param mp + * Pointer to the memory pool. + */ +void +rte_mempool_ops_free(struct rte_mempool *mp); + +/** + * Set the ops of a mempool. + * + * This can only be done on a mempool that is not populated, i.e. just after + * a call to rte_mempool_create_empty(). + * + * @param mp + * Pointer to the memory pool. + * @param name + * Name of the ops structure to use for this mempool. + * @param pool_config + * Opaque data that can be passed by the application to the ops functions. + * @return + * - 0: Success; the mempool is now using the requested ops functions. + * - -EINVAL - Invalid ops struct name provided. + * - -EEXIST - mempool already has an ops struct assigned. 
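+ *
+ * An illustrative sketch (not part of the upstream header) of the
+ * intended call order; the pool parameters and the "ring_mp_mc"
+ * handler name are only examples of values an application might use:
+ *
+ * @code{.c}
+ *     struct rte_mempool *mp;
+ *
+ *     mp = rte_mempool_create_empty("example_pool", 4095, 2048,
+ *                                   256, 0, SOCKET_ID_ANY, 0);
+ *     if (mp != NULL &&
+ *         rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL) == 0)
+ *         (void)rte_mempool_populate_default(mp);
+ * @endcode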
+ */ +int +rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name, + void *pool_config); + +/** + * Register mempool operations. + * + * @param ops + * Pointer to an ops structure to register. + * @return + * - >=0: Success; return the index of the ops struct in the table. + * - -EINVAL - some missing callbacks while registering ops struct. + * - -ENOSPC - the maximum number of ops structs has been reached. + */ +int rte_mempool_register_ops(const struct rte_mempool_ops *ops); + +/** + * Macro to statically register the ops of a mempool handler. + * Note that the rte_mempool_register_ops fails silently here when + * more than RTE_MEMPOOL_MAX_OPS_IDX is registered. + */ +#define MEMPOOL_REGISTER_OPS(ops) \ + void mp_hdlr_init_##ops(void); \ + void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\ + { \ + rte_mempool_register_ops(&ops); \ + } + +/** + * An object callback function for mempool. + * + * Used by rte_mempool_create() and rte_mempool_obj_iter(). + */ +typedef void (rte_mempool_obj_cb_t)(struct rte_mempool *mp, + void *opaque, void *obj, unsigned obj_idx); +typedef rte_mempool_obj_cb_t rte_mempool_obj_ctor_t; /* compat */ + +/** + * A memory callback function for mempool. + * + * Used by rte_mempool_mem_iter(). + */ +typedef void (rte_mempool_mem_cb_t)(struct rte_mempool *mp, + void *opaque, struct rte_mempool_memhdr *memhdr, + unsigned mem_idx); + +/** + * A mempool constructor callback function. + * + * Arguments are the mempool and the opaque pointer given by the user in + * rte_mempool_create(). + */ +typedef void (rte_mempool_ctor_t)(struct rte_mempool *, void *); + +/** + * Create a new mempool named *name* in memory. + * + * This function uses ``rte_memzone_reserve()`` to allocate memory. The + * pool contains n elements of elt_size. Its size is set to n. + * + * @param name + * The name of the mempool. + * @param n + * The number of elements in the mempool. The optimum size (in terms of + * memory usage) for a mempool is when n is a power of two minus one: + * n = (2^q - 1). + * @param elt_size + * The size of each element. + * @param cache_size + * If cache_size is non-zero, the rte_mempool library will try to + * limit the accesses to the common lockless pool, by maintaining a + * per-lcore object cache. This argument must be lower or equal to + * CONFIG_RTE_MEMPOOL_CACHE_MAX_SIZE and n / 1.5. It is advised to choose + * cache_size to have "n modulo cache_size == 0": if this is + * not the case, some elements will always stay in the pool and will + * never be used. The access to the per-lcore table is of course + * faster than the multi-producer/consumer pool. The cache can be + * disabled if the cache_size argument is set to 0; it can be useful to + * avoid losing objects in cache. + * @param private_data_size + * The size of the private data appended after the mempool + * structure. This is useful for storing some private data after the + * mempool structure, as is done for rte_mbuf_pool for example. + * @param mp_init + * A function pointer that is called for initialization of the pool, + * before object initialization. The user can initialize the private + * data in this function if needed. This parameter can be NULL if + * not needed. + * @param mp_init_arg + * An opaque pointer to data that can be used in the mempool + * constructor function. + * @param obj_init + * A function pointer that is called for each object at + * initialization of the pool. The user can set some meta data in + * objects if needed. 
This parameter can be NULL if not needed. + * The obj_init() function takes the mempool pointer, the init_arg, + * the object pointer and the object number as parameters. + * @param obj_init_arg + * An opaque pointer to data that can be used as an argument for + * each call to the object constructor function. + * @param socket_id + * The *socket_id* argument is the socket identifier in the case of + * NUMA. The value can be *SOCKET_ID_ANY* if there is no NUMA + * constraint for the reserved zone. + * @param flags + * The *flags* arguments is an OR of following flags: + * - MEMPOOL_F_NO_SPREAD: By default, objects addresses are spread + * between channels in RAM: the pool allocator will add padding + * between objects depending on the hardware configuration. See + * Memory alignment constraints for details. If this flag is set, + * the allocator will just align them to a cache line. + * - MEMPOOL_F_NO_CACHE_ALIGN: By default, the returned objects are + * cache-aligned. This flag removes this constraint, and no + * padding will be present between objects. This flag implies + * MEMPOOL_F_NO_SPREAD. + * - MEMPOOL_F_SP_PUT: If this flag is set, the default behavior + * when using rte_mempool_put() or rte_mempool_put_bulk() is + * "single-producer". Otherwise, it is "multi-producers". + * - MEMPOOL_F_SC_GET: If this flag is set, the default behavior + * when using rte_mempool_get() or rte_mempool_get_bulk() is + * "single-consumer". Otherwise, it is "multi-consumers". + * - MEMPOOL_F_NO_IOVA_CONTIG: If set, allocated objects won't + * necessarily be contiguous in IO memory. + * @return + * The pointer to the new allocated mempool, on success. NULL on error + * with rte_errno set appropriately. Possible rte_errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - cache size provided is too large + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_mempool * +rte_mempool_create(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + rte_mempool_ctor_t *mp_init, void *mp_init_arg, + rte_mempool_obj_cb_t *obj_init, void *obj_init_arg, + int socket_id, unsigned flags); + +/** + * Create an empty mempool + * + * The mempool is allocated and initialized, but it is not populated: no + * memory is allocated for the mempool elements. The user has to call + * rte_mempool_populate_*() to add memory chunks to the pool. Once + * populated, the user may also want to initialize each object with + * rte_mempool_obj_iter(). + * + * @param name + * The name of the mempool. + * @param n + * The maximum number of elements that can be added in the mempool. + * The optimum size (in terms of memory usage) for a mempool is when n + * is a power of two minus one: n = (2^q - 1). + * @param elt_size + * The size of each element. + * @param cache_size + * Size of the cache. See rte_mempool_create() for details. + * @param private_data_size + * The size of the private data appended after the mempool + * structure. This is useful for storing some private data after the + * mempool structure, as is done for rte_mbuf_pool for example. + * @param socket_id + * The *socket_id* argument is the socket identifier in the case of + * NUMA. 
The value can be *SOCKET_ID_ANY* if there is no NUMA + * constraint for the reserved zone. + * @param flags + * Flags controlling the behavior of the mempool. See + * rte_mempool_create() for details. + * @return + * The pointer to the new allocated mempool, on success. NULL on error + * with rte_errno set appropriately. See rte_mempool_create() for details. + */ +struct rte_mempool * +rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size, + unsigned cache_size, unsigned private_data_size, + int socket_id, unsigned flags); +/** + * Free a mempool + * + * Unlink the mempool from global list, free the memory chunks, and all + * memory referenced by the mempool. The objects must not be used by + * other cores as they will be freed. + * + * @param mp + * A pointer to the mempool structure. + */ +void +rte_mempool_free(struct rte_mempool *mp); + +/** + * Add physically contiguous memory for objects in the pool at init + * + * Add a virtually and physically contiguous memory chunk in the pool + * where objects can be instantiated. + * + * If the given IO address is unknown (iova = RTE_BAD_IOVA), + * the chunk doesn't need to be physically contiguous (only virtually), + * and allocated objects may span two pages. + * + * @param mp + * A pointer to the mempool structure. + * @param vaddr + * The virtual address of memory that should be used to store objects. + * @param iova + * The IO address + * @param len + * The length of memory in bytes. + * @param free_cb + * The callback used to free this chunk when destroying the mempool. + * @param opaque + * An opaque argument passed to free_cb. + * @return + * The number of objects added on success. + * On error, the chunk is not added in the memory list of the + * mempool and a negative errno is returned. + */ +int rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, + rte_iova_t iova, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque); + +/** + * Add virtually contiguous memory for objects in the pool at init + * + * Add a virtually contiguous memory chunk in the pool where objects can + * be instantiated. + * + * @param mp + * A pointer to the mempool structure. + * @param addr + * The virtual address of memory that should be used to store objects. + * Must be page-aligned. + * @param len + * The length of memory in bytes. Must be page-aligned. + * @param pg_sz + * The size of memory pages in this virtual area. + * @param free_cb + * The callback used to free this chunk when destroying the mempool. + * @param opaque + * An opaque argument passed to free_cb. + * @return + * The number of objects added on success. + * On error, the chunk is not added in the memory list of the + * mempool and a negative errno is returned. + */ +int +rte_mempool_populate_virt(struct rte_mempool *mp, char *addr, + size_t len, size_t pg_sz, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque); + +/** + * Add memory for objects in the pool at init + * + * This is the default function used by rte_mempool_create() to populate + * the mempool. It adds memory allocated using rte_memzone_reserve(). + * + * @param mp + * A pointer to the mempool structure. + * @return + * The number of objects added on success. + * On error, the chunk is not added in the memory list of the + * mempool and a negative errno is returned. 
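+ *
+ * An illustrative sketch (not part of the upstream header): finishing
+ * the setup of an empty pool "mp" whose ops were already selected;
+ * my_obj_init is a hypothetical callback matching rte_mempool_obj_cb_t:
+ *
+ * @code{.c}
+ *     if (rte_mempool_populate_default(mp) < 0) {
+ *         rte_mempool_free(mp);
+ *         mp = NULL;
+ *     } else {
+ *         rte_mempool_obj_iter(mp, my_obj_init, NULL);
+ *     }
+ * @endcode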
+ */ +int rte_mempool_populate_default(struct rte_mempool *mp); + +/** + * Add memory from anonymous mapping for objects in the pool at init + * + * This function mmap an anonymous memory zone that is locked in + * memory to store the objects of the mempool. + * + * @param mp + * A pointer to the mempool structure. + * @return + * The number of objects added on success. + * On error, the chunk is not added in the memory list of the + * mempool and a negative errno is returned. + */ +int rte_mempool_populate_anon(struct rte_mempool *mp); + +/** + * Call a function for each mempool element + * + * Iterate across all objects attached to a rte_mempool and call the + * callback function on it. + * + * @param mp + * A pointer to an initialized mempool. + * @param obj_cb + * A function pointer that is called for each object. + * @param obj_cb_arg + * An opaque pointer passed to the callback function. + * @return + * Number of objects iterated. + */ +uint32_t rte_mempool_obj_iter(struct rte_mempool *mp, + rte_mempool_obj_cb_t *obj_cb, void *obj_cb_arg); + +/** + * Call a function for each mempool memory chunk + * + * Iterate across all memory chunks attached to a rte_mempool and call + * the callback function on it. + * + * @param mp + * A pointer to an initialized mempool. + * @param mem_cb + * A function pointer that is called for each memory chunk. + * @param mem_cb_arg + * An opaque pointer passed to the callback function. + * @return + * Number of memory chunks iterated. + */ +uint32_t rte_mempool_mem_iter(struct rte_mempool *mp, + rte_mempool_mem_cb_t *mem_cb, void *mem_cb_arg); + +/** + * Dump the status of the mempool to a file. + * + * @param f + * A pointer to a file for output + * @param mp + * A pointer to the mempool structure. + */ +void rte_mempool_dump(FILE *f, struct rte_mempool *mp); + +/** + * Create a user-owned mempool cache. + * + * This can be used by non-EAL threads to enable caching when they + * interact with a mempool. + * + * @param size + * The size of the mempool cache. See rte_mempool_create()'s cache_size + * parameter description for more information. The same limits and + * considerations apply here too. + * @param socket_id + * The socket identifier in the case of NUMA. The value can be + * SOCKET_ID_ANY if there is no NUMA constraint for the reserved zone. + */ +struct rte_mempool_cache * +rte_mempool_cache_create(uint32_t size, int socket_id); + +/** + * Free a user-owned mempool cache. + * + * @param cache + * A pointer to the mempool cache. + */ +void +rte_mempool_cache_free(struct rte_mempool_cache *cache); + +/** + * Get a pointer to the per-lcore default mempool cache. + * + * @param mp + * A pointer to the mempool structure. + * @param lcore_id + * The logical core id. + * @return + * A pointer to the mempool cache or NULL if disabled or non-EAL thread. + */ +static __rte_always_inline struct rte_mempool_cache * +rte_mempool_default_cache(struct rte_mempool *mp, unsigned lcore_id) +{ + if (mp->cache_size == 0) + return NULL; + + if (lcore_id >= RTE_MAX_LCORE) + return NULL; + + return &mp->local_cache[lcore_id]; +} + +/** + * Flush a user-owned mempool cache to the specified mempool. + * + * @param cache + * A pointer to the mempool cache. + * @param mp + * A pointer to the mempool. 
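+ *
+ * An illustrative sketch (not part of the upstream header) of the
+ * user-owned cache lifecycle on a non-EAL thread; "mp" is assumed to
+ * be an already populated mempool and the burst size is arbitrary:
+ *
+ * @code{.c}
+ *     struct rte_mempool_cache *cache;
+ *     void *objs[32];
+ *
+ *     cache = rte_mempool_cache_create(256, SOCKET_ID_ANY);
+ *     if (cache != NULL) {
+ *         if (rte_mempool_generic_get(mp, objs, 32, cache) == 0)
+ *             rte_mempool_generic_put(mp, objs, 32, cache);
+ *         rte_mempool_cache_flush(cache, mp);
+ *         rte_mempool_cache_free(cache);
+ *     }
+ * @endcode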
+ */ +static __rte_always_inline void +rte_mempool_cache_flush(struct rte_mempool_cache *cache, + struct rte_mempool *mp) +{ + if (cache == NULL) + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + if (cache == NULL || cache->len == 0) + return; + rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len); + cache->len = 0; +} + +/** + * @internal Put several objects back in the mempool; used internally. + * @param mp + * A pointer to the mempool structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to store back in the mempool, must be strictly + * positive. + * @param cache + * A pointer to a mempool cache structure. May be NULL if not needed. + */ +static __rte_always_inline void +__mempool_generic_put(struct rte_mempool *mp, void * const *obj_table, + unsigned int n, struct rte_mempool_cache *cache) +{ + void **cache_objs; + + /* increment stat now, adding in mempool always success */ + __MEMPOOL_STAT_ADD(mp, put, n); + + /* No cache provided or if put would overflow mem allocated for cache */ + if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE)) + goto ring_enqueue; + + cache_objs = &cache->objs[cache->len]; + + /* + * The cache follows the following algorithm + * 1. Add the objects to the cache + * 2. Anything greater than the cache min value (if it crosses the + * cache flush threshold) is flushed to the ring. + */ + + /* Add elements back into the cache */ + rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n); + + cache->len += n; + + if (cache->len >= cache->flushthresh) { + rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size], + cache->len - cache->size); + cache->len = cache->size; + } + + return; + +ring_enqueue: + + /* push remaining objects in ring */ +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG + if (rte_mempool_ops_enqueue_bulk(mp, obj_table, n) < 0) + rte_panic("cannot put objects in mempool\n"); +#else + rte_mempool_ops_enqueue_bulk(mp, obj_table, n); +#endif +} + + +/** + * Put several objects back in the mempool. + * + * @param mp + * A pointer to the mempool structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the mempool from the obj_table. + * @param cache + * A pointer to a mempool cache structure. May be NULL if not needed. + */ +static __rte_always_inline void +rte_mempool_generic_put(struct rte_mempool *mp, void * const *obj_table, + unsigned int n, struct rte_mempool_cache *cache) +{ + __mempool_check_cookies(mp, obj_table, n, 0); + __mempool_generic_put(mp, obj_table, n, cache); +} + +/** + * Put several objects back in the mempool. + * + * This function calls the multi-producer or the single-producer + * version depending on the default behavior that was specified at + * mempool creation time (see flags). + * + * @param mp + * A pointer to the mempool structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the mempool from obj_table. + */ +static __rte_always_inline void +rte_mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table, + unsigned int n) +{ + struct rte_mempool_cache *cache; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + rte_mempool_generic_put(mp, obj_table, n, cache); +} + +/** + * Put one object back in the mempool. 
+ * + * This function calls the multi-producer or the single-producer + * version depending on the default behavior that was specified at + * mempool creation time (see flags). + * + * @param mp + * A pointer to the mempool structure. + * @param obj + * A pointer to the object to be added. + */ +static __rte_always_inline void +rte_mempool_put(struct rte_mempool *mp, void *obj) +{ + rte_mempool_put_bulk(mp, &obj, 1); +} + +/** + * @internal Get several objects from the mempool; used internally. + * @param mp + * A pointer to the mempool structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to get, must be strictly positive. + * @param cache + * A pointer to a mempool cache structure. May be NULL if not needed. + * @return + * - >=0: Success; number of objects supplied. + * - <0: Error; code of ring dequeue function. + */ +static __rte_always_inline int +__mempool_generic_get(struct rte_mempool *mp, void **obj_table, + unsigned int n, struct rte_mempool_cache *cache) +{ + int ret; + uint32_t index, len; + void **cache_objs; + + /* No cache provided or cannot be satisfied from cache */ + if (unlikely(cache == NULL || n >= cache->size)) + goto ring_dequeue; + + cache_objs = cache->objs; + + /* Can this be satisfied from the cache? */ + if (cache->len < n) { + /* No. Backfill the cache first, and then fill from it */ + uint32_t req = n + (cache->size - cache->len); + + /* How many do we require i.e. number to fill the cache + the request */ + ret = rte_mempool_ops_dequeue_bulk(mp, + &cache->objs[cache->len], req); + if (unlikely(ret < 0)) { + /* + * In the offchance that we are buffer constrained, + * where we are not able to allocate cache + n, go to + * the ring directly. If that fails, we are truly out of + * buffers. + */ + goto ring_dequeue; + } + + cache->len += req; + } + + /* Now fill in the response ... */ + for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++) + *obj_table = cache_objs[len]; + + cache->len -= n; + + __MEMPOOL_STAT_ADD(mp, get_success, n); + + return 0; + +ring_dequeue: + + /* get remaining objects from ring */ + ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n); + + if (ret < 0) + __MEMPOOL_STAT_ADD(mp, get_fail, n); + else + __MEMPOOL_STAT_ADD(mp, get_success, n); + + return ret; +} + +/** + * Get several objects from the mempool. + * + * If cache is enabled, objects will be retrieved first from cache, + * subsequently from the common pool. Note that it can return -ENOENT when + * the local cache and common pool are empty, even if cache from other + * lcores are full. + * + * @param mp + * A pointer to the mempool structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. + * @param n + * The number of objects to get from mempool to obj_table. + * @param cache + * A pointer to a mempool cache structure. May be NULL if not needed. + * @return + * - 0: Success; objects taken. + * - -ENOENT: Not enough entries in the mempool; no object is retrieved. + */ +static __rte_always_inline int +rte_mempool_generic_get(struct rte_mempool *mp, void **obj_table, + unsigned int n, struct rte_mempool_cache *cache) +{ + int ret; + ret = __mempool_generic_get(mp, obj_table, n, cache); + if (ret == 0) + __mempool_check_cookies(mp, obj_table, n, 1); + return ret; +} + +/** + * Get several objects from the mempool. 
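+ *
+ * An illustrative sketch (not part of the upstream header) of the usual
+ * burst pattern built on this call; "mp" is assumed to be a populated
+ * mempool and the burst size of 32 is arbitrary:
+ *
+ * @code{.c}
+ *     void *burst[32];
+ *
+ *     if (rte_mempool_get_bulk(mp, burst, 32) == 0)
+ *         rte_mempool_put_bulk(mp, burst, 32);
+ * @endcode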
+ * + * This function calls the multi-consumers or the single-consumer + * version, depending on the default behaviour that was specified at + * mempool creation time (see flags). + * + * If cache is enabled, objects will be retrieved first from cache, + * subsequently from the common pool. Note that it can return -ENOENT when + * the local cache and common pool are empty, even if cache from other + * lcores are full. + * + * @param mp + * A pointer to the mempool structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. + * @param n + * The number of objects to get from the mempool to obj_table. + * @return + * - 0: Success; objects taken + * - -ENOENT: Not enough entries in the mempool; no object is retrieved. + */ +static __rte_always_inline int +rte_mempool_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned int n) +{ + struct rte_mempool_cache *cache; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + return rte_mempool_generic_get(mp, obj_table, n, cache); +} + +/** + * Get one object from the mempool. + * + * This function calls the multi-consumers or the single-consumer + * version, depending on the default behavior that was specified at + * mempool creation (see flags). + * + * If cache is enabled, objects will be retrieved first from cache, + * subsequently from the common pool. Note that it can return -ENOENT when + * the local cache and common pool are empty, even if cache from other + * lcores are full. + * + * @param mp + * A pointer to the mempool structure. + * @param obj_p + * A pointer to a void * pointer (object) that will be filled. + * @return + * - 0: Success; objects taken. + * - -ENOENT: Not enough entries in the mempool; no object is retrieved. + */ +static __rte_always_inline int +rte_mempool_get(struct rte_mempool *mp, void **obj_p) +{ + return rte_mempool_get_bulk(mp, obj_p, 1); +} + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Get a contiguous blocks of objects from the mempool. + * + * If cache is enabled, consider to flush it first, to reuse objects + * as soon as possible. + * + * The application should check that the driver supports the operation + * by calling rte_mempool_ops_get_info() and checking that `contig_block_size` + * is not zero. + * + * @param mp + * A pointer to the mempool structure. + * @param first_obj_table + * A pointer to a pointer to the first object in each block. + * @param n + * The number of blocks to get from mempool. + * @return + * - 0: Success; blocks taken. + * - -ENOBUFS: Not enough entries in the mempool; no object is retrieved. + * - -EOPNOTSUPP: The mempool driver does not support block dequeue + */ +static __rte_always_inline int +__rte_experimental +rte_mempool_get_contig_blocks(struct rte_mempool *mp, + void **first_obj_table, unsigned int n) +{ + int ret; + + ret = rte_mempool_ops_dequeue_contig_blocks(mp, first_obj_table, n); + if (ret == 0) { + __MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, get_success, n); + __mempool_contig_blocks_check_cookies(mp, first_obj_table, n, + 1); + } else { + __MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, get_fail, n); + } + + return ret; +} + +/** + * Return the number of entries in the mempool. + * + * When cache is enabled, this function has to browse the length of + * all lcores, so it should not be used in a data path, but only for + * debug purposes. User-owned mempool caches are not accounted for. + * + * @param mp + * A pointer to the mempool structure. 
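+ *
+ * For the block-dequeue API documented above, an illustrative
+ * capability check (not part of the upstream header); "mp" is assumed
+ * to be a populated mempool:
+ *
+ * @code{.c}
+ *     struct rte_mempool_info info;
+ *     void *blocks[4];
+ *
+ *     if (rte_mempool_ops_get_info(mp, &info) == 0 &&
+ *         info.contig_block_size != 0)
+ *         (void)rte_mempool_get_contig_blocks(mp, blocks, 4);
+ * @endcode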
+ * @return + * The number of entries in the mempool. + */ +unsigned int rte_mempool_avail_count(const struct rte_mempool *mp); + +/** + * Return the number of elements which have been allocated from the mempool + * + * When cache is enabled, this function has to browse the length of + * all lcores, so it should not be used in a data path, but only for + * debug purposes. + * + * @param mp + * A pointer to the mempool structure. + * @return + * The number of free entries in the mempool. + */ +unsigned int +rte_mempool_in_use_count(const struct rte_mempool *mp); + +/** + * Test if the mempool is full. + * + * When cache is enabled, this function has to browse the length of all + * lcores, so it should not be used in a data path, but only for debug + * purposes. User-owned mempool caches are not accounted for. + * + * @param mp + * A pointer to the mempool structure. + * @return + * - 1: The mempool is full. + * - 0: The mempool is not full. + */ +static inline int +rte_mempool_full(const struct rte_mempool *mp) +{ + return !!(rte_mempool_avail_count(mp) == mp->size); +} + +/** + * Test if the mempool is empty. + * + * When cache is enabled, this function has to browse the length of all + * lcores, so it should not be used in a data path, but only for debug + * purposes. User-owned mempool caches are not accounted for. + * + * @param mp + * A pointer to the mempool structure. + * @return + * - 1: The mempool is empty. + * - 0: The mempool is not empty. + */ +static inline int +rte_mempool_empty(const struct rte_mempool *mp) +{ + return !!(rte_mempool_avail_count(mp) == 0); +} + +/** + * Return the IO address of elt, which is an element of the pool mp. + * + * @param elt + * A pointer (virtual address) to the element of the pool. + * @return + * The IO address of the elt element. + * If the mempool was created with MEMPOOL_F_NO_IOVA_CONTIG, the + * returned value is RTE_BAD_IOVA. + */ +static inline rte_iova_t +rte_mempool_virt2iova(const void *elt) +{ + const struct rte_mempool_objhdr *hdr; + hdr = (const struct rte_mempool_objhdr *)RTE_PTR_SUB(elt, + sizeof(*hdr)); + return hdr->iova; +} + +/** + * Check the consistency of mempool objects. + * + * Verify the coherency of fields in the mempool structure. Also check + * that the cookies of mempool objects (even the ones that are not + * present in pool) have a correct value. If not, a panic will occur. + * + * @param mp + * A pointer to the mempool structure. + */ +void rte_mempool_audit(struct rte_mempool *mp); + +/** + * Return a pointer to the private data in an mempool structure. + * + * @param mp + * A pointer to the mempool structure. + * @return + * A pointer to the private data. + */ +static inline void *rte_mempool_get_priv(struct rte_mempool *mp) +{ + return (char *)mp + + MEMPOOL_HEADER_SIZE(mp, mp->cache_size); +} + +/** + * Dump the status of all mempools on the console + * + * @param f + * A pointer to a file for output + */ +void rte_mempool_list_dump(FILE *f); + +/** + * Search a mempool from its name + * + * @param name + * The name of the mempool. + * @return + * The pointer to the mempool matching the name, or NULL if not found. + * NULL on error + * with rte_errno set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + * + */ +struct rte_mempool *rte_mempool_lookup(const char *name); + +/** + * Get the header, trailer and total size of a mempool element. 
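+ *
+ * An illustrative sketch (not part of the upstream header): query the
+ * per-object footprint of a 2048-byte element created without special
+ * flags; the returned value already includes header and trailer:
+ *
+ * @code{.c}
+ *     struct rte_mempool_objsz sz;
+ *     uint32_t obj_size = rte_mempool_calc_obj_size(2048, 0, &sz);
+ * @endcode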
+ * + * Given a desired size of the mempool element and mempool flags, + * calculates header, trailer, body and total sizes of the mempool object. + * + * @param elt_size + * The size of each element, without header and trailer. + * @param flags + * The flags used for the mempool creation. + * Consult rte_mempool_create() for more information about possible values. + * The size of each element. + * @param sz + * The calculated detailed size the mempool object. May be NULL. + * @return + * Total size of the mempool object. + */ +uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags, + struct rte_mempool_objsz *sz); + +/** + * Walk list of all memory pools + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + */ +void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg), + void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMPOOL_H_ */ diff --git a/src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops.c b/src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops.c new file mode 100644 index 00000000..a27e1fa5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 6WIND S.A. + */ + +#include +#include + +#include +#include +#include + +/* indirect jump table to support external memory pools. */ +struct rte_mempool_ops_table rte_mempool_ops_table = { + .sl = RTE_SPINLOCK_INITIALIZER, + .num_ops = 0 +}; + +/* add a new ops struct in rte_mempool_ops_table, return its index. */ +int +rte_mempool_register_ops(const struct rte_mempool_ops *h) +{ + struct rte_mempool_ops *ops; + int16_t ops_index; + + rte_spinlock_lock(&rte_mempool_ops_table.sl); + + if (rte_mempool_ops_table.num_ops >= + RTE_MEMPOOL_MAX_OPS_IDX) { + rte_spinlock_unlock(&rte_mempool_ops_table.sl); + RTE_LOG(ERR, MEMPOOL, + "Maximum number of mempool ops structs exceeded\n"); + return -ENOSPC; + } + + if (h->alloc == NULL || h->enqueue == NULL || + h->dequeue == NULL || h->get_count == NULL) { + rte_spinlock_unlock(&rte_mempool_ops_table.sl); + RTE_LOG(ERR, MEMPOOL, + "Missing callback while registering mempool ops\n"); + return -EINVAL; + } + + if (strlen(h->name) >= sizeof(ops->name) - 1) { + rte_spinlock_unlock(&rte_mempool_ops_table.sl); + RTE_LOG(DEBUG, EAL, "%s(): mempool_ops <%s>: name too long\n", + __func__, h->name); + rte_errno = EEXIST; + return -EEXIST; + } + + ops_index = rte_mempool_ops_table.num_ops++; + ops = &rte_mempool_ops_table.ops[ops_index]; + snprintf(ops->name, sizeof(ops->name), "%s", h->name); + ops->alloc = h->alloc; + ops->free = h->free; + ops->enqueue = h->enqueue; + ops->dequeue = h->dequeue; + ops->get_count = h->get_count; + ops->calc_mem_size = h->calc_mem_size; + ops->populate = h->populate; + ops->get_info = h->get_info; + ops->dequeue_contig_blocks = h->dequeue_contig_blocks; + + rte_spinlock_unlock(&rte_mempool_ops_table.sl); + + return ops_index; +} + +/* wrapper to allocate an external mempool's private (pool) data. */ +int +rte_mempool_ops_alloc(struct rte_mempool *mp) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + return ops->alloc(mp); +} + +/* wrapper to free an external pool ops. */ +void +rte_mempool_ops_free(struct rte_mempool *mp) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + if (ops->free == NULL) + return; + ops->free(mp); +} + +/* wrapper to get available objects in an external mempool. 
*/ +unsigned int +rte_mempool_ops_get_count(const struct rte_mempool *mp) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + return ops->get_count(mp); +} + +/* wrapper to notify new memory area to external mempool */ +ssize_t +rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp, + uint32_t obj_num, uint32_t pg_shift, + size_t *min_chunk_size, size_t *align) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + + if (ops->calc_mem_size == NULL) + return rte_mempool_op_calc_mem_size_default(mp, obj_num, + pg_shift, min_chunk_size, align); + + return ops->calc_mem_size(mp, obj_num, pg_shift, min_chunk_size, align); +} + +/* wrapper to populate memory pool objects using provided memory chunk */ +int +rte_mempool_ops_populate(struct rte_mempool *mp, unsigned int max_objs, + void *vaddr, rte_iova_t iova, size_t len, + rte_mempool_populate_obj_cb_t *obj_cb, + void *obj_cb_arg) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + + if (ops->populate == NULL) + return rte_mempool_op_populate_default(mp, max_objs, vaddr, + iova, len, obj_cb, + obj_cb_arg); + + return ops->populate(mp, max_objs, vaddr, iova, len, obj_cb, + obj_cb_arg); +} + +/* wrapper to get additional mempool info */ +int +rte_mempool_ops_get_info(const struct rte_mempool *mp, + struct rte_mempool_info *info) +{ + struct rte_mempool_ops *ops; + + ops = rte_mempool_get_ops(mp->ops_index); + + RTE_FUNC_PTR_OR_ERR_RET(ops->get_info, -ENOTSUP); + return ops->get_info(mp, info); +} + + +/* sets mempool ops previously registered by rte_mempool_register_ops. */ +int +rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name, + void *pool_config) +{ + struct rte_mempool_ops *ops = NULL; + unsigned i; + + /* too late, the mempool is already populated. */ + if (mp->flags & MEMPOOL_F_POOL_CREATED) + return -EEXIST; + + for (i = 0; i < rte_mempool_ops_table.num_ops; i++) { + if (!strcmp(name, + rte_mempool_ops_table.ops[i].name)) { + ops = &rte_mempool_ops_table.ops[i]; + break; + } + } + + if (ops == NULL) + return -EINVAL; + + mp->ops_index = i; + mp->pool_config = pool_config; + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops_default.c b/src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops_default.c new file mode 100644 index 00000000..4e2bfc82 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mempool/rte_mempool_ops_default.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 6WIND S.A. + * Copyright(c) 2018 Solarflare Communications Inc. + */ + +#include + +ssize_t +rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp, + uint32_t obj_num, uint32_t pg_shift, + size_t *min_chunk_size, size_t *align) +{ + size_t total_elt_sz; + size_t obj_per_page, pg_num, pg_sz; + size_t mem_size; + + total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size; + if (total_elt_sz == 0) { + mem_size = 0; + } else if (pg_shift == 0) { + mem_size = total_elt_sz * obj_num; + } else { + pg_sz = (size_t)1 << pg_shift; + obj_per_page = pg_sz / total_elt_sz; + if (obj_per_page == 0) { + /* + * Note that if object size is bigger than page size, + * then it is assumed that pages are grouped in subsets + * of physically continuous pages big enough to store + * at least one object. 
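+			 * For example (illustrative numbers, not from the
+			 * upstream source): with 2 MiB pages and a total
+			 * element size of 3 MiB, each object is charged
+			 * RTE_ALIGN_CEIL(3 MiB, 2 MiB) = 4 MiB, i.e. two
+			 * contiguous pages, so mem_size below becomes
+			 * 4 MiB * obj_num.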
+ */ + mem_size = + RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num; + } else { + pg_num = (obj_num + obj_per_page - 1) / obj_per_page; + mem_size = pg_num << pg_shift; + } + } + + *min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz); + + *align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift); + + return mem_size; +} + +int +rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs, + void *vaddr, rte_iova_t iova, size_t len, + rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg) +{ + size_t total_elt_sz; + size_t off; + unsigned int i; + void *obj; + + total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size; + + for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) { + off += mp->header_size; + obj = (char *)vaddr + off; + obj_cb(mp, obj_cb_arg, obj, + (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off)); + rte_mempool_ops_enqueue_bulk(mp, &obj, 1); + off += mp->elt_size + mp->trailer_size; + } + + return i; +} diff --git a/src/spdk/dpdk/lib/librte_mempool/rte_mempool_version.map b/src/spdk/dpdk/lib/librte_mempool/rte_mempool_version.map new file mode 100644 index 00000000..17cbca46 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_mempool/rte_mempool_version.map @@ -0,0 +1,60 @@ +DPDK_2.0 { + global: + + rte_mempool_audit; + rte_mempool_calc_obj_size; + rte_mempool_create; + rte_mempool_dump; + rte_mempool_list_dump; + rte_mempool_lookup; + rte_mempool_walk; + + local: *; +}; + +DPDK_16.07 { + global: + + rte_mempool_avail_count; + rte_mempool_cache_create; + rte_mempool_cache_flush; + rte_mempool_cache_free; + rte_mempool_check_cookies; + rte_mempool_create_empty; + rte_mempool_default_cache; + rte_mempool_free; + rte_mempool_generic_get; + rte_mempool_generic_put; + rte_mempool_in_use_count; + rte_mempool_mem_iter; + rte_mempool_obj_iter; + rte_mempool_ops_table; + rte_mempool_populate_anon; + rte_mempool_populate_default; + rte_mempool_populate_virt; + rte_mempool_register_ops; + rte_mempool_set_ops_byname; + +} DPDK_2.0; + +DPDK_17.11 { + global: + + rte_mempool_populate_iova; + +} DPDK_16.07; + +DPDK_18.05 { + global: + + rte_mempool_contig_blocks_check_cookies; + rte_mempool_op_calc_mem_size_default; + rte_mempool_op_populate_default; + +} DPDK_17.11; + +EXPERIMENTAL { + global: + + rte_mempool_ops_get_info; +}; diff --git a/src/spdk/dpdk/lib/librte_meter/Makefile b/src/spdk/dpdk/lib/librte_meter/Makefile new file mode 100644 index 00000000..2dc071e8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_meter/Makefile @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_meter.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) + +LDLIBS += -lm +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_meter_version.map + +LIBABIVER := 2 + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_METER) := rte_meter.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_METER)-include := rte_meter.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_meter/meson.build b/src/spdk/dpdk/lib/librte_meter/meson.build new file mode 100644 index 00000000..947bc19e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_meter/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 2 +sources = files('rte_meter.c') +headers = files('rte_meter.h') diff --git a/src/spdk/dpdk/lib/librte_meter/rte_meter.c b/src/spdk/dpdk/lib/librte_meter/rte_meter.c new file 
mode 100644 index 00000000..473f69ab --- /dev/null +++ b/src/spdk/dpdk/lib/librte_meter/rte_meter.c @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include + +#include +#include +#include + +#include "rte_meter.h" + +#ifndef RTE_METER_TB_PERIOD_MIN +#define RTE_METER_TB_PERIOD_MIN 100 +#endif + +static void +rte_meter_get_tb_params(uint64_t hz, uint64_t rate, uint64_t *tb_period, uint64_t *tb_bytes_per_period) +{ + double period = ((double) hz) / ((double) rate); + + if (period >= RTE_METER_TB_PERIOD_MIN) { + *tb_bytes_per_period = 1; + *tb_period = (uint64_t) period; + } else { + *tb_bytes_per_period = (uint64_t) ceil(RTE_METER_TB_PERIOD_MIN / period); + *tb_period = (hz * (*tb_bytes_per_period)) / rate; + } +} + +int +rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, + struct rte_meter_srtcm_params *params) +{ + uint64_t hz = rte_get_tsc_hz(); + + /* Check input parameters */ + if ((p == NULL) || + (params == NULL) || + (params->cir == 0) || + ((params->cbs == 0) && (params->ebs == 0))) + return -EINVAL; + + /* Initialize srTCM run-time structure */ + p->cbs = params->cbs; + p->ebs = params->ebs; + rte_meter_get_tb_params(hz, params->cir, &p->cir_period, + &p->cir_bytes_per_period); + + return 0; +} + +int +rte_meter_srtcm_config(struct rte_meter_srtcm *m, + struct rte_meter_srtcm_profile *p) +{ + /* Check input parameters */ + if ((m == NULL) || (p == NULL)) + return -EINVAL; + + /* Initialize srTCM run-time structure */ + m->time = rte_get_tsc_cycles(); + m->tc = p->cbs; + m->te = p->ebs; + + return 0; +} + +int +rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, + struct rte_meter_trtcm_params *params) +{ + uint64_t hz = rte_get_tsc_hz(); + + /* Check input parameters */ + if ((p == NULL) || + (params == NULL) || + (params->cir == 0) || + (params->pir == 0) || + (params->pir < params->cir) || + (params->cbs == 0) || + (params->pbs == 0)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + p->cbs = params->cbs; + p->pbs = params->pbs; + rte_meter_get_tb_params(hz, params->cir, &p->cir_period, + &p->cir_bytes_per_period); + rte_meter_get_tb_params(hz, params->pir, &p->pir_period, + &p->pir_bytes_per_period); + + return 0; +} + +int +rte_meter_trtcm_config(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p) +{ + /* Check input parameters */ + if ((m == NULL) || (p == NULL)) + return -EINVAL; + + /* Initialize trTCM run-time structure */ + m->time_tc = m->time_tp = rte_get_tsc_cycles(); + m->tc = p->cbs; + m->tp = p->pbs; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_meter/rte_meter.h b/src/spdk/dpdk/lib/librte_meter/rte_meter.h new file mode 100644 index 00000000..58a05158 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_meter/rte_meter.h @@ -0,0 +1,441 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_METER_H__ +#define __INCLUDE_RTE_METER_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Traffic Metering + * + * Traffic metering algorithms: + * 1. Single Rate Three Color Marker (srTCM): defined by IETF RFC 2697 + * 2. 
Two Rate Three Color Marker (trTCM): defined by IETF RFC 2698 + * + ***/ + +#include + +/* + * Application Programmer's Interface (API) + * + ***/ + +/** Packet Color Set */ +enum rte_meter_color { + e_RTE_METER_GREEN = 0, /**< Green */ + e_RTE_METER_YELLOW, /**< Yellow */ + e_RTE_METER_RED, /**< Red */ + e_RTE_METER_COLORS /**< Number of available colors */ +}; + +/** srTCM parameters per metered traffic flow. The CIR, CBS and EBS parameters only +count bytes of IP packets and do not include link specific headers. At least one of +the CBS or EBS parameters has to be greater than zero. */ +struct rte_meter_srtcm_params { + uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in bytes. */ + uint64_t ebs; /**< Excess Burst Size (EBS). Measured in bytes. */ +}; + +/** trTCM parameters per metered traffic flow. The CIR, PIR, CBS and PBS parameters +only count bytes of IP packets and do not include link specific headers. PIR has to +be greater than or equal to CIR. Both CBS or EBS have to be greater than zero. */ +struct rte_meter_trtcm_params { + uint64_t cir; /**< Committed Information Rate (CIR). Measured in bytes per second. */ + uint64_t pir; /**< Peak Information Rate (PIR). Measured in bytes per second. */ + uint64_t cbs; /**< Committed Burst Size (CBS). Measured in byes. */ + uint64_t pbs; /**< Peak Burst Size (PBS). Measured in bytes. */ +}; + +/** + * Internal data structure storing the srTCM configuration profile. Typically + * shared by multiple srTCM objects. + */ +struct rte_meter_srtcm_profile; + +/** + * Internal data structure storing the trTCM configuration profile. Typically + * shared by multiple trTCM objects. + */ +struct rte_meter_trtcm_profile; + +/** Internal data structure storing the srTCM run-time context per metered traffic flow. */ +struct rte_meter_srtcm; + +/** Internal data structure storing the trTCM run-time context per metered traffic flow. */ +struct rte_meter_trtcm; + +/** + * srTCM profile configuration + * + * @param p + * Pointer to pre-allocated srTCM profile data structure + * @param params + * srTCM profile parameters + * @return + * 0 upon success, error code otherwise + */ +int +rte_meter_srtcm_profile_config(struct rte_meter_srtcm_profile *p, + struct rte_meter_srtcm_params *params); + +/** + * trTCM profile configuration + * + * @param p + * Pointer to pre-allocated trTCM profile data structure + * @param params + * trTCM profile parameters + * @return + * 0 upon success, error code otherwise + */ +int +rte_meter_trtcm_profile_config(struct rte_meter_trtcm_profile *p, + struct rte_meter_trtcm_params *params); + +/** + * srTCM configuration per metered traffic flow + * + * @param m + * Pointer to pre-allocated srTCM data structure + * @param p + * srTCM profile. Needs to be valid. + * @return + * 0 upon success, error code otherwise + */ +int +rte_meter_srtcm_config(struct rte_meter_srtcm *m, + struct rte_meter_srtcm_profile *p); + +/** + * trTCM configuration per metered traffic flow + * + * @param m + * Pointer to pre-allocated trTCM data structure + * @param p + * trTCM profile. Needs to be valid. 
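+ *
+ * An illustrative sketch (not part of the upstream header): one shared
+ * profile feeding one per-flow context, followed by a color-blind check
+ * in the datapath; the rate and burst values are arbitrary:
+ *
+ * @code{.c}
+ *     struct rte_meter_trtcm_profile p;
+ *     struct rte_meter_trtcm m;
+ *     struct rte_meter_trtcm_params params = {
+ *         .cir = 1250000, .pir = 2500000, .cbs = 2048, .pbs = 4096,
+ *     };
+ *     enum rte_meter_color color;
+ *
+ *     if (rte_meter_trtcm_profile_config(&p, &params) == 0 &&
+ *         rte_meter_trtcm_config(&m, &p) == 0)
+ *         color = rte_meter_trtcm_color_blind_check(&m, &p,
+ *             rte_rdtsc(), 64);
+ * @endcode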
+ * @return + * 0 upon success, error code otherwise + */ +int +rte_meter_trtcm_config(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p); + +/** + * srTCM color blind traffic metering + * + * @param m + * Handle to srTCM instance + * @param p + * srTCM profile specified at srTCM object creation time + * @param time + * Current CPU time stamp (measured in CPU cycles) + * @param pkt_len + * Length of the current IP packet (measured in bytes) + * @return + * Color assigned to the current IP packet + */ +static inline enum rte_meter_color +rte_meter_srtcm_color_blind_check(struct rte_meter_srtcm *m, + struct rte_meter_srtcm_profile *p, + uint64_t time, + uint32_t pkt_len); + +/** + * srTCM color aware traffic metering + * + * @param m + * Handle to srTCM instance + * @param p + * srTCM profile specified at srTCM object creation time + * @param time + * Current CPU time stamp (measured in CPU cycles) + * @param pkt_len + * Length of the current IP packet (measured in bytes) + * @param pkt_color + * Input color of the current IP packet + * @return + * Color assigned to the current IP packet + */ +static inline enum rte_meter_color +rte_meter_srtcm_color_aware_check(struct rte_meter_srtcm *m, + struct rte_meter_srtcm_profile *p, + uint64_t time, + uint32_t pkt_len, + enum rte_meter_color pkt_color); + +/** + * trTCM color blind traffic metering + * + * @param m + * Handle to trTCM instance + * @param p + * trTCM profile specified at trTCM object creation time + * @param time + * Current CPU time stamp (measured in CPU cycles) + * @param pkt_len + * Length of the current IP packet (measured in bytes) + * @return + * Color assigned to the current IP packet + */ +static inline enum rte_meter_color +rte_meter_trtcm_color_blind_check(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p, + uint64_t time, + uint32_t pkt_len); + +/** + * trTCM color aware traffic metering + * + * @param m + * Handle to trTCM instance + * @param p + * trTCM profile specified at trTCM object creation time + * @param time + * Current CPU time stamp (measured in CPU cycles) + * @param pkt_len + * Length of the current IP packet (measured in bytes) + * @param pkt_color + * Input color of the current IP packet + * @return + * Color assigned to the current IP packet + */ +static inline enum rte_meter_color +rte_meter_trtcm_color_aware_check(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p, + uint64_t time, + uint32_t pkt_len, + enum rte_meter_color pkt_color); + +/* + * Inline implementation of run-time methods + * + ***/ + +struct rte_meter_srtcm_profile { + uint64_t cbs; + /**< Upper limit for C token bucket */ + uint64_t ebs; + /**< Upper limit for E token bucket */ + uint64_t cir_period; + /**< Number of CPU cycles for each update of C and E token buckets */ + uint64_t cir_bytes_per_period; + /**< Number of bytes to add to C and E token buckets on each update */ +}; + +/* Internal data structure storing the srTCM run-time context per metered traffic flow. 
*/ +struct rte_meter_srtcm { + uint64_t time; /* Time of latest update of C and E token buckets */ + uint64_t tc; /* Number of bytes currently available in the committed (C) token bucket */ + uint64_t te; /* Number of bytes currently available in the excess (E) token bucket */ +}; + +struct rte_meter_trtcm_profile { + uint64_t cbs; + /**< Upper limit for C token bucket */ + uint64_t pbs; + /**< Upper limit for P token bucket */ + uint64_t cir_period; + /**< Number of CPU cycles for one update of C token bucket */ + uint64_t cir_bytes_per_period; + /**< Number of bytes to add to C token bucket on each update */ + uint64_t pir_period; + /**< Number of CPU cycles for one update of P token bucket */ + uint64_t pir_bytes_per_period; + /**< Number of bytes to add to P token bucket on each update */ +}; + +/** + * Internal data structure storing the trTCM run-time context per metered + * traffic flow. + */ +struct rte_meter_trtcm { + uint64_t time_tc; + /**< Time of latest update of C token bucket */ + uint64_t time_tp; + /**< Time of latest update of E token bucket */ + uint64_t tc; + /**< Number of bytes currently available in committed(C) token bucket */ + uint64_t tp; + /**< Number of bytes currently available in the peak(P) token bucket */ +}; + +static inline enum rte_meter_color +rte_meter_srtcm_color_blind_check(struct rte_meter_srtcm *m, + struct rte_meter_srtcm_profile *p, + uint64_t time, + uint32_t pkt_len) +{ + uint64_t time_diff, n_periods, tc, te; + + /* Bucket update */ + time_diff = time - m->time; + n_periods = time_diff / p->cir_period; + m->time += n_periods * p->cir_period; + + /* Put the tokens overflowing from tc into te bucket */ + tc = m->tc + n_periods * p->cir_bytes_per_period; + te = m->te; + if (tc > p->cbs) { + te += (tc - p->cbs); + if (te > p->ebs) + te = p->ebs; + tc = p->cbs; + } + + /* Color logic */ + if (tc >= pkt_len) { + m->tc = tc - pkt_len; + m->te = te; + return e_RTE_METER_GREEN; + } + + if (te >= pkt_len) { + m->tc = tc; + m->te = te - pkt_len; + return e_RTE_METER_YELLOW; + } + + m->tc = tc; + m->te = te; + return e_RTE_METER_RED; +} + +static inline enum rte_meter_color +rte_meter_srtcm_color_aware_check(struct rte_meter_srtcm *m, + struct rte_meter_srtcm_profile *p, + uint64_t time, + uint32_t pkt_len, + enum rte_meter_color pkt_color) +{ + uint64_t time_diff, n_periods, tc, te; + + /* Bucket update */ + time_diff = time - m->time; + n_periods = time_diff / p->cir_period; + m->time += n_periods * p->cir_period; + + /* Put the tokens overflowing from tc into te bucket */ + tc = m->tc + n_periods * p->cir_bytes_per_period; + te = m->te; + if (tc > p->cbs) { + te += (tc - p->cbs); + if (te > p->ebs) + te = p->ebs; + tc = p->cbs; + } + + /* Color logic */ + if ((pkt_color == e_RTE_METER_GREEN) && (tc >= pkt_len)) { + m->tc = tc - pkt_len; + m->te = te; + return e_RTE_METER_GREEN; + } + + if ((pkt_color != e_RTE_METER_RED) && (te >= pkt_len)) { + m->tc = tc; + m->te = te - pkt_len; + return e_RTE_METER_YELLOW; + } + + m->tc = tc; + m->te = te; + return e_RTE_METER_RED; +} + +static inline enum rte_meter_color +rte_meter_trtcm_color_blind_check(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p, + uint64_t time, + uint32_t pkt_len) +{ + uint64_t time_diff_tc, time_diff_tp, n_periods_tc, n_periods_tp, tc, tp; + + /* Bucket update */ + time_diff_tc = time - m->time_tc; + time_diff_tp = time - m->time_tp; + n_periods_tc = time_diff_tc / p->cir_period; + n_periods_tp = time_diff_tp / p->pir_period; + m->time_tc += n_periods_tc * p->cir_period; + 
m->time_tp += n_periods_tp * p->pir_period; + + tc = m->tc + n_periods_tc * p->cir_bytes_per_period; + if (tc > p->cbs) + tc = p->cbs; + + tp = m->tp + n_periods_tp * p->pir_bytes_per_period; + if (tp > p->pbs) + tp = p->pbs; + + /* Color logic */ + if (tp < pkt_len) { + m->tc = tc; + m->tp = tp; + return e_RTE_METER_RED; + } + + if (tc < pkt_len) { + m->tc = tc; + m->tp = tp - pkt_len; + return e_RTE_METER_YELLOW; + } + + m->tc = tc - pkt_len; + m->tp = tp - pkt_len; + return e_RTE_METER_GREEN; +} + +static inline enum rte_meter_color +rte_meter_trtcm_color_aware_check(struct rte_meter_trtcm *m, + struct rte_meter_trtcm_profile *p, + uint64_t time, + uint32_t pkt_len, + enum rte_meter_color pkt_color) +{ + uint64_t time_diff_tc, time_diff_tp, n_periods_tc, n_periods_tp, tc, tp; + + /* Bucket update */ + time_diff_tc = time - m->time_tc; + time_diff_tp = time - m->time_tp; + n_periods_tc = time_diff_tc / p->cir_period; + n_periods_tp = time_diff_tp / p->pir_period; + m->time_tc += n_periods_tc * p->cir_period; + m->time_tp += n_periods_tp * p->pir_period; + + tc = m->tc + n_periods_tc * p->cir_bytes_per_period; + if (tc > p->cbs) + tc = p->cbs; + + tp = m->tp + n_periods_tp * p->pir_bytes_per_period; + if (tp > p->pbs) + tp = p->pbs; + + /* Color logic */ + if ((pkt_color == e_RTE_METER_RED) || (tp < pkt_len)) { + m->tc = tc; + m->tp = tp; + return e_RTE_METER_RED; + } + + if ((pkt_color == e_RTE_METER_YELLOW) || (tc < pkt_len)) { + m->tc = tc; + m->tp = tp - pkt_len; + return e_RTE_METER_YELLOW; + } + + m->tc = tc - pkt_len; + m->tp = tp - pkt_len; + return e_RTE_METER_GREEN; +} + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_METER_H__ */ diff --git a/src/spdk/dpdk/lib/librte_meter/rte_meter_version.map b/src/spdk/dpdk/lib/librte_meter/rte_meter_version.map new file mode 100644 index 00000000..cb79f0c2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_meter/rte_meter_version.map @@ -0,0 +1,19 @@ +DPDK_2.0 { + global: + + rte_meter_srtcm_color_aware_check; + rte_meter_srtcm_color_blind_check; + rte_meter_srtcm_config; + rte_meter_trtcm_color_aware_check; + rte_meter_trtcm_color_blind_check; + rte_meter_trtcm_config; + + local: *; +}; + +DPDK_18.08 { + global: + + rte_meter_srtcm_profile_config; + rte_meter_trtcm_profile_config; +}; diff --git a/src/spdk/dpdk/lib/librte_metrics/Makefile b/src/spdk/dpdk/lib/librte_metrics/Makefile new file mode 100644 index 00000000..20e242f7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_metrics/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_metrics.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_metrics_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_METRICS) := rte_metrics.c + +# Install header file +SYMLINK-$(CONFIG_RTE_LIBRTE_METRICS)-include += rte_metrics.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_metrics/meson.build b/src/spdk/dpdk/lib/librte_metrics/meson.build new file mode 100644 index 00000000..e26d1f46 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_metrics/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_metrics.c') +headers = files('rte_metrics.h') diff --git a/src/spdk/dpdk/lib/librte_metrics/rte_metrics.c b/src/spdk/dpdk/lib/librte_metrics/rte_metrics.c new file mode 100644 index 00000000..99a96b65 --- /dev/null +++ 
b/src/spdk/dpdk/lib/librte_metrics/rte_metrics.c @@ -0,0 +1,280 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define RTE_METRICS_MAX_METRICS 256 +#define RTE_METRICS_MEMZONE_NAME "RTE_METRICS" + +/** + * Internal stats metadata and value entry. + * + * @internal + */ +struct rte_metrics_meta_s { + /** Name of metric */ + char name[RTE_METRICS_MAX_NAME_LEN]; + /** Current value for metric */ + uint64_t value[RTE_MAX_ETHPORTS]; + /** Used for global metrics */ + uint64_t global_value; + /** Index of next root element (zero for none) */ + uint16_t idx_next_set; + /** Index of next metric in set (zero for none) */ + uint16_t idx_next_stat; +}; + +/** + * Internal stats info structure. + * + * @internal + * Offsets into metadata are used instead of pointers because ASLR + * means that having the same physical addresses in different + * processes is not guaranteed. + */ +struct rte_metrics_data_s { + /** Index of last metadata entry with valid data. + * This value is not valid if cnt_stats is zero. + */ + uint16_t idx_last_set; + /** Number of metrics. */ + uint16_t cnt_stats; + /** Metric data memory block. */ + struct rte_metrics_meta_s metadata[RTE_METRICS_MAX_METRICS]; + /** Metric data access lock */ + rte_spinlock_t lock; +}; + +void +rte_metrics_init(int socket_id) +{ + struct rte_metrics_data_s *stats; + const struct rte_memzone *memzone; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return; + + memzone = rte_memzone_lookup(RTE_METRICS_MEMZONE_NAME); + if (memzone != NULL) + return; + memzone = rte_memzone_reserve(RTE_METRICS_MEMZONE_NAME, + sizeof(struct rte_metrics_data_s), socket_id, 0); + if (memzone == NULL) + rte_exit(EXIT_FAILURE, "Unable to allocate stats memzone\n"); + stats = memzone->addr; + memset(stats, 0, sizeof(struct rte_metrics_data_s)); + rte_spinlock_init(&stats->lock); +} + +int +rte_metrics_reg_name(const char *name) +{ + const char * const list_names[] = {name}; + + return rte_metrics_reg_names(list_names, 1); +} + +int +rte_metrics_reg_names(const char * const *names, uint16_t cnt_names) +{ + struct rte_metrics_meta_s *entry = NULL; + struct rte_metrics_data_s *stats; + const struct rte_memzone *memzone; + uint16_t idx_name; + uint16_t idx_base; + + /* Some sanity checks */ + if (cnt_names < 1 || names == NULL) + return -EINVAL; + for (idx_name = 0; idx_name < cnt_names; idx_name++) + if (names[idx_name] == NULL) + return -EINVAL; + + memzone = rte_memzone_lookup(RTE_METRICS_MEMZONE_NAME); + if (memzone == NULL) + return -EIO; + stats = memzone->addr; + + if (stats->cnt_stats + cnt_names >= RTE_METRICS_MAX_METRICS) + return -ENOMEM; + + rte_spinlock_lock(&stats->lock); + + /* Overwritten later if this is actually first set.. 
*/ + stats->metadata[stats->idx_last_set].idx_next_set = stats->cnt_stats; + + stats->idx_last_set = idx_base = stats->cnt_stats; + + for (idx_name = 0; idx_name < cnt_names; idx_name++) { + entry = &stats->metadata[idx_name + stats->cnt_stats]; + strlcpy(entry->name, names[idx_name], RTE_METRICS_MAX_NAME_LEN); + memset(entry->value, 0, sizeof(entry->value)); + entry->idx_next_stat = idx_name + stats->cnt_stats + 1; + } + entry->idx_next_stat = 0; + entry->idx_next_set = 0; + stats->cnt_stats += cnt_names; + + rte_spinlock_unlock(&stats->lock); + + return idx_base; +} + +int +rte_metrics_update_value(int port_id, uint16_t key, const uint64_t value) +{ + return rte_metrics_update_values(port_id, key, &value, 1); +} + +int +rte_metrics_update_values(int port_id, + uint16_t key, + const uint64_t *values, + uint32_t count) +{ + struct rte_metrics_meta_s *entry; + struct rte_metrics_data_s *stats; + const struct rte_memzone *memzone; + uint16_t idx_metric; + uint16_t idx_value; + uint16_t cnt_setsize; + + if (port_id != RTE_METRICS_GLOBAL && + (port_id < 0 || port_id >= RTE_MAX_ETHPORTS)) + return -EINVAL; + + if (values == NULL) + return -EINVAL; + + memzone = rte_memzone_lookup(RTE_METRICS_MEMZONE_NAME); + if (memzone == NULL) + return -EIO; + stats = memzone->addr; + + rte_spinlock_lock(&stats->lock); + + if (key >= stats->cnt_stats) { + rte_spinlock_unlock(&stats->lock); + return -EINVAL; + } + idx_metric = key; + cnt_setsize = 1; + while (idx_metric < stats->cnt_stats) { + entry = &stats->metadata[idx_metric]; + if (entry->idx_next_stat == 0) + break; + cnt_setsize++; + idx_metric++; + } + /* Check update does not cross set border */ + if (count > cnt_setsize) { + rte_spinlock_unlock(&stats->lock); + return -ERANGE; + } + + if (port_id == RTE_METRICS_GLOBAL) + for (idx_value = 0; idx_value < count; idx_value++) { + idx_metric = key + idx_value; + stats->metadata[idx_metric].global_value = + values[idx_value]; + } + else + for (idx_value = 0; idx_value < count; idx_value++) { + idx_metric = key + idx_value; + stats->metadata[idx_metric].value[port_id] = + values[idx_value]; + } + rte_spinlock_unlock(&stats->lock); + return 0; +} + +int +rte_metrics_get_names(struct rte_metric_name *names, + uint16_t capacity) +{ + struct rte_metrics_data_s *stats; + const struct rte_memzone *memzone; + uint16_t idx_name; + int return_value; + + memzone = rte_memzone_lookup(RTE_METRICS_MEMZONE_NAME); + if (memzone == NULL) + return -EIO; + + stats = memzone->addr; + rte_spinlock_lock(&stats->lock); + if (names != NULL) { + if (capacity < stats->cnt_stats) { + return_value = stats->cnt_stats; + rte_spinlock_unlock(&stats->lock); + return return_value; + } + for (idx_name = 0; idx_name < stats->cnt_stats; idx_name++) + strlcpy(names[idx_name].name, + stats->metadata[idx_name].name, + RTE_METRICS_MAX_NAME_LEN); + } + return_value = stats->cnt_stats; + rte_spinlock_unlock(&stats->lock); + return return_value; +} + +int +rte_metrics_get_values(int port_id, + struct rte_metric_value *values, + uint16_t capacity) +{ + struct rte_metrics_meta_s *entry; + struct rte_metrics_data_s *stats; + const struct rte_memzone *memzone; + uint16_t idx_name; + int return_value; + + if (port_id != RTE_METRICS_GLOBAL && + (port_id < 0 || port_id >= RTE_MAX_ETHPORTS)) + return -EINVAL; + + memzone = rte_memzone_lookup(RTE_METRICS_MEMZONE_NAME); + if (memzone == NULL) + return -EIO; + + stats = memzone->addr; + rte_spinlock_lock(&stats->lock); + + if (values != NULL) { + if (capacity < stats->cnt_stats) { + return_value = 
stats->cnt_stats; + rte_spinlock_unlock(&stats->lock); + return return_value; + } + if (port_id == RTE_METRICS_GLOBAL) + for (idx_name = 0; + idx_name < stats->cnt_stats; + idx_name++) { + entry = &stats->metadata[idx_name]; + values[idx_name].key = idx_name; + values[idx_name].value = entry->global_value; + } + else + for (idx_name = 0; + idx_name < stats->cnt_stats; + idx_name++) { + entry = &stats->metadata[idx_name]; + values[idx_name].key = idx_name; + values[idx_name].value = entry->value[port_id]; + } + } + return_value = stats->cnt_stats; + rte_spinlock_unlock(&stats->lock); + return return_value; +} diff --git a/src/spdk/dpdk/lib/librte_metrics/rte_metrics.h b/src/spdk/dpdk/lib/librte_metrics/rte_metrics.h new file mode 100644 index 00000000..67a60fad --- /dev/null +++ b/src/spdk/dpdk/lib/librte_metrics/rte_metrics.h @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +/** + * @file + * + * DPDK Metrics module + * + * Metrics are statistics that are not generated by PMDs, and hence + * are better reported through a mechanism that is independent from + * the ethdev-based extended statistics. Providers will typically + * be other libraries and consumers will typically be applications. + * + * Metric information is populated using a push model, where producers + * update the values contained within the metric library by calling + * an update function on the relevant metrics. Consumers receive + * metric information by querying the central metric data, which is + * held in shared memory. Currently only bulk querying of metrics + * by consumers is supported. + */ + +#ifndef _RTE_METRICS_H_ +#define _RTE_METRICS_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** Maximum length of metric name (including null-terminator) */ +#define RTE_METRICS_MAX_NAME_LEN 64 + +/** + * Global metric special id. + * + * When used for the port_id parameter when calling + * rte_metrics_update_metric() or rte_metrics_update_metric(), + * the global metric, which are not associated with any specific + * port (i.e. device), are updated. + */ +#define RTE_METRICS_GLOBAL -1 + + +/** + * A name-key lookup for metrics. + * + * An array of this structure is returned by rte_metrics_get_names(). + * The struct rte_metric_value references these names via their array index. + */ +struct rte_metric_name { + /** String describing metric */ + char name[RTE_METRICS_MAX_NAME_LEN]; +}; + + +/** + * Metric value structure. + * + * This structure is used by rte_metrics_get_values() to return metrics, + * which are statistics that are not generated by PMDs. It maps a name key, + * which corresponds to an index in the array returned by + * rte_metrics_get_names(). + */ +struct rte_metric_value { + /** Numeric identifier of metric. */ + uint16_t key; + /** Value for metric */ + uint64_t value; +}; + + +/** + * Initializes metric module. This function must be called from + * a primary process before metrics are used. + * + * @param socket_id + * Socket to use for shared memory allocation. + */ +void rte_metrics_init(int socket_id); + +/** + * Register a metric, making it available as a reporting parameter. + * + * Registering a metric is the way producers declare a parameter + * that they wish to be reported. Once registered, the associated + * numeric key can be obtained via rte_metrics_get_names(), which + * is required for updating said metric's value. + * + * @param name + * Metric name. 
If this exceeds RTE_METRICS_MAX_NAME_LEN (including + * the NULL terminator), it is truncated. + * + * @return + * - Zero or positive: Success (index key of new metric) + * - -EIO: Error, unable to access metrics shared memory + * (rte_metrics_init() not called) + * - -EINVAL: Error, invalid parameters + * - -ENOMEM: Error, maximum metrics reached + */ +int rte_metrics_reg_name(const char *name); + +/** + * Register a set of metrics. + * + * This is a bulk version of rte_metrics_reg_names() and aside from + * handling multiple keys at once is functionally identical. + * + * @param names + * List of metric names + * + * @param cnt_names + * Number of metrics in set + * + * @return + * - Zero or positive: Success (index key of start of set) + * - -EIO: Error, unable to access metrics shared memory + * (rte_metrics_init() not called) + * - -EINVAL: Error, invalid parameters + * - -ENOMEM: Error, maximum metrics reached + */ +int rte_metrics_reg_names(const char * const *names, uint16_t cnt_names); + +/** + * Get metric name-key lookup table. + * + * @param names + * A struct rte_metric_name array of at least *capacity* in size to + * receive key names. If this is NULL, function returns the required + * number of elements for this array. + * + * @param capacity + * Size (number of elements) of struct rte_metric_name array. + * Disregarded if names is NULL. + * + * @return + * - Positive value above capacity: error, *names* is too small. + * Return value is required size. + * - Positive value equal or less than capacity: Success. Return + * value is number of elements filled in. + * - Negative value: error. + */ +int rte_metrics_get_names( + struct rte_metric_name *names, + uint16_t capacity); + +/** + * Get metric value table. + * + * @param port_id + * Port id to query + * + * @param values + * A struct rte_metric_value array of at least *capacity* in size to + * receive metric ids and values. If this is NULL, function returns + * the required number of elements for this array. + * + * @param capacity + * Size (number of elements) of struct rte_metric_value array. + * Disregarded if names is NULL. + * + * @return + * - Positive value above capacity: error, *values* is too small. + * Return value is required size. + * - Positive value equal or less than capacity: Success. Return + * value is number of elements filled in. + * - Negative value: error. + */ +int rte_metrics_get_values( + int port_id, + struct rte_metric_value *values, + uint16_t capacity); + +/** + * Updates a metric + * + * @param port_id + * Port to update metrics for + * @param key + * Id of metric to update + * @param value + * New value + * + * @return + * - -EIO if unable to access shared metrics memory + * - Zero on success + */ +int rte_metrics_update_value( + int port_id, + uint16_t key, + const uint64_t value); + +/** + * Updates a metric set. Note that it is an error to try to + * update across a set boundary. 
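A minimal producer/consumer sketch for the API implemented in rte_metrics.c above; it assumes an EAL-initialized application (only the primary process may call rte_metrics_init()), and everything it calls is declared in this header.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <rte_lcore.h>
#include <rte_metrics.h>

static int pkt_stats_base;	/* key of the first metric in the set */

/* Producer: register a two-entry set once, then push updates to it. */
static void
register_pkt_stats(void)
{
	static const char * const names[] = { "good_pkts", "bad_pkts" };

	rte_metrics_init(rte_socket_id());	/* primary process only */
	pkt_stats_base = rte_metrics_reg_names(names, 2);
}

static void
push_pkt_stats(int port_id, uint64_t good, uint64_t bad)
{
	const uint64_t vals[2] = { good, bad };

	/* count must not cross the registered set boundary (2 here) */
	rte_metrics_update_values(port_id, (uint16_t)pkt_stats_base,
				  vals, 2);
}

/* Consumer: size query with a NULL array, then bulk fetch. */
static void
dump_metrics(int port_id)
{
	int n = rte_metrics_get_names(NULL, 0);

	if (n <= 0)
		return;

	struct rte_metric_name names[n];
	struct rte_metric_value values[n];

	rte_metrics_get_names(names, (uint16_t)n);
	n = rte_metrics_get_values(port_id, values, (uint16_t)n);
	for (int i = 0; i < n; i++)
		printf("%s = %" PRIu64 "\n",
		       names[values[i].key].name, values[i].value);
}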
+ * + * @param port_id + * Port to update metrics for + * @param key + * Base id of metrics set to update + * @param values + * Set of new values + * @param count + * Number of new values + * + * @return + * - -ERANGE if count exceeds metric set size + * - -EIO if unable to access shared metrics memory + * - Zero on success + */ +int rte_metrics_update_values( + int port_id, + uint16_t key, + const uint64_t *values, + uint32_t count); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_metrics/rte_metrics_version.map b/src/spdk/dpdk/lib/librte_metrics/rte_metrics_version.map new file mode 100644 index 00000000..4c5234cd --- /dev/null +++ b/src/spdk/dpdk/lib/librte_metrics/rte_metrics_version.map @@ -0,0 +1,13 @@ +DPDK_17.05 { + global: + + rte_metrics_get_names; + rte_metrics_get_values; + rte_metrics_init; + rte_metrics_reg_name; + rte_metrics_reg_names; + rte_metrics_update_value; + rte_metrics_update_values; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_net/Makefile b/src/spdk/dpdk/lib/librte_net/Makefile new file mode 100644 index 00000000..85e403f4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_net.a + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +LDLIBS += -lrte_mbuf -lrte_eal -lrte_mempool + +EXPORT_MAP := rte_net_version.map +LIBABIVER := 1 + +SRCS-$(CONFIG_RTE_LIBRTE_NET) := rte_net.c +SRCS-$(CONFIG_RTE_LIBRTE_NET) += rte_net_crc.c +SRCS-$(CONFIG_RTE_LIBRTE_NET) += rte_arp.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_esp.h +SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_sctp.h rte_icmp.h rte_arp.h +SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_ether.h rte_gre.h rte_net.h +SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include += rte_net_crc.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_net/meson.build b/src/spdk/dpdk/lib/librte_net/meson.build new file mode 100644 index 00000000..d3ea1feb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/meson.build @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 1 +allow_experimental_apis = true +headers = files('rte_ip.h', + 'rte_tcp.h', + 'rte_udp.h', + 'rte_esp.h', + 'rte_sctp.h', + 'rte_icmp.h', + 'rte_arp.h', + 'rte_ether.h', + 'rte_gre.h', + 'rte_net.h', + 'rte_net_crc.h') + +sources = files('rte_arp.c', 'rte_net.c', 'rte_net_crc.c') +deps += ['mbuf'] diff --git a/src/spdk/dpdk/lib/librte_net/net_crc_neon.h b/src/spdk/dpdk/lib/librte_net/net_crc_neon.h new file mode 100644 index 00000000..63fa1d4a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/net_crc_neon.h @@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef _NET_CRC_NEON_H_ +#define _NET_CRC_NEON_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** PMULL CRC computation context structure */ +struct crc_pmull_ctx { + uint64x2_t rk1_rk2; + uint64x2_t rk5_rk6; + uint64x2_t rk7_rk8; +}; + +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16); +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16); + +/** + * @brief Performs one folding round + * + * Logically function operates as follows: + * DATA = READ_NEXT_16BYTES(); + * F1 = LSB8(FOLD) + * F2 = MSB8(FOLD) + * T1 = CLMUL(F1, RK1) + * T2 = CLMUL(F2, RK2) + * FOLD = 
XOR(T1, T2, DATA) + * + * @param data_block 16 byte data block + * @param precomp precomputed rk1 constant + * @param fold running 16 byte folded data + * + * @return New 16 byte folded data + */ +static inline uint64x2_t +crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp, + uint64x2_t fold) +{ + uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64( + vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1), + vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0))); + + uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64( + vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0), + vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1))); + + return veorq_u64(tmp1, veorq_u64(data_block, tmp0)); +} + +/** + * Performs reduction from 128 bits to 64 bits + * + * @param data128 128 bits data to be reduced + * @param precomp rk5 and rk6 precomputed constants + * + * @return data reduced to 64 bits + */ +static inline uint64x2_t +crcr32_reduce_128_to_64(uint64x2_t data128, + uint64x2_t precomp) +{ + uint64x2_t tmp0, tmp1, tmp2; + + /* 64b fold */ + tmp0 = vreinterpretq_u64_p128(vmull_p64( + vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0), + vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0))); + tmp1 = vshift_bytes_right(data128, 8); + tmp0 = veorq_u64(tmp0, tmp1); + + /* 32b fold */ + tmp2 = vshift_bytes_left(tmp0, 4); + tmp1 = vreinterpretq_u64_p128(vmull_p64( + vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0), + vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1))); + + return veorq_u64(tmp1, tmp0); +} + +/** + * Performs Barret's reduction from 64 bits to 32 bits + * + * @param data64 64 bits data to be reduced + * @param precomp rk7 precomputed constant + * + * @return data reduced to 32 bits + */ +static inline uint32_t +crcr32_reduce_64_to_32(uint64x2_t data64, + uint64x2_t precomp) +{ + static uint32_t mask1[4] __rte_aligned(16) = { + 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 + }; + static uint32_t mask2[4] __rte_aligned(16) = { + 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff + }; + uint64x2_t tmp0, tmp1, tmp2; + + tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2)); + + tmp1 = vreinterpretq_u64_p128(vmull_p64( + vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0), + vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0))); + tmp1 = veorq_u64(tmp1, tmp0); + tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1)); + + tmp2 = vreinterpretq_u64_p128(vmull_p64( + vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0), + vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1))); + tmp2 = veorq_u64(tmp2, tmp1); + tmp2 = veorq_u64(tmp2, tmp0); + + return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2); +} + +static inline uint32_t +crc32_eth_calc_pmull( + const uint8_t *data, + uint32_t data_len, + uint32_t crc, + const struct crc_pmull_ctx *params) +{ + uint64x2_t temp, fold, k; + uint32_t n; + + /* Get CRC init value */ + temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0)); + + /** + * Folding all data into single 16 byte data block + * Assumes: fold holds first 16 bytes of data + */ + if (unlikely(data_len < 32)) { + if (unlikely(data_len == 16)) { + /* 16 bytes */ + fold = vld1q_u64((const uint64_t *)data); + fold = veorq_u64(fold, temp); + goto reduction_128_64; + } + + if (unlikely(data_len < 16)) { + /* 0 to 15 bytes */ + uint8_t buffer[16] __rte_aligned(16); + + memset(buffer, 0, sizeof(buffer)); + memcpy(buffer, data, data_len); + + fold = vld1q_u64((uint64_t *)buffer); + fold = veorq_u64(fold, temp); + if (unlikely(data_len < 4)) { + fold = vshift_bytes_left(fold, 8 - data_len); + goto barret_reduction; + } 
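The carry-less-multiply folding and Barrett reduction above are hard to verify by inspection. A plain bit-at-a-time implementation of the same Ethernet CRC-32 (reflected polynomial 0xEDB88320, all-ones initial value, final inversion, i.e. what the crc32_eth handler at the end of this header returns) makes a convenient cross-check; this reference is added for illustration only and is not part of the patch.

#include <stdint.h>
#include <stddef.h>

/* Bitwise reference CRC-32 (IEEE 802.3 / Ethernet FCS). */
static uint32_t
crc32_eth_reference(const uint8_t *data, size_t len)
{
	uint32_t crc = 0xFFFFFFFFu;

	for (size_t i = 0; i < len; i++) {
		crc ^= data[i];
		for (int bit = 0; bit < 8; bit++)
			/* shift out one bit; XOR in the reflected
			 * polynomial whenever that bit was set */
			crc = (crc >> 1) ^ ((crc & 1u) ? 0xEDB88320u : 0);
	}
	return ~crc;
}

Comparing its output with the PMULL path on a few random buffers is a quick way to validate the rk1..rk8 folding constants.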
+ fold = vshift_bytes_left(fold, 16 - data_len); + goto reduction_128_64; + } + /* 17 to 31 bytes */ + fold = vld1q_u64((const uint64_t *)data); + fold = veorq_u64(fold, temp); + n = 16; + k = params->rk1_rk2; + goto partial_bytes; + } + + /** At least 32 bytes in the buffer */ + /** Apply CRC initial value */ + fold = vld1q_u64((const uint64_t *)data); + fold = veorq_u64(fold, temp); + + /** Main folding loop - the last 16 bytes is processed separately */ + k = params->rk1_rk2; + for (n = 16; (n + 16) <= data_len; n += 16) { + temp = vld1q_u64((const uint64_t *)&data[n]); + fold = crcr32_folding_round(temp, k, fold); + } + +partial_bytes: + if (likely(n < data_len)) { + uint64x2_t last16, a, b, mask; + uint32_t rem = data_len & 15; + + last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]); + a = vshift_bytes_left(fold, 16 - rem); + b = vshift_bytes_right(fold, rem); + mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem); + b = vorrq_u64(b, vandq_u64(mask, last16)); + + /* k = rk1 & rk2 */ + temp = vreinterpretq_u64_p128(vmull_p64( + vgetq_lane_p64(vreinterpretq_p64_u64(a), 1), + vgetq_lane_p64(vreinterpretq_p64_u64(k), 0))); + fold = vreinterpretq_u64_p128(vmull_p64( + vgetq_lane_p64(vreinterpretq_p64_u64(a), 0), + vgetq_lane_p64(vreinterpretq_p64_u64(k), 1))); + fold = veorq_u64(fold, temp); + fold = veorq_u64(fold, b); + } + + /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */ +reduction_128_64: + k = params->rk5_rk6; + fold = crcr32_reduce_128_to_64(fold, k); + +barret_reduction: + k = params->rk7_rk8; + n = crcr32_reduce_64_to_32(fold, k); + + return n; +} + +static inline void +rte_net_crc_neon_init(void) +{ + /* Initialize CRC16 data */ + uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU}; + uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU}; + uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU}; + + /* Initialize CRC32 data */ + uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU}; + uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU}; + uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU}; + + /** Save the params in context structure */ + crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2); + crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6); + crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8); + + /** Save the params in context structure */ + crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2); + crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6); + crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8); +} + +static inline uint32_t +rte_crc16_ccitt_neon_handler(const uint8_t *data, + uint32_t data_len) +{ + return (uint16_t)~crc32_eth_calc_pmull(data, + data_len, + 0xffff, + &crc16_ccitt_pmull); +} + +static inline uint32_t +rte_crc32_eth_neon_handler(const uint8_t *data, + uint32_t data_len) +{ + return ~crc32_eth_calc_pmull(data, + data_len, + 0xffffffffUL, + &crc32_eth_pmull); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _NET_CRC_NEON_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/net_crc_sse.h b/src/spdk/dpdk/lib/librte_net/net_crc_sse.h new file mode 100644 index 00000000..da815243 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/net_crc_sse.h @@ -0,0 +1,334 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_NET_CRC_SSE_H_ +#define _RTE_NET_CRC_SSE_H_ + +#include + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** PCLMULQDQ CRC computation context structure */ +struct crc_pclmulqdq_ctx { + __m128i rk1_rk2; + __m128i rk5_rk6; + __m128i rk7_rk8; +}; + +struct crc_pclmulqdq_ctx 
crc32_eth_pclmulqdq __rte_aligned(16); +struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16); +/** + * @brief Performs one folding round + * + * Logically function operates as follows: + * DATA = READ_NEXT_16BYTES(); + * F1 = LSB8(FOLD) + * F2 = MSB8(FOLD) + * T1 = CLMUL(F1, RK1) + * T2 = CLMUL(F2, RK2) + * FOLD = XOR(T1, T2, DATA) + * + * @param data_block + * 16 byte data block + * @param precomp + * Precomputed rk1 constant + * @param fold + * Current16 byte folded data + * + * @return + * New 16 byte folded data + */ +static __rte_always_inline __m128i +crcr32_folding_round(__m128i data_block, + __m128i precomp, + __m128i fold) +{ + __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01); + __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10); + + return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0)); +} + +/** + * Performs reduction from 128 bits to 64 bits + * + * @param data128 + * 128 bits data to be reduced + * @param precomp + * precomputed constants rk5, rk6 + * + * @return + * 64 bits reduced data + */ + +static __rte_always_inline __m128i +crcr32_reduce_128_to_64(__m128i data128, __m128i precomp) +{ + __m128i tmp0, tmp1, tmp2; + + /* 64b fold */ + tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00); + tmp1 = _mm_srli_si128(data128, 8); + tmp0 = _mm_xor_si128(tmp0, tmp1); + + /* 32b fold */ + tmp2 = _mm_slli_si128(tmp0, 4); + tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10); + + return _mm_xor_si128(tmp1, tmp0); +} + +/** + * Performs Barret's reduction from 64 bits to 32 bits + * + * @param data64 + * 64 bits data to be reduced + * @param precomp + * rk7 precomputed constant + * + * @return + * reduced 32 bits data + */ + +static __rte_always_inline uint32_t +crcr32_reduce_64_to_32(__m128i data64, __m128i precomp) +{ + static const uint32_t mask1[4] __rte_aligned(16) = { + 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 + }; + + static const uint32_t mask2[4] __rte_aligned(16) = { + 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff + }; + __m128i tmp0, tmp1, tmp2; + + tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2)); + + tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00); + tmp1 = _mm_xor_si128(tmp1, tmp0); + tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1)); + + tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10); + tmp2 = _mm_xor_si128(tmp2, tmp1); + tmp2 = _mm_xor_si128(tmp2, tmp0); + + return _mm_extract_epi32(tmp2, 2); +} + +static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +/** + * Shifts left 128 bit register by specified number of bytes + * + * @param reg + * 128 bit value + * @param num + * number of bytes to shift left reg by (0-16) + * + * @return + * reg << (num * 8) + */ + +static __rte_always_inline __m128i +xmm_shift_left(__m128i reg, const unsigned int num) +{ + const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num); + + return _mm_shuffle_epi8(reg, _mm_loadu_si128(p)); +} + +static __rte_always_inline uint32_t +crc32_eth_calc_pclmulqdq( + const uint8_t *data, + uint32_t data_len, + uint32_t crc, + const struct crc_pclmulqdq_ctx *params) +{ + __m128i temp, fold, k; + uint32_t n; + + /* Get CRC init value */ + temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0); + + /** + * Folding all 
data into single 16 byte data block + * Assumes: fold holds first 16 bytes of data + */ + + if (unlikely(data_len < 32)) { + if (unlikely(data_len == 16)) { + /* 16 bytes */ + fold = _mm_loadu_si128((const __m128i *)data); + fold = _mm_xor_si128(fold, temp); + goto reduction_128_64; + } + + if (unlikely(data_len < 16)) { + /* 0 to 15 bytes */ + uint8_t buffer[16] __rte_aligned(16); + + memset(buffer, 0, sizeof(buffer)); + memcpy(buffer, data, data_len); + + fold = _mm_load_si128((const __m128i *)buffer); + fold = _mm_xor_si128(fold, temp); + if (unlikely(data_len < 4)) { + fold = xmm_shift_left(fold, 8 - data_len); + goto barret_reduction; + } + fold = xmm_shift_left(fold, 16 - data_len); + goto reduction_128_64; + } + /* 17 to 31 bytes */ + fold = _mm_loadu_si128((const __m128i *)data); + fold = _mm_xor_si128(fold, temp); + n = 16; + k = params->rk1_rk2; + goto partial_bytes; + } + + /** At least 32 bytes in the buffer */ + /** Apply CRC initial value */ + fold = _mm_loadu_si128((const __m128i *)data); + fold = _mm_xor_si128(fold, temp); + + /** Main folding loop - the last 16 bytes is processed separately */ + k = params->rk1_rk2; + for (n = 16; (n + 16) <= data_len; n += 16) { + temp = _mm_loadu_si128((const __m128i *)&data[n]); + fold = crcr32_folding_round(temp, k, fold); + } + +partial_bytes: + if (likely(n < data_len)) { + + const uint32_t mask3[4] __rte_aligned(16) = { + 0x80808080, 0x80808080, 0x80808080, 0x80808080 + }; + + const uint8_t shf_table[32] __rte_aligned(16) = { + 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; + + __m128i last16, a, b; + + last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]); + + temp = _mm_loadu_si128((const __m128i *) + &shf_table[data_len & 15]); + a = _mm_shuffle_epi8(fold, temp); + + temp = _mm_xor_si128(temp, + _mm_load_si128((const __m128i *)mask3)); + b = _mm_shuffle_epi8(fold, temp); + b = _mm_blendv_epi8(b, last16, temp); + + /* k = rk1 & rk2 */ + temp = _mm_clmulepi64_si128(a, k, 0x01); + fold = _mm_clmulepi64_si128(a, k, 0x10); + + fold = _mm_xor_si128(fold, temp); + fold = _mm_xor_si128(fold, b); + } + + /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */ +reduction_128_64: + k = params->rk5_rk6; + fold = crcr32_reduce_128_to_64(fold, k); + +barret_reduction: + k = params->rk7_rk8; + n = crcr32_reduce_64_to_32(fold, k); + + return n; +} + + +static inline void +rte_net_crc_sse42_init(void) +{ + uint64_t k1, k2, k5, k6; + uint64_t p = 0, q = 0; + + /** Initialize CRC16 data */ + k1 = 0x189aeLLU; + k2 = 0x8e10LLU; + k5 = 0x189aeLLU; + k6 = 0x114aaLLU; + q = 0x11c581910LLU; + p = 0x10811LLU; + + /** Save the params in context structure */ + crc16_ccitt_pclmulqdq.rk1_rk2 = + _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2)); + crc16_ccitt_pclmulqdq.rk5_rk6 = + _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6)); + crc16_ccitt_pclmulqdq.rk7_rk8 = + _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p)); + + /** Initialize CRC32 data */ + k1 = 0xccaa009eLLU; + k2 = 0x1751997d0LLU; + k5 = 0xccaa009eLLU; + k6 = 0x163cd6124LLU; + q = 0x1f7011640LLU; + p = 0x1db710641LLU; + + /** Save the params in context structure */ + crc32_eth_pclmulqdq.rk1_rk2 = + _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2)); + crc32_eth_pclmulqdq.rk5_rk6 = + _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6)); + crc32_eth_pclmulqdq.rk7_rk8 = + 
_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p)); + + /** + * Reset the register as following calculation may + * use other data types such as float, double, etc. + */ + _mm_empty(); + +} + +static inline uint32_t +rte_crc16_ccitt_sse42_handler(const uint8_t *data, + uint32_t data_len) +{ + /** return 16-bit CRC value */ + return (uint16_t)~crc32_eth_calc_pclmulqdq(data, + data_len, + 0xffff, + &crc16_ccitt_pclmulqdq); +} + +static inline uint32_t +rte_crc32_eth_sse42_handler(const uint8_t *data, + uint32_t data_len) +{ + return ~crc32_eth_calc_pclmulqdq(data, + data_len, + 0xffffffffUL, + &crc32_eth_pclmulqdq); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_NET_CRC_SSE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_arp.c b/src/spdk/dpdk/lib/librte_net/rte_arp.c new file mode 100644 index 00000000..f0ed9bd6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_arp.c @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include + +#include + +#define RARP_PKT_SIZE 64 +struct rte_mbuf * __rte_experimental +rte_net_make_rarp_packet(struct rte_mempool *mpool, + const struct ether_addr *mac) +{ + struct ether_hdr *eth_hdr; + struct arp_hdr *rarp; + struct rte_mbuf *mbuf; + + if (mpool == NULL) + return NULL; + + mbuf = rte_pktmbuf_alloc(mpool); + if (mbuf == NULL) + return NULL; + + eth_hdr = (struct ether_hdr *)rte_pktmbuf_append(mbuf, RARP_PKT_SIZE); + if (eth_hdr == NULL) { + rte_pktmbuf_free(mbuf); + return NULL; + } + + /* Ethernet header. */ + memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN); + ether_addr_copy(mac, ð_hdr->s_addr); + eth_hdr->ether_type = htons(ETHER_TYPE_RARP); + + /* RARP header. */ + rarp = (struct arp_hdr *)(eth_hdr + 1); + rarp->arp_hrd = htons(ARP_HRD_ETHER); + rarp->arp_pro = htons(ETHER_TYPE_IPv4); + rarp->arp_hln = ETHER_ADDR_LEN; + rarp->arp_pln = 4; + rarp->arp_op = htons(ARP_OP_REVREQUEST); + + ether_addr_copy(mac, &rarp->arp_data.arp_sha); + ether_addr_copy(mac, &rarp->arp_data.arp_tha); + memset(&rarp->arp_data.arp_sip, 0x00, 4); + memset(&rarp->arp_data.arp_tip, 0x00, 4); + + return mbuf; +} diff --git a/src/spdk/dpdk/lib/librte_net/rte_arp.h b/src/spdk/dpdk/lib/librte_net/rte_arp.h new file mode 100644 index 00000000..139a84ca --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_arp.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2013 6WIND S.A. + */ + +#ifndef _RTE_ARP_H_ +#define _RTE_ARP_H_ + +/** + * @file + * + * ARP-related defines + */ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * ARP header IPv4 payload. + */ +struct arp_ipv4 { + struct ether_addr arp_sha; /**< sender hardware address */ + uint32_t arp_sip; /**< sender IP address */ + struct ether_addr arp_tha; /**< target hardware address */ + uint32_t arp_tip; /**< target IP address */ +} __attribute__((__packed__)); + +/** + * ARP header. 
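A usage sketch for rte_net_make_rarp_packet() just added in rte_arp.c, in the style of the virtio-net live-migration announcement that motivates it. The helper itself is hypothetical, the rte_eth_macaddr_get()/rte_eth_tx_burst() calls are standard ethdev APIs not part of this hunk, and port, queue and mempool setup are assumed to exist elsewhere.

#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_arp.h>

/* Build a RARP announcement for the port's own MAC and send it. */
static int
send_rarp_announce(uint16_t port_id, uint16_t queue_id,
		   struct rte_mempool *mp)
{
	struct ether_addr mac;
	struct rte_mbuf *m;

	rte_eth_macaddr_get(port_id, &mac);
	m = rte_net_make_rarp_packet(mp, &mac);
	if (m == NULL)
		return -1;

	if (rte_eth_tx_burst(port_id, queue_id, &m, 1) != 1) {
		rte_pktmbuf_free(m);	/* not sent, still owned by us */
		return -1;
	}
	return 0;
}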
+ */ +struct arp_hdr { + uint16_t arp_hrd; /* format of hardware address */ +#define ARP_HRD_ETHER 1 /* ARP Ethernet address format */ + + uint16_t arp_pro; /* format of protocol address */ + uint8_t arp_hln; /* length of hardware address */ + uint8_t arp_pln; /* length of protocol address */ + uint16_t arp_op; /* ARP opcode (command) */ +#define ARP_OP_REQUEST 1 /* request to resolve address */ +#define ARP_OP_REPLY 2 /* response to previous request */ +#define ARP_OP_REVREQUEST 3 /* request proto addr given hardware */ +#define ARP_OP_REVREPLY 4 /* response giving protocol address */ +#define ARP_OP_INVREQUEST 8 /* request to identify peer */ +#define ARP_OP_INVREPLY 9 /* response identifying peer */ + + struct arp_ipv4 arp_data; +} __attribute__((__packed__)); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Make a RARP packet based on MAC addr. + * + * @param mpool + * Pointer to the rte_mempool + * @param mac + * Pointer to the MAC addr + * + * @return + * - RARP packet pointer on success, or NULL on error + */ +struct rte_mbuf * __rte_experimental +rte_net_make_rarp_packet(struct rte_mempool *mpool, + const struct ether_addr *mac); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ARP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_esp.h b/src/spdk/dpdk/lib/librte_net/rte_esp.h new file mode 100644 index 00000000..f77ec2eb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_esp.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 Mellanox Technologies, Ltd + */ + +#ifndef _RTE_ESP_H_ +#define _RTE_ESP_H_ + +/** + * @file + * + * ESP-related defines + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * ESP Header + */ +struct esp_hdr { + rte_be32_t spi; /**< Security Parameters Index */ + rte_be32_t seq; /**< packet sequence number */ +} __attribute__((__packed__)); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_ESP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_ether.h b/src/spdk/dpdk/lib/librte_net/rte_ether.h new file mode 100644 index 00000000..bee2b34f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_ether.h @@ -0,0 +1,418 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_ETHER_H_ +#define _RTE_ETHER_H_ + +/** + * @file + * + * Ethernet Helpers in RTE + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include +#include +#include +#include + +#define ETHER_ADDR_LEN 6 /**< Length of Ethernet address. */ +#define ETHER_TYPE_LEN 2 /**< Length of Ethernet type field. */ +#define ETHER_CRC_LEN 4 /**< Length of Ethernet CRC. */ +#define ETHER_HDR_LEN \ + (ETHER_ADDR_LEN * 2 + ETHER_TYPE_LEN) /**< Length of Ethernet header. */ +#define ETHER_MIN_LEN 64 /**< Minimum frame len, including CRC. */ +#define ETHER_MAX_LEN 1518 /**< Maximum frame len, including CRC. */ +#define ETHER_MTU \ + (ETHER_MAX_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN) /**< Ethernet MTU. */ + +#define ETHER_MAX_VLAN_FRAME_LEN \ + (ETHER_MAX_LEN + 4) /**< Maximum VLAN frame length, including CRC. */ + +#define ETHER_MAX_JUMBO_FRAME_LEN \ + 0x3F00 /**< Maximum Jumbo frame length, including CRC. */ + +#define ETHER_MAX_VLAN_ID 4095 /**< Maximum VLAN ID. */ + +#define ETHER_MIN_MTU 68 /**< Minimum MTU for IPv4 packets, see RFC 791. */ + +/** + * Ethernet address: + * A universally administered address is uniquely assigned to a device by its + * manufacturer. 
The first three octets (in transmission order) contain the + * Organizationally Unique Identifier (OUI). The following three (MAC-48 and + * EUI-48) octets are assigned by that organization with the only constraint + * of uniqueness. + * A locally administered address is assigned to a device by a network + * administrator and does not contain OUIs. + * See http://standards.ieee.org/regauth/groupmac/tutorial.html + */ +struct ether_addr { + uint8_t addr_bytes[ETHER_ADDR_LEN]; /**< Addr bytes in tx order */ +} __attribute__((__packed__)); + +#define ETHER_LOCAL_ADMIN_ADDR 0x02 /**< Locally assigned Eth. address. */ +#define ETHER_GROUP_ADDR 0x01 /**< Multicast or broadcast Eth. address. */ + +/** + * Check if two Ethernet addresses are the same. + * + * @param ea1 + * A pointer to the first ether_addr structure containing + * the ethernet address. + * @param ea2 + * A pointer to the second ether_addr structure containing + * the ethernet address. + * + * @return + * True (1) if the given two ethernet address are the same; + * False (0) otherwise. + */ +static inline int is_same_ether_addr(const struct ether_addr *ea1, + const struct ether_addr *ea2) +{ + int i; + for (i = 0; i < ETHER_ADDR_LEN; i++) + if (ea1->addr_bytes[i] != ea2->addr_bytes[i]) + return 0; + return 1; +} + +/** + * Check if an Ethernet address is filled with zeros. + * + * @param ea + * A pointer to a ether_addr structure containing the ethernet address + * to check. + * @return + * True (1) if the given ethernet address is filled with zeros; + * false (0) otherwise. + */ +static inline int is_zero_ether_addr(const struct ether_addr *ea) +{ + int i; + for (i = 0; i < ETHER_ADDR_LEN; i++) + if (ea->addr_bytes[i] != 0x00) + return 0; + return 1; +} + +/** + * Check if an Ethernet address is a unicast address. + * + * @param ea + * A pointer to a ether_addr structure containing the ethernet address + * to check. + * @return + * True (1) if the given ethernet address is a unicast address; + * false (0) otherwise. + */ +static inline int is_unicast_ether_addr(const struct ether_addr *ea) +{ + return (ea->addr_bytes[0] & ETHER_GROUP_ADDR) == 0; +} + +/** + * Check if an Ethernet address is a multicast address. + * + * @param ea + * A pointer to a ether_addr structure containing the ethernet address + * to check. + * @return + * True (1) if the given ethernet address is a multicast address; + * false (0) otherwise. + */ +static inline int is_multicast_ether_addr(const struct ether_addr *ea) +{ + return ea->addr_bytes[0] & ETHER_GROUP_ADDR; +} + +/** + * Check if an Ethernet address is a broadcast address. + * + * @param ea + * A pointer to a ether_addr structure containing the ethernet address + * to check. + * @return + * True (1) if the given ethernet address is a broadcast address; + * false (0) otherwise. + */ +static inline int is_broadcast_ether_addr(const struct ether_addr *ea) +{ + const unaligned_uint16_t *ea_words = (const unaligned_uint16_t *)ea; + + return (ea_words[0] == 0xFFFF && ea_words[1] == 0xFFFF && + ea_words[2] == 0xFFFF); +} + +/** + * Check if an Ethernet address is a universally assigned address. + * + * @param ea + * A pointer to a ether_addr structure containing the ethernet address + * to check. + * @return + * True (1) if the given ethernet address is a universally assigned address; + * false (0) otherwise. 
+ */ +static inline int is_universal_ether_addr(const struct ether_addr *ea) +{ + return (ea->addr_bytes[0] & ETHER_LOCAL_ADMIN_ADDR) == 0; +} + +/** + * Check if an Ethernet address is a locally assigned address. + * + * @param ea + * A pointer to a ether_addr structure containing the ethernet address + * to check. + * @return + * True (1) if the given ethernet address is a locally assigned address; + * false (0) otherwise. + */ +static inline int is_local_admin_ether_addr(const struct ether_addr *ea) +{ + return (ea->addr_bytes[0] & ETHER_LOCAL_ADMIN_ADDR) != 0; +} + +/** + * Check if an Ethernet address is a valid address. Checks that the address is a + * unicast address and is not filled with zeros. + * + * @param ea + * A pointer to a ether_addr structure containing the ethernet address + * to check. + * @return + * True (1) if the given ethernet address is valid; + * false (0) otherwise. + */ +static inline int is_valid_assigned_ether_addr(const struct ether_addr *ea) +{ + return is_unicast_ether_addr(ea) && (!is_zero_ether_addr(ea)); +} + +/** + * Generate a random Ethernet address that is locally administered + * and not multicast. + * @param addr + * A pointer to Ethernet address. + */ +static inline void eth_random_addr(uint8_t *addr) +{ + uint64_t rand = rte_rand(); + uint8_t *p = (uint8_t *)&rand; + + rte_memcpy(addr, p, ETHER_ADDR_LEN); + addr[0] &= (uint8_t)~ETHER_GROUP_ADDR; /* clear multicast bit */ + addr[0] |= ETHER_LOCAL_ADMIN_ADDR; /* set local assignment bit */ +} + +/** + * Fast copy an Ethernet address. + * + * @param ea_from + * A pointer to a ether_addr structure holding the Ethernet address to copy. + * @param ea_to + * A pointer to a ether_addr structure where to copy the Ethernet address. + */ +static inline void ether_addr_copy(const struct ether_addr *ea_from, + struct ether_addr *ea_to) +{ +#ifdef __INTEL_COMPILER + uint16_t *from_words = (uint16_t *)(ea_from->addr_bytes); + uint16_t *to_words = (uint16_t *)(ea_to->addr_bytes); + + to_words[0] = from_words[0]; + to_words[1] = from_words[1]; + to_words[2] = from_words[2]; +#else + /* + * Use the common way, because of a strange gcc warning. + */ + *ea_to = *ea_from; +#endif +} + +#define ETHER_ADDR_FMT_SIZE 18 +/** + * Format 48bits Ethernet address in pattern xx:xx:xx:xx:xx:xx. + * + * @param buf + * A pointer to buffer contains the formatted MAC address. + * @param size + * The format buffer size. + * @param eth_addr + * A pointer to a ether_addr structure. + */ +static inline void +ether_format_addr(char *buf, uint16_t size, + const struct ether_addr *eth_addr) +{ + snprintf(buf, size, "%02X:%02X:%02X:%02X:%02X:%02X", + eth_addr->addr_bytes[0], + eth_addr->addr_bytes[1], + eth_addr->addr_bytes[2], + eth_addr->addr_bytes[3], + eth_addr->addr_bytes[4], + eth_addr->addr_bytes[5]); +} + +/** + * Ethernet header: Contains the destination address, source address + * and frame type. + */ +struct ether_hdr { + struct ether_addr d_addr; /**< Destination address. */ + struct ether_addr s_addr; /**< Source address. */ + uint16_t ether_type; /**< Frame type. */ +} __attribute__((__packed__)); + +/** + * Ethernet VLAN Header. + * Contains the 16-bit VLAN Tag Control Identifier and the Ethernet type + * of the encapsulated frame. + */ +struct vlan_hdr { + uint16_t vlan_tci; /**< Priority (3) + CFI (1) + Identifier Code (12) */ + uint16_t eth_proto;/**< Ethernet type of encapsulated frame. */ +} __attribute__((__packed__)); + +/** + * VXLAN protocol header. 
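A short demonstration of the address helpers defined above. It is a sketch only; apart from the rte_rand() that eth_random_addr() relies on, it needs no device state.

#include <stdio.h>
#include <rte_ether.h>

static void
ether_addr_demo(void)
{
	struct ether_addr a;
	char buf[ETHER_ADDR_FMT_SIZE];

	/* Random unicast, locally administered address. */
	eth_random_addr(a.addr_bytes);
	ether_format_addr(buf, sizeof(buf), &a);

	printf("%s: unicast=%d local=%d zero=%d broadcast=%d\n", buf,
	       is_unicast_ether_addr(&a),
	       is_local_admin_ether_addr(&a),
	       is_zero_ether_addr(&a),
	       is_broadcast_ether_addr(&a));
}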
+ * Contains the 8-bit flag, 24-bit VXLAN Network Identifier and + * Reserved fields (24 bits and 8 bits) + */ +struct vxlan_hdr { + uint32_t vx_flags; /**< flag (8) + Reserved (24). */ + uint32_t vx_vni; /**< VNI (24) + Reserved (8). */ +} __attribute__((__packed__)); + +/* Ethernet frame types */ +#define ETHER_TYPE_IPv4 0x0800 /**< IPv4 Protocol. */ +#define ETHER_TYPE_IPv6 0x86DD /**< IPv6 Protocol. */ +#define ETHER_TYPE_ARP 0x0806 /**< Arp Protocol. */ +#define ETHER_TYPE_RARP 0x8035 /**< Reverse Arp Protocol. */ +#define ETHER_TYPE_VLAN 0x8100 /**< IEEE 802.1Q VLAN tagging. */ +#define ETHER_TYPE_QINQ 0x88A8 /**< IEEE 802.1ad QinQ tagging. */ +#define ETHER_TYPE_ETAG 0x893F /**< IEEE 802.1BR E-Tag. */ +#define ETHER_TYPE_1588 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */ +#define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */ +#define ETHER_TYPE_TEB 0x6558 /**< Transparent Ethernet Bridging. */ +#define ETHER_TYPE_LLDP 0x88CC /**< LLDP Protocol. */ + +#define ETHER_VXLAN_HLEN (sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr)) +/**< VXLAN tunnel header length. */ + +/** + * VXLAN-GPE protocol header (draft-ietf-nvo3-vxlan-gpe-05). + * Contains the 8-bit flag, 8-bit next-protocol, 24-bit VXLAN Network + * Identifier and Reserved fields (16 bits and 8 bits). + */ +struct vxlan_gpe_hdr { + uint8_t vx_flags; /**< flag (8). */ + uint8_t reserved[2]; /**< Reserved (16). */ + uint8_t proto; /**< next-protocol (8). */ + uint32_t vx_vni; /**< VNI (24) + Reserved (8). */ +} __attribute__((__packed__)); + +/* VXLAN-GPE next protocol types */ +#define VXLAN_GPE_TYPE_IPV4 1 /**< IPv4 Protocol. */ +#define VXLAN_GPE_TYPE_IPV6 2 /**< IPv6 Protocol. */ +#define VXLAN_GPE_TYPE_ETH 3 /**< Ethernet Protocol. */ +#define VXLAN_GPE_TYPE_NSH 4 /**< NSH Protocol. */ +#define VXLAN_GPE_TYPE_MPLS 5 /**< MPLS Protocol. */ +#define VXLAN_GPE_TYPE_GBP 6 /**< GBP Protocol. */ +#define VXLAN_GPE_TYPE_VBNG 7 /**< vBNG Protocol. */ + +#define ETHER_VXLAN_GPE_HLEN (sizeof(struct udp_hdr) + \ + sizeof(struct vxlan_gpe_hdr)) +/**< VXLAN-GPE tunnel header length. */ + +/** + * Extract VLAN tag information into mbuf + * + * Software version of VLAN stripping + * + * @param m + * The packet mbuf. + * @return + * - 0: Success + * - 1: not a vlan packet + */ +static inline int rte_vlan_strip(struct rte_mbuf *m) +{ + struct ether_hdr *eh + = rte_pktmbuf_mtod(m, struct ether_hdr *); + struct vlan_hdr *vh; + + if (eh->ether_type != rte_cpu_to_be_16(ETHER_TYPE_VLAN)) + return -1; + + vh = (struct vlan_hdr *)(eh + 1); + m->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; + m->vlan_tci = rte_be_to_cpu_16(vh->vlan_tci); + + /* Copy ether header over rather than moving whole packet */ + memmove(rte_pktmbuf_adj(m, sizeof(struct vlan_hdr)), + eh, 2 * ETHER_ADDR_LEN); + + return 0; +} + +/** + * Insert VLAN tag into mbuf. + * + * Software version of VLAN unstripping + * + * @param m + * The packet mbuf. 
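Given the vxlan_hdr layout above (the VNI sits in the upper 24 bits of the big-endian vx_vni word), extracting the VNI is a byte swap plus a shift. A hedged sketch, assuming the caller has already located the UDP header of a VXLAN-encapsulated frame; struct udp_hdr comes from rte_udp.h, which this library installs but this hunk does not show.

#include <stdint.h>
#include <rte_byteorder.h>
#include <rte_ether.h>
#include <rte_udp.h>

/* Return the 24-bit VXLAN Network Identifier following a UDP header. */
static uint32_t
vxlan_vni(const struct udp_hdr *udp)
{
	const struct vxlan_hdr *vxh = (const struct vxlan_hdr *)(udp + 1);

	/* drop the low reserved byte after converting to host order */
	return rte_be_to_cpu_32(vxh->vx_vni) >> 8;
}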
+ * @return + * - 0: On success + * -EPERM: mbuf is is shared overwriting would be unsafe + * -ENOSPC: not enough headroom in mbuf + */ +static inline int rte_vlan_insert(struct rte_mbuf **m) +{ + struct ether_hdr *oh, *nh; + struct vlan_hdr *vh; + + /* Can't insert header if mbuf is shared */ + if (rte_mbuf_refcnt_read(*m) > 1) { + struct rte_mbuf *copy; + + copy = rte_pktmbuf_clone(*m, (*m)->pool); + if (unlikely(copy == NULL)) + return -ENOMEM; + rte_pktmbuf_free(*m); + *m = copy; + } + + oh = rte_pktmbuf_mtod(*m, struct ether_hdr *); + nh = (struct ether_hdr *) + rte_pktmbuf_prepend(*m, sizeof(struct vlan_hdr)); + if (nh == NULL) + return -ENOSPC; + + memmove(nh, oh, 2 * ETHER_ADDR_LEN); + nh->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN); + + vh = (struct vlan_hdr *) (nh + 1); + vh->vlan_tci = rte_cpu_to_be_16((*m)->vlan_tci); + + (*m)->ol_flags &= ~PKT_RX_VLAN_STRIPPED; + + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ETHER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_gre.h b/src/spdk/dpdk/lib/librte_net/rte_gre.h new file mode 100644 index 00000000..69499bb8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_gre.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + */ + +#ifndef _RTE_GRE_H_ +#define _RTE_GRE_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * GRE Header + */ +struct gre_hdr { +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + uint16_t res2:4; /**< Reserved */ + uint16_t s:1; /**< Sequence Number Present bit */ + uint16_t k:1; /**< Key Present bit */ + uint16_t res1:1; /**< Reserved */ + uint16_t c:1; /**< Checksum Present bit */ + uint16_t ver:3; /**< Version Number */ + uint16_t res3:5; /**< Reserved */ +#elif RTE_BYTE_ORDER == RTE_BIG_ENDIAN + uint16_t c:1; /**< Checksum Present bit */ + uint16_t res1:1; /**< Reserved */ + uint16_t k:1; /**< Key Present bit */ + uint16_t s:1; /**< Sequence Number Present bit */ + uint16_t res2:4; /**< Reserved */ + uint16_t res3:5; /**< Reserved */ + uint16_t ver:3; /**< Version Number */ +#endif + uint16_t proto; /**< Protocol Type */ +} __attribute__((__packed__)); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_GRE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_icmp.h b/src/spdk/dpdk/lib/librte_net/rte_icmp.h new file mode 100644 index 00000000..053b5f6a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_icmp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. + * Copyright(c) 2013 6WIND S.A. + * All rights reserved. + */ + +#ifndef _RTE_ICMP_H_ +#define _RTE_ICMP_H_ + +/** + * @file + * + * ICMP-related defines + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * ICMP Header + */ +struct icmp_hdr { + uint8_t icmp_type; /* ICMP packet type. */ + uint8_t icmp_code; /* ICMP packet code. */ + uint16_t icmp_cksum; /* ICMP packet checksum. */ + uint16_t icmp_ident; /* ICMP packet identifier. */ + uint16_t icmp_seq_nb; /* ICMP packet sequence number. 
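A round-trip sketch for the software VLAN helpers above: tag an untagged frame with rte_vlan_insert(), then undo it with rte_vlan_strip(). It assumes the mbuf already holds a valid Ethernet frame with headroom for the 4-byte VLAN header; the wrapper name is illustrative.

#include <rte_mbuf.h>
#include <rte_ether.h>

static int
vlan_roundtrip(struct rte_mbuf **m, uint16_t tci)
{
	int ret;

	(*m)->vlan_tci = tci;		/* tag to be inserted */
	ret = rte_vlan_insert(m);	/* may replace *m if it was shared */
	if (ret != 0)
		return ret;

	/* Moves the tag back into mbuf metadata and sets
	 * PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED. */
	return rte_vlan_strip(*m);
}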
*/ +} __attribute__((__packed__)); + +/* ICMP packet types */ +#define IP_ICMP_ECHO_REPLY 0 +#define IP_ICMP_ECHO_REQUEST 8 + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_ICMP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_ip.h b/src/spdk/dpdk/lib/librte_net/rte_ip.h new file mode 100644 index 00000000..f2a8904a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_ip.h @@ -0,0 +1,428 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + * All rights reserved. + */ + +#ifndef _RTE_IP_H_ +#define _RTE_IP_H_ + +/** + * @file + * + * IP-related defines + */ + +#include +#include + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * IPv4 Header + */ +struct ipv4_hdr { + uint8_t version_ihl; /**< version and header length */ + uint8_t type_of_service; /**< type of service */ + uint16_t total_length; /**< length of packet */ + uint16_t packet_id; /**< packet ID */ + uint16_t fragment_offset; /**< fragmentation offset */ + uint8_t time_to_live; /**< time to live */ + uint8_t next_proto_id; /**< protocol ID */ + uint16_t hdr_checksum; /**< header checksum */ + uint32_t src_addr; /**< source address */ + uint32_t dst_addr; /**< destination address */ +} __attribute__((__packed__)); + +/** Create IPv4 address */ +#define IPv4(a,b,c,d) ((uint32_t)(((a) & 0xff) << 24) | \ + (((b) & 0xff) << 16) | \ + (((c) & 0xff) << 8) | \ + ((d) & 0xff)) + +/** Maximal IPv4 packet length (including a header) */ +#define IPV4_MAX_PKT_LEN 65535 + +/** Internet header length mask for version_ihl field */ +#define IPV4_HDR_IHL_MASK (0x0f) +/** + * Internet header length field multiplier (IHL field specifies overall header + * length in number of 4-byte words) + */ +#define IPV4_IHL_MULTIPLIER (4) + +/* Fragment Offset * Flags. */ +#define IPV4_HDR_DF_SHIFT 14 +#define IPV4_HDR_MF_SHIFT 13 +#define IPV4_HDR_FO_SHIFT 3 + +#define IPV4_HDR_DF_FLAG (1 << IPV4_HDR_DF_SHIFT) +#define IPV4_HDR_MF_FLAG (1 << IPV4_HDR_MF_SHIFT) + +#define IPV4_HDR_OFFSET_MASK ((1 << IPV4_HDR_MF_SHIFT) - 1) + +#define IPV4_HDR_OFFSET_UNITS 8 + +/* + * IPv4 address types + */ +#define IPV4_ANY ((uint32_t)0x00000000) /**< 0.0.0.0 */ +#define IPV4_LOOPBACK ((uint32_t)0x7f000001) /**< 127.0.0.1 */ +#define IPV4_BROADCAST ((uint32_t)0xe0000000) /**< 224.0.0.0 */ +#define IPV4_ALLHOSTS_GROUP ((uint32_t)0xe0000001) /**< 224.0.0.1 */ +#define IPV4_ALLRTRS_GROUP ((uint32_t)0xe0000002) /**< 224.0.0.2 */ +#define IPV4_MAX_LOCAL_GROUP ((uint32_t)0xe00000ff) /**< 224.0.0.255 */ + +/* + * IPv4 Multicast-related macros + */ +#define IPV4_MIN_MCAST IPv4(224, 0, 0, 0) /**< Minimal IPv4-multicast address */ +#define IPV4_MAX_MCAST IPv4(239, 255, 255, 255) /**< Maximum IPv4 multicast address */ + +#define IS_IPV4_MCAST(x) \ + ((x) >= IPV4_MIN_MCAST && (x) <= IPV4_MAX_MCAST) /**< check if IPv4 address is multicast */ + +/** + * @internal Calculate a sum of all words in the buffer. + * Helper routine for the rte_raw_cksum(). + * + * @param buf + * Pointer to the buffer. + * @param len + * Length of the buffer. + * @param sum + * Initial value of the sum. + * @return + * sum += Sum of all words in the buffer. 
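A small illustration of the IPv4() macro and the multicast range test defined above. The fields of struct ipv4_hdr are big-endian on the wire, so an address read from a packet must go through rte_be_to_cpu_32() before these host-order macros apply.

#include <stdint.h>
#include <rte_byteorder.h>
#include <rte_ip.h>

/* True for the all-hosts group 224.0.0.1, taking a network-order
 * address as found in ipv4_hdr.dst_addr. */
static int
is_allhosts_group(uint32_t dst_addr_be)
{
	uint32_t ip = rte_be_to_cpu_32(dst_addr_be);

	/* IPv4(224, 0, 0, 1) == 0xE0000001 == IPV4_ALLHOSTS_GROUP */
	return IS_IPV4_MCAST(ip) && ip == IPv4(224, 0, 0, 1);
}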
+ */ +static inline uint32_t +__rte_raw_cksum(const void *buf, size_t len, uint32_t sum) +{ + /* workaround gcc strict-aliasing warning */ + uintptr_t ptr = (uintptr_t)buf; + typedef uint16_t __attribute__((__may_alias__)) u16_p; + const u16_p *u16_buf = (const u16_p *)ptr; + + while (len >= (sizeof(*u16_buf) * 4)) { + sum += u16_buf[0]; + sum += u16_buf[1]; + sum += u16_buf[2]; + sum += u16_buf[3]; + len -= sizeof(*u16_buf) * 4; + u16_buf += 4; + } + while (len >= sizeof(*u16_buf)) { + sum += *u16_buf; + len -= sizeof(*u16_buf); + u16_buf += 1; + } + + /* if length is in odd bytes */ + if (len == 1) + sum += *((const uint8_t *)u16_buf); + + return sum; +} + +/** + * @internal Reduce a sum to the non-complemented checksum. + * Helper routine for the rte_raw_cksum(). + * + * @param sum + * Value of the sum. + * @return + * The non-complemented checksum. + */ +static inline uint16_t +__rte_raw_cksum_reduce(uint32_t sum) +{ + sum = ((sum & 0xffff0000) >> 16) + (sum & 0xffff); + sum = ((sum & 0xffff0000) >> 16) + (sum & 0xffff); + return (uint16_t)sum; +} + +/** + * Process the non-complemented checksum of a buffer. + * + * @param buf + * Pointer to the buffer. + * @param len + * Length of the buffer. + * @return + * The non-complemented checksum. + */ +static inline uint16_t +rte_raw_cksum(const void *buf, size_t len) +{ + uint32_t sum; + + sum = __rte_raw_cksum(buf, len, 0); + return __rte_raw_cksum_reduce(sum); +} + +/** + * Compute the raw (non complemented) checksum of a packet. + * + * @param m + * The pointer to the mbuf. + * @param off + * The offset in bytes to start the checksum. + * @param len + * The length in bytes of the data to checksum. + * @param cksum + * A pointer to the checksum, filled on success. + * @return + * 0 on success, -1 on error (bad length or offset). + */ +static inline int +rte_raw_cksum_mbuf(const struct rte_mbuf *m, uint32_t off, uint32_t len, + uint16_t *cksum) +{ + const struct rte_mbuf *seg; + const char *buf; + uint32_t sum, tmp; + uint32_t seglen, done; + + /* easy case: all data in the first segment */ + if (off + len <= rte_pktmbuf_data_len(m)) { + *cksum = rte_raw_cksum(rte_pktmbuf_mtod_offset(m, + const char *, off), len); + return 0; + } + + if (unlikely(off + len > rte_pktmbuf_pkt_len(m))) + return -1; /* invalid params, return a dummy value */ + + /* else browse the segment to find offset */ + seglen = 0; + for (seg = m; seg != NULL; seg = seg->next) { + seglen = rte_pktmbuf_data_len(seg); + if (off < seglen) + break; + off -= seglen; + } + seglen -= off; + buf = rte_pktmbuf_mtod_offset(seg, const char *, off); + if (seglen >= len) { + /* all in one segment */ + *cksum = rte_raw_cksum(buf, len); + return 0; + } + + /* hard case: process checksum of several segments */ + sum = 0; + done = 0; + for (;;) { + tmp = __rte_raw_cksum(buf, seglen, 0); + if (done & 1) + tmp = rte_bswap16((uint16_t)tmp); + sum += tmp; + done += seglen; + if (done == len) + break; + seg = seg->next; + buf = rte_pktmbuf_mtod(seg, const char *); + seglen = rte_pktmbuf_data_len(seg); + if (seglen > len - done) + seglen = len - done; + } + + *cksum = __rte_raw_cksum_reduce(sum); + return 0; +} + +/** + * Process the IPv4 checksum of an IPv4 header. + * + * The checksum field must be set to 0 by the caller. + * + * @param ipv4_hdr + * The pointer to the contiguous IPv4 header. + * @return + * The complemented checksum to set in the IP packet. 
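To make the reduction step concrete: __rte_raw_cksum_reduce() folds the carries back into 16 bits, so a running sum of 0x00023456 becomes 0x0002 + 0x3456 = 0x3458. rte_raw_cksum() returns that non-complemented value; an Internet-style checksum is obtained by complementing it, as in this small sketch of the flat-buffer and mbuf-chain variants above.

#include <stddef.h>
#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_ip.h>

/* One's-complement checksum of an arbitrary contiguous buffer. */
static uint16_t
buffer_cksum(const void *buf, size_t len)
{
	return (uint16_t)~rte_raw_cksum(buf, len);
}

/* Same thing over a possibly segmented mbuf chain. */
static int
mbuf_payload_cksum(const struct rte_mbuf *m, uint32_t off, uint32_t len,
		   uint16_t *out)
{
	uint16_t raw;

	if (rte_raw_cksum_mbuf(m, off, len, &raw) < 0)
		return -1;	/* bad offset or length */
	*out = (uint16_t)~raw;
	return 0;
}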
+ */ +static inline uint16_t +rte_ipv4_cksum(const struct ipv4_hdr *ipv4_hdr) +{ + uint16_t cksum; + cksum = rte_raw_cksum(ipv4_hdr, sizeof(struct ipv4_hdr)); + return (cksum == 0xffff) ? cksum : (uint16_t)~cksum; +} + +/** + * Process the pseudo-header checksum of an IPv4 header. + * + * The checksum field must be set to 0 by the caller. + * + * Depending on the ol_flags, the pseudo-header checksum expected by the + * drivers is not the same. For instance, when TSO is enabled, the IP + * payload length must not be included in the packet. + * + * When ol_flags is 0, it computes the standard pseudo-header checksum. + * + * @param ipv4_hdr + * The pointer to the contiguous IPv4 header. + * @param ol_flags + * The ol_flags of the associated mbuf. + * @return + * The non-complemented checksum to set in the L4 header. + */ +static inline uint16_t +rte_ipv4_phdr_cksum(const struct ipv4_hdr *ipv4_hdr, uint64_t ol_flags) +{ + struct ipv4_psd_header { + uint32_t src_addr; /* IP address of source host. */ + uint32_t dst_addr; /* IP address of destination host. */ + uint8_t zero; /* zero. */ + uint8_t proto; /* L4 protocol type. */ + uint16_t len; /* L4 length. */ + } psd_hdr; + + psd_hdr.src_addr = ipv4_hdr->src_addr; + psd_hdr.dst_addr = ipv4_hdr->dst_addr; + psd_hdr.zero = 0; + psd_hdr.proto = ipv4_hdr->next_proto_id; + if (ol_flags & PKT_TX_TCP_SEG) { + psd_hdr.len = 0; + } else { + psd_hdr.len = rte_cpu_to_be_16( + (uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length) + - sizeof(struct ipv4_hdr))); + } + return rte_raw_cksum(&psd_hdr, sizeof(psd_hdr)); +} + +/** + * Process the IPv4 UDP or TCP checksum. + * + * The IPv4 header should not contains options. The IP and layer 4 + * checksum must be set to 0 in the packet by the caller. + * + * @param ipv4_hdr + * The pointer to the contiguous IPv4 header. + * @param l4_hdr + * The pointer to the beginning of the L4 header. + * @return + * The complemented checksum to set in the IP packet. + */ +static inline uint16_t +rte_ipv4_udptcp_cksum(const struct ipv4_hdr *ipv4_hdr, const void *l4_hdr) +{ + uint32_t cksum; + uint32_t l4_len; + + l4_len = (uint32_t)(rte_be_to_cpu_16(ipv4_hdr->total_length) - + sizeof(struct ipv4_hdr)); + + cksum = rte_raw_cksum(l4_hdr, l4_len); + cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0); + + cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff); + cksum = (~cksum) & 0xffff; + if (cksum == 0) + cksum = 0xffff; + + return (uint16_t)cksum; +} + +/** + * IPv6 Header + */ +struct ipv6_hdr { + uint32_t vtc_flow; /**< IP version, traffic class & flow label. */ + uint16_t payload_len; /**< IP packet length - includes sizeof(ip_header). */ + uint8_t proto; /**< Protocol, next header. */ + uint8_t hop_limits; /**< Hop limits. */ + uint8_t src_addr[16]; /**< IP address of source host. */ + uint8_t dst_addr[16]; /**< IP address of destination host(s). */ +} __attribute__((__packed__)); + +/* IPv6 vtc_flow: IPv / TC / flow_label */ +#define IPV6_HDR_FL_SHIFT 0 +#define IPV6_HDR_TC_SHIFT 20 +#define IPV6_HDR_FL_MASK ((1u << IPV6_HDR_TC_SHIFT) - 1) +#define IPV6_HDR_TC_MASK (0xf << IPV6_HDR_TC_SHIFT) + +/** + * Process the pseudo-header checksum of an IPv6 header. + * + * Depending on the ol_flags, the pseudo-header checksum expected by the + * drivers is not the same. For instance, when TSO is enabled, the IPv6 + * payload length must not be included in the packet. + * + * When ol_flags is 0, it computes the standard pseudo-header checksum. + * + * @param ipv6_hdr + * The pointer to the contiguous IPv6 header. 
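Putting the IPv4 helpers above together for a freshly built, non-offloaded IPv4/UDP packet: both checksum fields must be zeroed first, and the UDP checksum covers the pseudo-header derived from the IP header. A sketch assuming the usual struct udp_hdr with a dgram_cksum field (rte_udp.h is not part of this hunk).

#include <rte_ip.h>
#include <rte_udp.h>

static void
finish_ipv4_udp(struct ipv4_hdr *ip, struct udp_hdr *udp)
{
	udp->dgram_cksum = 0;
	ip->hdr_checksum = 0;

	/* Sums total_length - 20 bytes starting at the UDP header and
	 * folds in the pseudo-header (addresses, protocol, length);
	 * hdr_checksum itself is not covered, it only has to be zero
	 * when rte_ipv4_cksum() runs. */
	udp->dgram_cksum = rte_ipv4_udptcp_cksum(ip, udp);
	ip->hdr_checksum = rte_ipv4_cksum(ip);
}

With hardware L4 checksum offload the field should instead hold only the pseudo-header checksum, i.e. the result of rte_ipv4_phdr_cksum() with the mbuf's ol_flags, as its documentation block above notes.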
+ * @param ol_flags + * The ol_flags of the associated mbuf. + * @return + * The non-complemented checksum to set in the L4 header. + */ +static inline uint16_t +rte_ipv6_phdr_cksum(const struct ipv6_hdr *ipv6_hdr, uint64_t ol_flags) +{ + uint32_t sum; + struct { + uint32_t len; /* L4 length. */ + uint32_t proto; /* L4 protocol - top 3 bytes must be zero */ + } psd_hdr; + + psd_hdr.proto = (uint32_t)(ipv6_hdr->proto << 24); + if (ol_flags & PKT_TX_TCP_SEG) { + psd_hdr.len = 0; + } else { + psd_hdr.len = ipv6_hdr->payload_len; + } + + sum = __rte_raw_cksum(ipv6_hdr->src_addr, + sizeof(ipv6_hdr->src_addr) + sizeof(ipv6_hdr->dst_addr), + 0); + sum = __rte_raw_cksum(&psd_hdr, sizeof(psd_hdr), sum); + return __rte_raw_cksum_reduce(sum); +} + +/** + * Process the IPv6 UDP or TCP checksum. + * + * The IPv4 header should not contains options. The layer 4 checksum + * must be set to 0 in the packet by the caller. + * + * @param ipv6_hdr + * The pointer to the contiguous IPv6 header. + * @param l4_hdr + * The pointer to the beginning of the L4 header. + * @return + * The complemented checksum to set in the IP packet. + */ +static inline uint16_t +rte_ipv6_udptcp_cksum(const struct ipv6_hdr *ipv6_hdr, const void *l4_hdr) +{ + uint32_t cksum; + uint32_t l4_len; + + l4_len = rte_be_to_cpu_16(ipv6_hdr->payload_len); + + cksum = rte_raw_cksum(l4_hdr, l4_len); + cksum += rte_ipv6_phdr_cksum(ipv6_hdr, 0); + + cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff); + cksum = (~cksum) & 0xffff; + if (cksum == 0) + cksum = 0xffff; + + return (uint16_t)cksum; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_IP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_net.c b/src/spdk/dpdk/lib/librte_net/rte_net.c new file mode 100644 index 00000000..9eb7c743 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_net.c @@ -0,0 +1,496 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. 
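
[Annotation, not part of the patch] rte_ipv4_udptcp_cksum() and rte_ipv6_udptcp_cksum() above compute the L4 data sum first and only then add the pseudo-header sum. That works because the 16-bit one's complement sum is commutative and survives folding, so partial sums over separate even-length buffers can be combined afterwards; the odd-length case is why rte_raw_cksum_mbuf() byte-swaps a segment's partial sum when the data preceding it had odd length. A standalone illustration with arbitrary buffer contents:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t sum16(const void *buf, size_t len, uint32_t sum)
{
	const uint8_t *p = buf;

	for (; len >= 2; p += 2, len -= 2) {
		uint16_t v;

		memcpy(&v, p, 2);
		sum += v;
	}
	if (len)
		sum += *p;
	return sum;
}

static uint16_t fold(uint32_t sum)
{
	sum = (sum >> 16) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return (uint16_t)sum;
}

int main(void)
{
	const char a[12] = "pseudohdr..";	/* even length, like a pseudo-header */
	const char b[9] = "payload!";
	char cat[sizeof(a) + sizeof(b)];
	uint16_t whole, split;

	memcpy(cat, a, sizeof(a));
	memcpy(cat + sizeof(a), b, sizeof(b));

	whole = fold(sum16(cat, sizeof(cat), 0));
	split = fold((uint32_t)fold(sum16(a, sizeof(a), 0)) +
		     fold(sum16(b, sizeof(b), 0)));
	printf("whole=0x%04x split=0x%04x (expect equal)\n", whole, split);
	return 0;
}
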
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* get l3 packet type from ip6 next protocol */ +static uint32_t +ptype_l3_ip6(uint8_t ip6_proto) +{ + static const uint32_t ip6_ext_proto_map[256] = { + [IPPROTO_HOPOPTS] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6, + [IPPROTO_ROUTING] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6, + [IPPROTO_FRAGMENT] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6, + [IPPROTO_ESP] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6, + [IPPROTO_AH] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6, + [IPPROTO_DSTOPTS] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6, + }; + + return RTE_PTYPE_L3_IPV6 + ip6_ext_proto_map[ip6_proto]; +} + +/* get l3 packet type from ip version and header length */ +static uint32_t +ptype_l3_ip(uint8_t ipv_ihl) +{ + static const uint32_t ptype_l3_ip_proto_map[256] = { + [0x45] = RTE_PTYPE_L3_IPV4, + [0x46] = RTE_PTYPE_L3_IPV4_EXT, + [0x47] = RTE_PTYPE_L3_IPV4_EXT, + [0x48] = RTE_PTYPE_L3_IPV4_EXT, + [0x49] = RTE_PTYPE_L3_IPV4_EXT, + [0x4A] = RTE_PTYPE_L3_IPV4_EXT, + [0x4B] = RTE_PTYPE_L3_IPV4_EXT, + [0x4C] = RTE_PTYPE_L3_IPV4_EXT, + [0x4D] = RTE_PTYPE_L3_IPV4_EXT, + [0x4E] = RTE_PTYPE_L3_IPV4_EXT, + [0x4F] = RTE_PTYPE_L3_IPV4_EXT, + }; + + return ptype_l3_ip_proto_map[ipv_ihl]; +} + +/* get l4 packet type from proto */ +static uint32_t +ptype_l4(uint8_t proto) +{ + static const uint32_t ptype_l4_proto[256] = { + [IPPROTO_UDP] = RTE_PTYPE_L4_UDP, + [IPPROTO_TCP] = RTE_PTYPE_L4_TCP, + [IPPROTO_SCTP] = RTE_PTYPE_L4_SCTP, + }; + + return ptype_l4_proto[proto]; +} + +/* get inner l3 packet type from ip6 next protocol */ +static uint32_t +ptype_inner_l3_ip6(uint8_t ip6_proto) +{ + static const uint32_t ptype_inner_ip6_ext_proto_map[256] = { + [IPPROTO_HOPOPTS] = RTE_PTYPE_INNER_L3_IPV6_EXT - + RTE_PTYPE_INNER_L3_IPV6, + [IPPROTO_ROUTING] = RTE_PTYPE_INNER_L3_IPV6_EXT - + RTE_PTYPE_INNER_L3_IPV6, + [IPPROTO_FRAGMENT] = RTE_PTYPE_INNER_L3_IPV6_EXT - + RTE_PTYPE_INNER_L3_IPV6, + [IPPROTO_ESP] = RTE_PTYPE_INNER_L3_IPV6_EXT - + RTE_PTYPE_INNER_L3_IPV6, + [IPPROTO_AH] = RTE_PTYPE_INNER_L3_IPV6_EXT - + RTE_PTYPE_INNER_L3_IPV6, + [IPPROTO_DSTOPTS] = RTE_PTYPE_INNER_L3_IPV6_EXT - + RTE_PTYPE_INNER_L3_IPV6, + }; + + return RTE_PTYPE_INNER_L3_IPV6 + + ptype_inner_ip6_ext_proto_map[ip6_proto]; +} + +/* get inner l3 packet type from ip version and header length */ +static uint32_t +ptype_inner_l3_ip(uint8_t ipv_ihl) +{ + static const uint32_t ptype_inner_l3_ip_proto_map[256] = { + [0x45] = RTE_PTYPE_INNER_L3_IPV4, + [0x46] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x47] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x48] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x49] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x4A] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x4B] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x4C] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x4D] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x4E] = RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x4F] = RTE_PTYPE_INNER_L3_IPV4_EXT, + }; + + return ptype_inner_l3_ip_proto_map[ipv_ihl]; +} + +/* get inner l4 packet type from proto */ +static uint32_t +ptype_inner_l4(uint8_t proto) +{ + static const uint32_t ptype_inner_l4_proto[256] = { + [IPPROTO_UDP] = RTE_PTYPE_INNER_L4_UDP, + [IPPROTO_TCP] = RTE_PTYPE_INNER_L4_TCP, + [IPPROTO_SCTP] = RTE_PTYPE_INNER_L4_SCTP, + }; + + return ptype_inner_l4_proto[proto]; +} + +/* get the tunnel packet type if any, update proto and off. 
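
[Annotation, not part of the patch] The ptype_* helpers above, and the GRE option-length table in ptype_tunnel() that follows, all rely on the same trick: a 256-entry table with designated initializers for the known protocol values and an implicit zero everywhere else, so classification is a single branch-free load. A minimal standalone version of the pattern; the classification constants are made up for the example:

#include <stdint.h>
#include <stdio.h>

#define PKT_UNKNOWN 0u
#define PKT_UDP     1u
#define PKT_TCP     2u
#define PKT_SCTP    3u

static uint32_t classify_l4(uint8_t proto)
{
	static const uint32_t l4_map[256] = {
		[17]  = PKT_UDP,	/* IPPROTO_UDP */
		[6]   = PKT_TCP,	/* IPPROTO_TCP */
		[132] = PKT_SCTP,	/* IPPROTO_SCTP */
	};

	return l4_map[proto];	/* any other protocol maps to PKT_UNKNOWN */
}

int main(void)
{
	printf("proto 6  -> %u\n", classify_l4(6));	/* TCP */
	printf("proto 89 -> %u\n", classify_l4(89));	/* OSPF -> unknown */
	return 0;
}
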
*/ +static uint32_t +ptype_tunnel(uint16_t *proto, const struct rte_mbuf *m, + uint32_t *off) +{ + switch (*proto) { + case IPPROTO_GRE: { + static const uint8_t opt_len[16] = { + [0x0] = 4, + [0x1] = 8, + [0x2] = 8, + [0x8] = 8, + [0x3] = 12, + [0x9] = 12, + [0xa] = 12, + [0xb] = 16, + }; + const struct gre_hdr *gh; + struct gre_hdr gh_copy; + uint16_t flags; + + gh = rte_pktmbuf_read(m, *off, sizeof(*gh), &gh_copy); + if (unlikely(gh == NULL)) + return 0; + + flags = rte_be_to_cpu_16(*(const uint16_t *)gh); + flags >>= 12; + if (opt_len[flags] == 0) + return 0; + + *off += opt_len[flags]; + *proto = gh->proto; + if (*proto == rte_cpu_to_be_16(ETHER_TYPE_TEB)) + return RTE_PTYPE_TUNNEL_NVGRE; + else + return RTE_PTYPE_TUNNEL_GRE; + } + case IPPROTO_IPIP: + *proto = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + return RTE_PTYPE_TUNNEL_IP; + case IPPROTO_IPV6: + *proto = rte_cpu_to_be_16(ETHER_TYPE_IPv6); + return RTE_PTYPE_TUNNEL_IP; /* IP is also valid for IPv6 */ + default: + return 0; + } +} + +/* get the ipv4 header length */ +static uint8_t +ip4_hlen(const struct ipv4_hdr *hdr) +{ + return (hdr->version_ihl & 0xf) * 4; +} + +/* parse ipv6 extended headers, update offset and return next proto */ +int __rte_experimental +rte_net_skip_ip6_ext(uint16_t proto, const struct rte_mbuf *m, uint32_t *off, + int *frag) +{ + struct ext_hdr { + uint8_t next_hdr; + uint8_t len; + }; + const struct ext_hdr *xh; + struct ext_hdr xh_copy; + unsigned int i; + + *frag = 0; + +#define MAX_EXT_HDRS 5 + for (i = 0; i < MAX_EXT_HDRS; i++) { + switch (proto) { + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + xh = rte_pktmbuf_read(m, *off, sizeof(*xh), + &xh_copy); + if (xh == NULL) + return -1; + *off += (xh->len + 1) * 8; + proto = xh->next_hdr; + break; + case IPPROTO_FRAGMENT: + xh = rte_pktmbuf_read(m, *off, sizeof(*xh), + &xh_copy); + if (xh == NULL) + return -1; + *off += 8; + proto = xh->next_hdr; + *frag = 1; + return proto; /* this is always the last ext hdr */ + case IPPROTO_NONE: + return 0; + default: + return proto; + } + } + return -1; +} + +/* parse mbuf data to get packet type */ +uint32_t rte_net_get_ptype(const struct rte_mbuf *m, + struct rte_net_hdr_lens *hdr_lens, uint32_t layers) +{ + struct rte_net_hdr_lens local_hdr_lens; + const struct ether_hdr *eh; + struct ether_hdr eh_copy; + uint32_t pkt_type = RTE_PTYPE_L2_ETHER; + uint32_t off = 0; + uint16_t proto; + int ret; + + if (hdr_lens == NULL) + hdr_lens = &local_hdr_lens; + + eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy); + if (unlikely(eh == NULL)) + return 0; + proto = eh->ether_type; + off = sizeof(*eh); + hdr_lens->l2_len = off; + + if ((layers & RTE_PTYPE_L2_MASK) == 0) + return 0; + + if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) + goto l3; /* fast path if packet is IPv4 */ + + if (proto == rte_cpu_to_be_16(ETHER_TYPE_VLAN)) { + const struct vlan_hdr *vh; + struct vlan_hdr vh_copy; + + pkt_type = RTE_PTYPE_L2_ETHER_VLAN; + vh = rte_pktmbuf_read(m, off, sizeof(*vh), &vh_copy); + if (unlikely(vh == NULL)) + return pkt_type; + off += sizeof(*vh); + hdr_lens->l2_len += sizeof(*vh); + proto = vh->eth_proto; + } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_QINQ)) { + const struct vlan_hdr *vh; + struct vlan_hdr vh_copy; + + pkt_type = RTE_PTYPE_L2_ETHER_QINQ; + vh = rte_pktmbuf_read(m, off + sizeof(*vh), sizeof(*vh), + &vh_copy); + if (unlikely(vh == NULL)) + return pkt_type; + off += 2 * sizeof(*vh); + hdr_lens->l2_len += 2 * sizeof(*vh); + proto = vh->eth_proto; + } + + l3: + if ((layers & 
RTE_PTYPE_L3_MASK) == 0) + return pkt_type; + + if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) { + const struct ipv4_hdr *ip4h; + struct ipv4_hdr ip4h_copy; + + ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy); + if (unlikely(ip4h == NULL)) + return pkt_type; + + pkt_type |= ptype_l3_ip(ip4h->version_ihl); + hdr_lens->l3_len = ip4_hlen(ip4h); + off += hdr_lens->l3_len; + + if ((layers & RTE_PTYPE_L4_MASK) == 0) + return pkt_type; + + if (ip4h->fragment_offset & rte_cpu_to_be_16( + IPV4_HDR_OFFSET_MASK | IPV4_HDR_MF_FLAG)) { + pkt_type |= RTE_PTYPE_L4_FRAG; + hdr_lens->l4_len = 0; + return pkt_type; + } + proto = ip4h->next_proto_id; + pkt_type |= ptype_l4(proto); + } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) { + const struct ipv6_hdr *ip6h; + struct ipv6_hdr ip6h_copy; + int frag = 0; + + ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy); + if (unlikely(ip6h == NULL)) + return pkt_type; + + proto = ip6h->proto; + hdr_lens->l3_len = sizeof(*ip6h); + off += hdr_lens->l3_len; + pkt_type |= ptype_l3_ip6(proto); + if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) { + ret = rte_net_skip_ip6_ext(proto, m, &off, &frag); + if (ret < 0) + return pkt_type; + proto = ret; + hdr_lens->l3_len = off - hdr_lens->l2_len; + } + if (proto == 0) + return pkt_type; + + if ((layers & RTE_PTYPE_L4_MASK) == 0) + return pkt_type; + + if (frag) { + pkt_type |= RTE_PTYPE_L4_FRAG; + hdr_lens->l4_len = 0; + return pkt_type; + } + pkt_type |= ptype_l4(proto); + } + + if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) { + hdr_lens->l4_len = sizeof(struct udp_hdr); + return pkt_type; + } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) { + const struct tcp_hdr *th; + struct tcp_hdr th_copy; + + th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy); + if (unlikely(th == NULL)) + return pkt_type & (RTE_PTYPE_L2_MASK | + RTE_PTYPE_L3_MASK); + hdr_lens->l4_len = (th->data_off & 0xf0) >> 2; + return pkt_type; + } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) { + hdr_lens->l4_len = sizeof(struct sctp_hdr); + return pkt_type; + } else { + uint32_t prev_off = off; + + hdr_lens->l4_len = 0; + + if ((layers & RTE_PTYPE_TUNNEL_MASK) == 0) + return pkt_type; + + pkt_type |= ptype_tunnel(&proto, m, &off); + hdr_lens->tunnel_len = off - prev_off; + } + + /* same job for inner header: we need to duplicate the code + * because the packet types do not have the same value. 
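
[Annotation, not part of the patch] Two of the lengths recorded above are 4-bit counts of 32-bit words: the IPv4 IHL in the low nibble of version_ihl (see ip4_hlen()) and the TCP data offset in the high nibble of data_off, which is why l4_len is computed as (data_off & 0xf0) >> 2. A standalone check with example byte values:

#include <stdint.h>
#include <stdio.h>

static unsigned int ipv4_hdr_len(uint8_t version_ihl)
{
	return (version_ihl & 0x0f) * 4;	/* low nibble, in 32-bit words */
}

static unsigned int tcp_hdr_len(uint8_t data_off)
{
	return (data_off & 0xf0) >> 2;		/* ((x >> 4) words) * 4 bytes */
}

int main(void)
{
	printf("IPv4 0x45 -> %u bytes\n", ipv4_hdr_len(0x45));	/* 20 */
	printf("IPv4 0x46 -> %u bytes\n", ipv4_hdr_len(0x46));	/* 24, has options */
	printf("TCP  0x50 -> %u bytes\n", tcp_hdr_len(0x50));	/* 20 */
	printf("TCP  0xa0 -> %u bytes\n", tcp_hdr_len(0xa0));	/* 40 */
	return 0;
}
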
+ */ + if ((layers & RTE_PTYPE_INNER_L2_MASK) == 0) + return pkt_type; + + hdr_lens->inner_l2_len = 0; + if (proto == rte_cpu_to_be_16(ETHER_TYPE_TEB)) { + eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy); + if (unlikely(eh == NULL)) + return pkt_type; + pkt_type |= RTE_PTYPE_INNER_L2_ETHER; + proto = eh->ether_type; + off += sizeof(*eh); + hdr_lens->inner_l2_len = sizeof(*eh); + } + + if (proto == rte_cpu_to_be_16(ETHER_TYPE_VLAN)) { + const struct vlan_hdr *vh; + struct vlan_hdr vh_copy; + + pkt_type &= ~RTE_PTYPE_INNER_L2_MASK; + pkt_type |= RTE_PTYPE_INNER_L2_ETHER_VLAN; + vh = rte_pktmbuf_read(m, off, sizeof(*vh), &vh_copy); + if (unlikely(vh == NULL)) + return pkt_type; + off += sizeof(*vh); + hdr_lens->inner_l2_len += sizeof(*vh); + proto = vh->eth_proto; + } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_QINQ)) { + const struct vlan_hdr *vh; + struct vlan_hdr vh_copy; + + pkt_type &= ~RTE_PTYPE_INNER_L2_MASK; + pkt_type |= RTE_PTYPE_INNER_L2_ETHER_QINQ; + vh = rte_pktmbuf_read(m, off + sizeof(*vh), sizeof(*vh), + &vh_copy); + if (unlikely(vh == NULL)) + return pkt_type; + off += 2 * sizeof(*vh); + hdr_lens->inner_l2_len += 2 * sizeof(*vh); + proto = vh->eth_proto; + } + + if ((layers & RTE_PTYPE_INNER_L3_MASK) == 0) + return pkt_type; + + if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) { + const struct ipv4_hdr *ip4h; + struct ipv4_hdr ip4h_copy; + + ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy); + if (unlikely(ip4h == NULL)) + return pkt_type; + + pkt_type |= ptype_inner_l3_ip(ip4h->version_ihl); + hdr_lens->inner_l3_len = ip4_hlen(ip4h); + off += hdr_lens->inner_l3_len; + + if ((layers & RTE_PTYPE_INNER_L4_MASK) == 0) + return pkt_type; + if (ip4h->fragment_offset & + rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK | + IPV4_HDR_MF_FLAG)) { + pkt_type |= RTE_PTYPE_INNER_L4_FRAG; + hdr_lens->inner_l4_len = 0; + return pkt_type; + } + proto = ip4h->next_proto_id; + pkt_type |= ptype_inner_l4(proto); + } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) { + const struct ipv6_hdr *ip6h; + struct ipv6_hdr ip6h_copy; + int frag = 0; + + ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy); + if (unlikely(ip6h == NULL)) + return pkt_type; + + proto = ip6h->proto; + hdr_lens->inner_l3_len = sizeof(*ip6h); + off += hdr_lens->inner_l3_len; + pkt_type |= ptype_inner_l3_ip6(proto); + if ((pkt_type & RTE_PTYPE_INNER_L3_MASK) == + RTE_PTYPE_INNER_L3_IPV6_EXT) { + uint32_t prev_off; + + prev_off = off; + ret = rte_net_skip_ip6_ext(proto, m, &off, &frag); + if (ret < 0) + return pkt_type; + proto = ret; + hdr_lens->inner_l3_len += off - prev_off; + } + if (proto == 0) + return pkt_type; + + if ((layers & RTE_PTYPE_INNER_L4_MASK) == 0) + return pkt_type; + + if (frag) { + pkt_type |= RTE_PTYPE_INNER_L4_FRAG; + hdr_lens->inner_l4_len = 0; + return pkt_type; + } + pkt_type |= ptype_inner_l4(proto); + } + + if ((pkt_type & RTE_PTYPE_INNER_L4_MASK) == RTE_PTYPE_INNER_L4_UDP) { + hdr_lens->inner_l4_len = sizeof(struct udp_hdr); + } else if ((pkt_type & RTE_PTYPE_INNER_L4_MASK) == + RTE_PTYPE_INNER_L4_TCP) { + const struct tcp_hdr *th; + struct tcp_hdr th_copy; + + th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy); + if (unlikely(th == NULL)) + return pkt_type & (RTE_PTYPE_INNER_L2_MASK | + RTE_PTYPE_INNER_L3_MASK); + hdr_lens->inner_l4_len = (th->data_off & 0xf0) >> 2; + } else if ((pkt_type & RTE_PTYPE_INNER_L4_MASK) == + RTE_PTYPE_INNER_L4_SCTP) { + hdr_lens->inner_l4_len = sizeof(struct sctp_hdr); + } else { + hdr_lens->inner_l4_len = 0; + } + + return pkt_type; +} diff --git 
a/src/spdk/dpdk/lib/librte_net/rte_net.h b/src/spdk/dpdk/lib/librte_net/rte_net.h new file mode 100644 index 00000000..b6ab6e1d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_net.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2016 6WIND S.A. + */ + +#ifndef _RTE_NET_PTYPE_H_ +#define _RTE_NET_PTYPE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +/** + * Structure containing header lengths associated to a packet, filled + * by rte_net_get_ptype(). + */ +struct rte_net_hdr_lens { + uint8_t l2_len; + uint8_t l3_len; + uint8_t l4_len; + uint8_t tunnel_len; + uint8_t inner_l2_len; + uint8_t inner_l3_len; + uint8_t inner_l4_len; +}; + +/** + * Skip IPv6 header extensions. + * + * This function skips all IPv6 extensions, returning size of + * complete header including options and final protocol value. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * @param proto + * Protocol field of IPv6 header. + * @param m + * The packet mbuf to be parsed. + * @param off + * On input, must contain the offset to the first byte following + * IPv6 header, on output, contains offset to the first byte + * of next layer (after any IPv6 extension header) + * @param frag + * Contains 1 in output if packet is an IPv6 fragment. + * @return + * Protocol that follows IPv6 header. + * -1 if an error occurs during mbuf parsing. + */ +int __rte_experimental +rte_net_skip_ip6_ext(uint16_t proto, const struct rte_mbuf *m, uint32_t *off, + int *frag); + +/** + * Parse an Ethernet packet to get its packet type. + * + * This function parses the network headers in mbuf data and return its + * packet type. + * + * If it is provided by the user, it also fills a rte_net_hdr_lens + * structure that contains the lengths of the parsed network + * headers. Each length field is valid only if the associated packet + * type is set. For instance, hdr_lens->l2_len is valid only if + * (retval & RTE_PTYPE_L2_MASK) != RTE_PTYPE_UNKNOWN. + * + * Supported packet types are: + * L2: Ether, Vlan, QinQ + * L3: IPv4, IPv6 + * L4: TCP, UDP, SCTP + * Tunnels: IPv4, IPv6, Gre, Nvgre + * + * @param m + * The packet mbuf to be parsed. + * @param hdr_lens + * A pointer to a structure where the header lengths will be returned, + * or NULL. + * @param layers + * List of layers to parse. The function will stop at the first + * empty layer. Examples: + * - To parse all known layers, use RTE_PTYPE_ALL_MASK. + * - To parse only L2 and L3, use RTE_PTYPE_L2_MASK | RTE_PTYPE_L3_MASK + * @return + * The packet type of the packet. + */ +uint32_t rte_net_get_ptype(const struct rte_mbuf *m, + struct rte_net_hdr_lens *hdr_lens, uint32_t layers); + +/** + * Prepare pseudo header checksum + * + * This function prepares pseudo header checksum for TSO and non-TSO tcp/udp in + * provided mbufs packet data and based on the requested offload flags. + * + * - for non-TSO tcp/udp packets full pseudo-header checksum is counted and set + * in packet data, + * - for TSO the IP payload length is not included in pseudo header. + * + * This function expects that used headers are in the first data segment of + * mbuf, are not fragmented and can be safely modified. + * + * @param m + * The packet mbuf to be fixed. + * @param ol_flags + * TX offloads flags to use with this packet. 
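
[Annotation, not part of the patch] As a usage illustration only, a receive path might combine rte_net_get_ptype() above with the reported header lengths as in the sketch below. It assumes an initialized DPDK environment, that the packet headers sit in the first mbuf segment, and uses only the headers added by this patch:

#include <rte_mbuf.h>
#include <rte_net.h>
#include <rte_tcp.h>

void inspect(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens lens;
	uint32_t ptype = rte_net_get_ptype(m, &lens, RTE_PTYPE_ALL_MASK);

	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
		const struct tcp_hdr *th = rte_pktmbuf_mtod_offset(m,
			const struct tcp_hdr *, lens.l2_len + lens.l3_len);

		/* th points at the TCP header of the first segment;
		 * lens.l4_len gives its size including options. */
		(void)th;
	}
}
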
+ * @return + * 0 if checksum is initialized properly + */ +static inline int +rte_net_intel_cksum_flags_prepare(struct rte_mbuf *m, uint64_t ol_flags) +{ + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + struct tcp_hdr *tcp_hdr; + struct udp_hdr *udp_hdr; + uint64_t inner_l3_offset = m->l2_len; + + if ((ol_flags & PKT_TX_OUTER_IP_CKSUM) || + (ol_flags & PKT_TX_OUTER_IPV6)) + inner_l3_offset += m->outer_l2_len + m->outer_l3_len; + + if ((ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) { + if (ol_flags & PKT_TX_IPV4) { + ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, + inner_l3_offset); + + if (ol_flags & PKT_TX_IP_CKSUM) + ipv4_hdr->hdr_checksum = 0; + + udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr + + m->l3_len); + udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr, + ol_flags); + } else { + ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, + inner_l3_offset); + /* non-TSO udp */ + udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, + inner_l3_offset + m->l3_len); + udp_hdr->dgram_cksum = rte_ipv6_phdr_cksum(ipv6_hdr, + ol_flags); + } + } else if ((ol_flags & PKT_TX_TCP_CKSUM) || + (ol_flags & PKT_TX_TCP_SEG)) { + if (ol_flags & PKT_TX_IPV4) { + ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, + inner_l3_offset); + + if (ol_flags & PKT_TX_IP_CKSUM) + ipv4_hdr->hdr_checksum = 0; + + /* non-TSO tcp or TSO */ + tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + + m->l3_len); + tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr, + ol_flags); + } else { + ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, + inner_l3_offset); + /* non-TSO tcp or TSO */ + tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, + inner_l3_offset + m->l3_len); + tcp_hdr->cksum = rte_ipv6_phdr_cksum(ipv6_hdr, + ol_flags); + } + } + + return 0; +} + +/** + * Prepare pseudo header checksum + * + * This function prepares pseudo header checksum for TSO and non-TSO tcp/udp in + * provided mbufs packet data. + * + * - for non-TSO tcp/udp packets full pseudo-header checksum is counted and set + * in packet data, + * - for TSO the IP payload length is not included in pseudo header. + * + * This function expects that used headers are in the first data segment of + * mbuf, are not fragmented and can be safely modified. + * + * @param m + * The packet mbuf to be fixed. 
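
[Annotation, not part of the patch] A hedged sketch of how a transmit path might use rte_net_intel_cksum_flags_prepare() above, here through the rte_net_intel_cksum_prepare() wrapper that follows: the application fills the offload metadata, the helper clears the IPv4 header checksum and seeds the TCP checksum with the pseudo-header value so the NIC can complete it. It assumes the standard DPDK mbuf offload flags and a plain Ethernet + IPv4 + TCP layout with no tunneling:

#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_net.h>

int prepare_tcp_tx(struct rte_mbuf *m)
{
	/* describe the header layout and request the offloads */
	m->l2_len = sizeof(struct ether_hdr);
	m->l3_len = sizeof(struct ipv4_hdr);
	m->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM;

	/* zeroes ipv4_hdr->hdr_checksum and writes the pseudo-header
	 * checksum into tcp_hdr->cksum, as the hardware expects */
	return rte_net_intel_cksum_prepare(m);
}
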
+ * @return + * 0 if checksum is initialized properly + */ +static inline int +rte_net_intel_cksum_prepare(struct rte_mbuf *m) +{ + return rte_net_intel_cksum_flags_prepare(m, m->ol_flags); +} + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_NET_PTYPE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_net_crc.c b/src/spdk/dpdk/lib/librte_net/rte_net_crc.c new file mode 100644 index 00000000..73ac3a95 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_net_crc.c @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include +#include +#include + +#include +#include +#include + +#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) +#define X86_64_SSE42_PCLMULQDQ 1 +#elif defined(RTE_ARCH_ARM64) && defined(RTE_MACHINE_CPUFLAG_PMULL) +#define ARM64_NEON_PMULL 1 +#endif + +#ifdef X86_64_SSE42_PCLMULQDQ +#include +#elif defined ARM64_NEON_PMULL +#include +#endif + +/* crc tables */ +static uint32_t crc32_eth_lut[CRC_LUT_SIZE]; +static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE]; + +static uint32_t +rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len); + +static uint32_t +rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len); + +typedef uint32_t +(*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len); + +static rte_net_crc_handler *handlers; + +static rte_net_crc_handler handlers_scalar[] = { + [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler, + [RTE_NET_CRC32_ETH] = rte_crc32_eth_handler, +}; + +#ifdef X86_64_SSE42_PCLMULQDQ +static rte_net_crc_handler handlers_sse42[] = { + [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler, + [RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler, +}; +#elif defined ARM64_NEON_PMULL +static rte_net_crc_handler handlers_neon[] = { + [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler, + [RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler, +}; +#endif + +/** + * Reflect the bits about the middle + * + * @param val + * value to be reflected + * + * @return + * reflected value + */ +static uint32_t +reflect_32bits(uint32_t val) +{ + uint32_t i, res = 0; + + for (i = 0; i < 32; i++) + if ((val & (1 << i)) != 0) + res |= (uint32_t)(1 << (31 - i)); + + return res; +} + +static void +crc32_eth_init_lut(uint32_t poly, + uint32_t *lut) +{ + uint32_t i, j; + + for (i = 0; i < CRC_LUT_SIZE; i++) { + uint32_t crc = reflect_32bits(i); + + for (j = 0; j < 8; j++) { + if (crc & 0x80000000L) + crc = (crc << 1) ^ poly; + else + crc <<= 1; + } + lut[i] = reflect_32bits(crc); + } +} + +static __rte_always_inline uint32_t +crc32_eth_calc_lut(const uint8_t *data, + uint32_t data_len, + uint32_t crc, + const uint32_t *lut) +{ + while (data_len--) + crc = lut[(crc ^ *data++) & 0xffL] ^ (crc >> 8); + + return crc; +} + +static void +rte_net_crc_scalar_init(void) +{ + /* 32-bit crc init */ + crc32_eth_init_lut(CRC32_ETH_POLYNOMIAL, crc32_eth_lut); + + /* 16-bit CRC init */ + crc32_eth_init_lut(CRC16_CCITT_POLYNOMIAL << 16, crc16_ccitt_lut); +} + +static inline uint32_t +rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len) +{ + /* return 16-bit CRC value */ + return (uint16_t)~crc32_eth_calc_lut(data, + data_len, + 0xffff, + crc16_ccitt_lut); +} + +static inline uint32_t +rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len) +{ + /* return 32-bit CRC value */ + return ~crc32_eth_calc_lut(data, + data_len, + 0xffffffffUL, + crc32_eth_lut); +} + +void +rte_net_crc_set_alg(enum rte_net_crc_alg alg) +{ + switch (alg) { +#ifdef X86_64_SSE42_PCLMULQDQ + case RTE_NET_CRC_SSE42: + 
handlers = handlers_sse42; + break; +#elif defined ARM64_NEON_PMULL + /* fall-through */ + case RTE_NET_CRC_NEON: + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) { + handlers = handlers_neon; + break; + } +#endif + /* fall-through */ + case RTE_NET_CRC_SCALAR: + /* fall-through */ + default: + handlers = handlers_scalar; + break; + } +} + +uint32_t +rte_net_crc_calc(const void *data, + uint32_t data_len, + enum rte_net_crc_type type) +{ + uint32_t ret; + rte_net_crc_handler f_handle; + + f_handle = handlers[type]; + ret = f_handle(data, data_len); + + return ret; +} + +/* Select highest available crc algorithm as default one */ +RTE_INIT(rte_net_crc_init) +{ + enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR; + + rte_net_crc_scalar_init(); + +#ifdef X86_64_SSE42_PCLMULQDQ + alg = RTE_NET_CRC_SSE42; + rte_net_crc_sse42_init(); +#elif defined ARM64_NEON_PMULL + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) { + alg = RTE_NET_CRC_NEON; + rte_net_crc_neon_init(); + } +#endif + + rte_net_crc_set_alg(alg); +} diff --git a/src/spdk/dpdk/lib/librte_net/rte_net_crc.h b/src/spdk/dpdk/lib/librte_net/rte_net_crc.h new file mode 100644 index 00000000..8a86f297 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_net_crc.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#ifndef _RTE_NET_CRC_H_ +#define _RTE_NET_CRC_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** CRC polynomials */ +#define CRC32_ETH_POLYNOMIAL 0x04c11db7UL +#define CRC16_CCITT_POLYNOMIAL 0x1021U + +#define CRC_LUT_SIZE 256 + +/** CRC types */ +enum rte_net_crc_type { + RTE_NET_CRC16_CCITT = 0, + RTE_NET_CRC32_ETH, + RTE_NET_CRC_REQS +}; + +/** CRC compute algorithm */ +enum rte_net_crc_alg { + RTE_NET_CRC_SCALAR = 0, + RTE_NET_CRC_SSE42, + RTE_NET_CRC_NEON, +}; + +/** + * This API set the CRC computation algorithm (i.e. scalar version, + * x86 64-bit sse4.2 intrinsic version, etc.) and internal data + * structure. + * + * @param alg + * This parameter is used to select the CRC implementation version. + * - RTE_NET_CRC_SCALAR + * - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic) + * - RTE_NET_CRC_NEON (Use ARM Neon intrinsic) + */ +void +rte_net_crc_set_alg(enum rte_net_crc_alg alg); + +/** + * CRC compute API + * + * @param data + * Pointer to the packet data for CRC computation + * @param data_len + * Data length for CRC computation + * @param type + * CRC type (enum rte_net_crc_type) + * + * @return + * CRC value + */ +uint32_t +rte_net_crc_calc(const void *data, + uint32_t data_len, + enum rte_net_crc_type type); + +#ifdef __cplusplus +} +#endif + + +#endif /* _RTE_NET_CRC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_net_version.map b/src/spdk/dpdk/lib/librte_net/rte_net_version.map new file mode 100644 index 00000000..26c06e7c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_net_version.map @@ -0,0 +1,21 @@ +DPDK_16.11 { + global: + rte_net_get_ptype; + + local: *; +}; + +DPDK_17.05 { + global: + + rte_net_crc_calc; + rte_net_crc_set_alg; + +} DPDK_16.11; + +EXPERIMENTAL { + global: + + rte_net_make_rarp_packet; + rte_net_skip_ip6_ext; +}; diff --git a/src/spdk/dpdk/lib/librte_net/rte_sctp.h b/src/spdk/dpdk/lib/librte_net/rte_sctp.h new file mode 100644 index 00000000..bfb7165a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_sctp.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. + * Copyright(c) 2010-2014 Intel Corporation. 
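
[Annotation, not part of the patch] The scalar CRC path in rte_net_crc.c above is a classic reflected, table-driven CRC-32 (polynomial 0x04c11db7, initial value 0xffffffff, final complement). The standalone sketch below builds an equivalent table directly from the reversed polynomial instead of reflecting each entry the way crc32_eth_init_lut() does; the two constructions yield the same table. The well-known check value for the ASCII string "123456789" is 0xcbf43926, and with the library itself the same result comes from rte_net_crc_calc(buf, len, RTE_NET_CRC32_ETH).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t lut[256];

static void init_lut(void)
{
	/* reflected table: process bits LSB-first with the reversed poly */
	for (uint32_t i = 0; i < 256; i++) {
		uint32_t crc = i;

		for (int j = 0; j < 8; j++)
			crc = (crc & 1) ? (crc >> 1) ^ 0xedb88320u : crc >> 1;
		lut[i] = crc;
	}
}

static uint32_t crc32_eth(const uint8_t *data, size_t len)
{
	uint32_t crc = 0xffffffffu;

	while (len--)
		crc = lut[(crc ^ *data++) & 0xff] ^ (crc >> 8);
	return ~crc;	/* final complement */
}

int main(void)
{
	const char *msg = "123456789";

	init_lut();
	printf("crc = 0x%08x (expect 0xcbf43926)\n",
	       crc32_eth((const uint8_t *)msg, strlen(msg)));
	return 0;
}
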
+ * All rights reserved. + */ + +/** + * @file + * + * SCTP-related defines + */ + +#ifndef _RTE_SCTP_H_ +#define _RTE_SCTP_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * SCTP Header + */ +struct sctp_hdr { + uint16_t src_port; /**< Source port. */ + uint16_t dst_port; /**< Destin port. */ + uint32_t tag; /**< Validation tag. */ + uint32_t cksum; /**< Checksum. */ +} __attribute__((__packed__)); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_SCTP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_tcp.h b/src/spdk/dpdk/lib/librte_net/rte_tcp.h new file mode 100644 index 00000000..91f58987 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_tcp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. + * Copyright(c) 2010-2014 Intel Corporation. + * All rights reserved. + */ + +#ifndef _RTE_TCP_H_ +#define _RTE_TCP_H_ + +/** + * @file + * + * TCP-related defines + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * TCP Header + */ +struct tcp_hdr { + uint16_t src_port; /**< TCP source port. */ + uint16_t dst_port; /**< TCP destination port. */ + uint32_t sent_seq; /**< TX data sequence number. */ + uint32_t recv_ack; /**< RX data acknowledgement sequence number. */ + uint8_t data_off; /**< Data offset. */ + uint8_t tcp_flags; /**< TCP flags */ + uint16_t rx_win; /**< RX flow control window. */ + uint16_t cksum; /**< TCP checksum. */ + uint16_t tcp_urp; /**< TCP urgent pointer, if any. */ +} __attribute__((__packed__)); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_TCP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_net/rte_udp.h b/src/spdk/dpdk/lib/librte_net/rte_udp.h new file mode 100644 index 00000000..ba033955 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_net/rte_udp.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. + * Copyright(c) 2010-2014 Intel Corporation. + * All rights reserved. + */ + +#ifndef _RTE_UDP_H_ +#define _RTE_UDP_H_ + +/** + * @file + * + * UDP-related defines + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * UDP Header + */ +struct udp_hdr { + uint16_t src_port; /**< UDP source port. */ + uint16_t dst_port; /**< UDP destination port. */ + uint16_t dgram_len; /**< UDP datagram length */ + uint16_t dgram_cksum; /**< UDP datagram checksum */ +} __attribute__((__packed__)); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_UDP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_pci/Makefile b/src/spdk/dpdk/lib/librte_pci/Makefile new file mode 100644 index 00000000..94a63267 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pci/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 6WIND S.A. 
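
[Annotation, not part of the patch] The wire-format headers added above (rte_sctp.h, rte_tcp.h, rte_udp.h) are packed structs whose fields stay in network byte order, so readers must swap 16-bit values on little-endian hosts (rte_be_to_cpu_16() in DPDK, ntohs() in plain POSIX code). A standalone sketch with a made-up UDP header carrying source port 53, destination port 33000, length 36:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct udp_hdr_example {
	uint16_t src_port;
	uint16_t dst_port;
	uint16_t dgram_len;
	uint16_t dgram_cksum;
} __attribute__((__packed__));

int main(void)
{
	const uint8_t wire[8] = { 0x00, 0x35, 0x80, 0xe8, 0x00, 0x24, 0x00, 0x00 };
	struct udp_hdr_example h;

	memcpy(&h, wire, sizeof(h));	/* copy out of the "packet" */
	printf("src=%u dst=%u len=%u\n",
	       ntohs(h.src_port), ntohs(h.dst_port), ntohs(h.dgram_len));
	return 0;
}
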
+ +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_pci.a + +CFLAGS := -I$(SRCDIR) $(CFLAGS) +CFLAGS += $(WERROR_FLAGS) -O3 +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_pci_version.map + +LIBABIVER := 1 + +SRCS-$(CONFIG_RTE_LIBRTE_PCI) += rte_pci.c + +SYMLINK-$(CONFIG_RTE_LIBRTE_PCI)-include += rte_pci.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_pci/meson.build b/src/spdk/dpdk/lib/librte_pci/meson.build new file mode 100644 index 00000000..dd41cd50 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pci/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_pci.c') +headers = files('rte_pci.h') diff --git a/src/spdk/dpdk/lib/librte_pci/rte_pci.c b/src/spdk/dpdk/lib/librte_pci/rte_pci.c new file mode 100644 index 00000000..530738db --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pci/rte_pci.c @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright 2013-2014 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_pci.h" + +static inline const char * +get_u8_pciaddr_field(const char *in, void *_u8, char dlm) +{ + unsigned long val; + uint8_t *u8 = _u8; + char *end; + + errno = 0; + val = strtoul(in, &end, 16); + if (errno != 0 || end[0] != dlm || val > UINT8_MAX) { + errno = errno ? errno : EINVAL; + return NULL; + } + *u8 = (uint8_t)val; + return end + 1; +} + +static int +pci_bdf_parse(const char *input, struct rte_pci_addr *dev_addr) +{ + const char *in = input; + + dev_addr->domain = 0; + in = get_u8_pciaddr_field(in, &dev_addr->bus, ':'); + if (in == NULL) + return -EINVAL; + in = get_u8_pciaddr_field(in, &dev_addr->devid, '.'); + if (in == NULL) + return -EINVAL; + in = get_u8_pciaddr_field(in, &dev_addr->function, '\0'); + if (in == NULL) + return -EINVAL; + return 0; +} + +static int +pci_dbdf_parse(const char *input, struct rte_pci_addr *dev_addr) +{ + const char *in = input; + unsigned long val; + char *end; + + errno = 0; + val = strtoul(in, &end, 16); + if (errno != 0 || end[0] != ':' || val > UINT16_MAX) + return -EINVAL; + dev_addr->domain = (uint16_t)val; + in = end + 1; + in = get_u8_pciaddr_field(in, &dev_addr->bus, ':'); + if (in == NULL) + return -EINVAL; + in = get_u8_pciaddr_field(in, &dev_addr->devid, '.'); + if (in == NULL) + return -EINVAL; + in = get_u8_pciaddr_field(in, &dev_addr->function, '\0'); + if (in == NULL) + return -EINVAL; + return 0; +} + +int +eal_parse_pci_BDF(const char *input, struct rte_pci_addr *dev_addr) +{ + return pci_bdf_parse(input, dev_addr); +} + +int +eal_parse_pci_DomBDF(const char *input, struct rte_pci_addr *dev_addr) +{ + return pci_dbdf_parse(input, dev_addr); +} + +void +rte_pci_device_name(const struct rte_pci_addr *addr, + char *output, size_t size) +{ + RTE_VERIFY(size >= PCI_PRI_STR_SIZE); + RTE_VERIFY(snprintf(output, size, PCI_PRI_FMT, + addr->domain, addr->bus, + addr->devid, addr->function) >= 0); +} + +int +rte_eal_compare_pci_addr(const struct rte_pci_addr *addr, + const struct rte_pci_addr *addr2) +{ + return rte_pci_addr_cmp(addr, addr2); +} + +int +rte_pci_addr_cmp(const struct rte_pci_addr *addr, + const struct rte_pci_addr *addr2) +{ + uint64_t dev_addr, dev_addr2; + + if ((addr == NULL) || (addr2 == NULL)) + return -1; + + dev_addr = ((uint64_t)addr->domain << 24) | + (addr->bus << 16) | (addr->devid << 8) | 
addr->function; + dev_addr2 = ((uint64_t)addr2->domain << 24) | + (addr2->bus << 16) | (addr2->devid << 8) | addr2->function; + + if (dev_addr > dev_addr2) + return 1; + else if (dev_addr < dev_addr2) + return -1; + else + return 0; +} + +int +rte_pci_addr_parse(const char *str, struct rte_pci_addr *addr) +{ + if (pci_bdf_parse(str, addr) == 0 || + pci_dbdf_parse(str, addr) == 0) + return 0; + return -1; +} + + +/* map a particular resource from a file */ +void * +pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size, + int additional_flags) +{ + void *mapaddr; + + /* Map the PCI memory resource of device */ + mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE, + MAP_SHARED | additional_flags, fd, offset); + if (mapaddr == MAP_FAILED) { + RTE_LOG(ERR, EAL, + "%s(): cannot mmap(%d, %p, 0x%zx, 0x%llx): %s (%p)\n", + __func__, fd, requested_addr, size, + (unsigned long long)offset, + strerror(errno), mapaddr); + } else + RTE_LOG(DEBUG, EAL, " PCI memory mapped at %p\n", mapaddr); + + return mapaddr; +} + +/* unmap a particular resource */ +void +pci_unmap_resource(void *requested_addr, size_t size) +{ + if (requested_addr == NULL) + return; + + /* Unmap the PCI memory resource of device */ + if (munmap(requested_addr, size)) { + RTE_LOG(ERR, EAL, "%s(): cannot munmap(%p, %#zx): %s\n", + __func__, requested_addr, size, + strerror(errno)); + } else + RTE_LOG(DEBUG, EAL, " PCI memory unmapped at %p\n", + requested_addr); +} diff --git a/src/spdk/dpdk/lib/librte_pci/rte_pci.h b/src/spdk/dpdk/lib/librte_pci/rte_pci.h new file mode 100644 index 00000000..eaa9d072 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pci/rte_pci.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2015 Intel Corporation. + * Copyright 2013-2014 6WIND S.A. + */ + +#ifndef _RTE_PCI_H_ +#define _RTE_PCI_H_ + +/** + * @file + * + * RTE PCI Library + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/** Formatting string for PCI device identifier: Ex: 0000:00:01.0 */ +#define PCI_PRI_FMT "%.4" PRIx16 ":%.2" PRIx8 ":%.2" PRIx8 ".%" PRIx8 +#define PCI_PRI_STR_SIZE sizeof("XXXXXXXX:XX:XX.X") + +/** Short formatting string, without domain, for PCI device: Ex: 00:01.0 */ +#define PCI_SHORT_PRI_FMT "%.2" PRIx8 ":%.2" PRIx8 ".%" PRIx8 + +/** Nb. of values in PCI device identifier format string. */ +#define PCI_FMT_NVAL 4 + +/** Nb. of values in PCI resource format. */ +#define PCI_RESOURCE_FMT_NVAL 3 + +/** Maximum number of PCI resources. */ +#define PCI_MAX_RESOURCE 6 + +/** + * A structure describing an ID for a PCI driver. Each driver provides a + * table of these IDs for each device that it supports. + */ +struct rte_pci_id { + uint32_t class_id; /**< Class ID or RTE_CLASS_ANY_ID. */ + uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */ + uint16_t device_id; /**< Device ID or PCI_ANY_ID. */ + uint16_t subsystem_vendor_id; /**< Subsystem vendor ID or PCI_ANY_ID. */ + uint16_t subsystem_device_id; /**< Subsystem device ID or PCI_ANY_ID. */ +}; + +/** + * A structure describing the location of a PCI device. + */ +struct rte_pci_addr { + uint32_t domain; /**< Device domain */ + uint8_t bus; /**< Device bus */ + uint8_t devid; /**< Device ID */ + uint8_t function; /**< Device function. */ +}; + +/** Any PCI device identifier (vendor, device, ...) */ +#define PCI_ANY_ID (0xffff) +#define RTE_CLASS_ANY_ID (0xffffff) + +/** + * A structure describing a PCI mapping. 
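
[Annotation, not part of the patch] pci_bdf_parse() and pci_dbdf_parse() above walk the address string with strtoul(), checking errno, the expected delimiter, and the field's maximum after each step. The standalone sketch below applies the same walk to the DDDD:BB:DD.F form; the struct and function names are illustrative, not the library's:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct pci_addr_example {
	uint16_t domain;
	uint8_t bus, devid, function;
};

static const char *
field(const char *in, unsigned long *val, char dlm, unsigned long max)
{
	char *end;

	errno = 0;
	*val = strtoul(in, &end, 16);
	if (errno != 0 || *end != dlm || *val > max)
		return NULL;
	return end + 1;	/* skip the delimiter */
}

static int parse_dbdf(const char *s, struct pci_addr_example *a)
{
	unsigned long v;

	s = field(s, &v, ':', 0xffff);
	if (s == NULL)
		return -1;
	a->domain = (uint16_t)v;
	s = field(s, &v, ':', 0xff);
	if (s == NULL)
		return -1;
	a->bus = (uint8_t)v;
	s = field(s, &v, '.', 0xff);
	if (s == NULL)
		return -1;
	a->devid = (uint8_t)v;
	s = field(s, &v, '\0', 0xff);
	if (s == NULL)
		return -1;
	a->function = (uint8_t)v;
	return 0;
}

int main(void)
{
	struct pci_addr_example a;

	if (parse_dbdf("0000:3b:00.1", &a) == 0)
		printf("%04x:%02x:%02x.%x\n",
		       a.domain, a.bus, a.devid, a.function);
	return 0;
}
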
+ */ +struct pci_map { + void *addr; + char *path; + uint64_t offset; + uint64_t size; + uint64_t phaddr; +}; + +struct pci_msix_table { + int bar_index; + uint32_t offset; + uint32_t size; +}; + +/** + * A structure describing a mapped PCI resource. + * For multi-process we need to reproduce all PCI mappings in secondary + * processes, so save them in a tailq. + */ +struct mapped_pci_resource { + TAILQ_ENTRY(mapped_pci_resource) next; + + struct rte_pci_addr pci_addr; + char path[PATH_MAX]; + int nb_maps; + struct pci_map maps[PCI_MAX_RESOURCE]; + struct pci_msix_table msix_table; +}; + + +/** mapped pci device list */ +TAILQ_HEAD(mapped_pci_res_list, mapped_pci_resource); + +/** + * @deprecated + * Utility function to produce a PCI Bus-Device-Function value + * given a string representation. Assumes that the BDF is provided without + * a domain prefix (i.e. domain returned is always 0) + * + * @param input + * The input string to be parsed. Should have the format XX:XX.X + * @param dev_addr + * The PCI Bus-Device-Function address to be returned. + * Domain will always be returned as 0 + * @return + * 0 on success, negative on error. + */ +int eal_parse_pci_BDF(const char *input, struct rte_pci_addr *dev_addr); + +/** + * @deprecated + * Utility function to produce a PCI Bus-Device-Function value + * given a string representation. Assumes that the BDF is provided including + * a domain prefix. + * + * @param input + * The input string to be parsed. Should have the format XXXX:XX:XX.X + * @param dev_addr + * The PCI Bus-Device-Function address to be returned + * @return + * 0 on success, negative on error. + */ +int eal_parse_pci_DomBDF(const char *input, struct rte_pci_addr *dev_addr); + +/** + * Utility function to write a pci device name, this device name can later be + * used to retrieve the corresponding rte_pci_addr using eal_parse_pci_* + * BDF helpers. + * + * @param addr + * The PCI Bus-Device-Function address + * @param output + * The output buffer string + * @param size + * The output buffer size + */ +void rte_pci_device_name(const struct rte_pci_addr *addr, + char *output, size_t size); + +/** + * @deprecated + * Utility function to compare two PCI device addresses. + * + * @param addr + * The PCI Bus-Device-Function address to compare + * @param addr2 + * The PCI Bus-Device-Function address to compare + * @return + * 0 on equal PCI address. + * Positive on addr is greater than addr2. + * Negative on addr is less than addr2, or error. + */ +int rte_eal_compare_pci_addr(const struct rte_pci_addr *addr, + const struct rte_pci_addr *addr2); + +/** + * Utility function to compare two PCI device addresses. + * + * @param addr + * The PCI Bus-Device-Function address to compare + * @param addr2 + * The PCI Bus-Device-Function address to compare + * @return + * 0 on equal PCI address. + * Positive on addr is greater than addr2. + * Negative on addr is less than addr2, or error. + */ +int rte_pci_addr_cmp(const struct rte_pci_addr *addr, + const struct rte_pci_addr *addr2); + + +/** + * Utility function to parse a string into a PCI location. + * + * @param str + * The string to parse + * @param addr + * The reference to the structure where the location + * is stored. + * @return + * 0 on success + * <0 otherwise + */ +int rte_pci_addr_parse(const char *str, struct rte_pci_addr *addr); + +/** + * Map a particular resource from a file. + * + * @param requested_addr + * The starting address for the new mapping range. + * @param fd + * The file descriptor. 
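
[Annotation, not part of the patch] pci_map_resource() deliberately returns MAP_FAILED rather than NULL on error, matching mmap() itself, so callers must compare against MAP_FAILED. The standalone sketch below shows the same call-and-check pattern against an ordinary file (the path is an arbitrary example) so it can run without PCI hardware:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	int fd = open("/etc/hostname", O_RDONLY);

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;

	void *addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED) {	/* same check pci_map_resource() callers make */
		perror("mmap");
		close(fd);
		return 1;
	}
	printf("mapped %lld bytes at %p\n", (long long)st.st_size, addr);
	munmap(addr, st.st_size);
	close(fd);
	return 0;
}
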
+ * @param offset + * The offset for the mapping range. + * @param size + * The size for the mapping range. + * @param additional_flags + * The additional flags for the mapping range. + * @return + * - On success, the function returns a pointer to the mapped area. + * - On error, the value MAP_FAILED is returned. + */ +void *pci_map_resource(void *requested_addr, int fd, off_t offset, + size_t size, int additional_flags); + +/** + * Unmap a particular resource. + * + * @param requested_addr + * The address for the unmapping range. + * @param size + * The size for the unmapping range. + */ +void pci_unmap_resource(void *requested_addr, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PCI_H_ */ diff --git a/src/spdk/dpdk/lib/librte_pci/rte_pci_version.map b/src/spdk/dpdk/lib/librte_pci/rte_pci_version.map new file mode 100644 index 00000000..c0280277 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pci/rte_pci_version.map @@ -0,0 +1,14 @@ +DPDK_17.11 { + global: + + eal_parse_pci_BDF; + eal_parse_pci_DomBDF; + pci_map_resource; + pci_unmap_resource; + rte_eal_compare_pci_addr; + rte_pci_addr_cmp; + rte_pci_addr_parse; + rte_pci_device_name; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_pdump/Makefile b/src/spdk/dpdk/lib/librte_pdump/Makefile new file mode 100644 index 00000000..0ee0fa1a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pdump/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2016-2018 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_pdump.a + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +CFLAGS += -D_GNU_SOURCE +LDLIBS += -lpthread +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev + +EXPORT_MAP := rte_pdump_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_PDUMP) := rte_pdump.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_PDUMP)-include := rte_pdump.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_pdump/meson.build b/src/spdk/dpdk/lib/librte_pdump/meson.build new file mode 100644 index 00000000..be80904b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pdump/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 2 +sources = files('rte_pdump.c') +headers = files('rte_pdump.h') +allow_experimental_apis = true +deps += ['ethdev'] diff --git a/src/spdk/dpdk/lib/librte_pdump/rte_pdump.c b/src/spdk/dpdk/lib/librte_pdump/rte_pdump.c new file mode 100644 index 00000000..6c3a8858 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pdump/rte_pdump.c @@ -0,0 +1,625 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "rte_pdump.h" + +#define DEVICE_ID_SIZE 64 +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_PDUMP RTE_LOGTYPE_USER1 + +/* Used for the multi-process communication */ +#define PDUMP_MP "mp_pdump" + +enum pdump_operation { + DISABLE = 1, + ENABLE = 2 +}; + +enum pdump_version { + V1 = 1 +}; + +struct pdump_request { + uint16_t ver; + uint16_t op; + uint32_t flags; + union pdump_data { + struct enable_v1 { + char device[DEVICE_ID_SIZE]; + uint16_t queue; + struct rte_ring *ring; + struct rte_mempool *mp; + void *filter; + } en_v1; + struct disable_v1 { + char device[DEVICE_ID_SIZE]; + uint16_t queue; + struct rte_ring *ring; + struct rte_mempool *mp; + void *filter; + } 
dis_v1; + } data; +}; + +struct pdump_response { + uint16_t ver; + uint16_t res_op; + int32_t err_value; +}; + +static struct pdump_rxtx_cbs { + struct rte_ring *ring; + struct rte_mempool *mp; + const struct rte_eth_rxtx_callback *cb; + void *filter; +} rx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT], +tx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT]; + +static inline int +pdump_pktmbuf_copy_data(struct rte_mbuf *seg, const struct rte_mbuf *m) +{ + if (rte_pktmbuf_tailroom(seg) < m->data_len) { + RTE_LOG(ERR, PDUMP, + "User mempool: insufficient data_len of mbuf\n"); + return -EINVAL; + } + + seg->port = m->port; + seg->vlan_tci = m->vlan_tci; + seg->hash = m->hash; + seg->tx_offload = m->tx_offload; + seg->ol_flags = m->ol_flags; + seg->packet_type = m->packet_type; + seg->vlan_tci_outer = m->vlan_tci_outer; + seg->data_len = m->data_len; + seg->pkt_len = seg->data_len; + rte_memcpy(rte_pktmbuf_mtod(seg, void *), + rte_pktmbuf_mtod(m, void *), + rte_pktmbuf_data_len(seg)); + + return 0; +} + +static inline struct rte_mbuf * +pdump_pktmbuf_copy(struct rte_mbuf *m, struct rte_mempool *mp) +{ + struct rte_mbuf *m_dup, *seg, **prev; + uint32_t pktlen; + uint16_t nseg; + + m_dup = rte_pktmbuf_alloc(mp); + if (unlikely(m_dup == NULL)) + return NULL; + + seg = m_dup; + prev = &seg->next; + pktlen = m->pkt_len; + nseg = 0; + + do { + nseg++; + if (pdump_pktmbuf_copy_data(seg, m) < 0) { + if (seg != m_dup) + rte_pktmbuf_free_seg(seg); + rte_pktmbuf_free(m_dup); + return NULL; + } + *prev = seg; + prev = &seg->next; + } while ((m = m->next) != NULL && + (seg = rte_pktmbuf_alloc(mp)) != NULL); + + *prev = NULL; + m_dup->nb_segs = nseg; + m_dup->pkt_len = pktlen; + + /* Allocation of new indirect segment failed */ + if (unlikely(seg == NULL)) { + rte_pktmbuf_free(m_dup); + return NULL; + } + + __rte_mbuf_sanity_check(m_dup, 1); + return m_dup; +} + +static inline void +pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params) +{ + unsigned i; + int ring_enq; + uint16_t d_pkts = 0; + struct rte_mbuf *dup_bufs[nb_pkts]; + struct pdump_rxtx_cbs *cbs; + struct rte_ring *ring; + struct rte_mempool *mp; + struct rte_mbuf *p; + + cbs = user_params; + ring = cbs->ring; + mp = cbs->mp; + for (i = 0; i < nb_pkts; i++) { + p = pdump_pktmbuf_copy(pkts[i], mp); + if (p) + dup_bufs[d_pkts++] = p; + } + + ring_enq = rte_ring_enqueue_burst(ring, (void *)dup_bufs, d_pkts, NULL); + if (unlikely(ring_enq < d_pkts)) { + RTE_LOG(DEBUG, PDUMP, + "only %d of packets enqueued to ring\n", ring_enq); + do { + rte_pktmbuf_free(dup_bufs[ring_enq]); + } while (++ring_enq < d_pkts); + } +} + +static uint16_t +pdump_rx(uint16_t port __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts, uint16_t nb_pkts, + uint16_t max_pkts __rte_unused, + void *user_params) +{ + pdump_copy(pkts, nb_pkts, user_params); + return nb_pkts; +} + +static uint16_t +pdump_tx(uint16_t port __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params) +{ + pdump_copy(pkts, nb_pkts, user_params); + return nb_pkts; +} + +static int +pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, + struct rte_ring *ring, struct rte_mempool *mp, + uint16_t operation) +{ + uint16_t qid; + struct pdump_rxtx_cbs *cbs = NULL; + + qid = (queue == RTE_PDUMP_ALL_QUEUES) ? 
0 : queue; + for (; qid < end_q; qid++) { + cbs = &rx_cbs[port][qid]; + if (cbs && operation == ENABLE) { + if (cbs->cb) { + RTE_LOG(ERR, PDUMP, + "failed to add rx callback for port=%d " + "and queue=%d, callback already exists\n", + port, qid); + return -EEXIST; + } + cbs->ring = ring; + cbs->mp = mp; + cbs->cb = rte_eth_add_first_rx_callback(port, qid, + pdump_rx, cbs); + if (cbs->cb == NULL) { + RTE_LOG(ERR, PDUMP, + "failed to add rx callback, errno=%d\n", + rte_errno); + return rte_errno; + } + } + if (cbs && operation == DISABLE) { + int ret; + + if (cbs->cb == NULL) { + RTE_LOG(ERR, PDUMP, + "failed to delete non existing rx " + "callback for port=%d and queue=%d\n", + port, qid); + return -EINVAL; + } + ret = rte_eth_remove_rx_callback(port, qid, cbs->cb); + if (ret < 0) { + RTE_LOG(ERR, PDUMP, + "failed to remove rx callback, errno=%d\n", + -ret); + return ret; + } + cbs->cb = NULL; + } + } + + return 0; +} + +static int +pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, + struct rte_ring *ring, struct rte_mempool *mp, + uint16_t operation) +{ + + uint16_t qid; + struct pdump_rxtx_cbs *cbs = NULL; + + qid = (queue == RTE_PDUMP_ALL_QUEUES) ? 0 : queue; + for (; qid < end_q; qid++) { + cbs = &tx_cbs[port][qid]; + if (cbs && operation == ENABLE) { + if (cbs->cb) { + RTE_LOG(ERR, PDUMP, + "failed to add tx callback for port=%d " + "and queue=%d, callback already exists\n", + port, qid); + return -EEXIST; + } + cbs->ring = ring; + cbs->mp = mp; + cbs->cb = rte_eth_add_tx_callback(port, qid, pdump_tx, + cbs); + if (cbs->cb == NULL) { + RTE_LOG(ERR, PDUMP, + "failed to add tx callback, errno=%d\n", + rte_errno); + return rte_errno; + } + } + if (cbs && operation == DISABLE) { + int ret; + + if (cbs->cb == NULL) { + RTE_LOG(ERR, PDUMP, + "failed to delete non existing tx " + "callback for port=%d and queue=%d\n", + port, qid); + return -EINVAL; + } + ret = rte_eth_remove_tx_callback(port, qid, cbs->cb); + if (ret < 0) { + RTE_LOG(ERR, PDUMP, + "failed to remove tx callback, errno=%d\n", + -ret); + return ret; + } + cbs->cb = NULL; + } + } + + return 0; +} + +static int +set_pdump_rxtx_cbs(const struct pdump_request *p) +{ + uint16_t nb_rx_q = 0, nb_tx_q = 0, end_q, queue; + uint16_t port; + int ret = 0; + uint32_t flags; + uint16_t operation; + struct rte_ring *ring; + struct rte_mempool *mp; + + flags = p->flags; + operation = p->op; + if (operation == ENABLE) { + ret = rte_eth_dev_get_port_by_name(p->data.en_v1.device, + &port); + if (ret < 0) { + RTE_LOG(ERR, PDUMP, + "failed to get port id for device id=%s\n", + p->data.en_v1.device); + return -EINVAL; + } + queue = p->data.en_v1.queue; + ring = p->data.en_v1.ring; + mp = p->data.en_v1.mp; + } else { + ret = rte_eth_dev_get_port_by_name(p->data.dis_v1.device, + &port); + if (ret < 0) { + RTE_LOG(ERR, PDUMP, + "failed to get port id for device id=%s\n", + p->data.dis_v1.device); + return -EINVAL; + } + queue = p->data.dis_v1.queue; + ring = p->data.dis_v1.ring; + mp = p->data.dis_v1.mp; + } + + /* validation if packet capture is for all queues */ + if (queue == RTE_PDUMP_ALL_QUEUES) { + struct rte_eth_dev_info dev_info; + + rte_eth_dev_info_get(port, &dev_info); + nb_rx_q = dev_info.nb_rx_queues; + nb_tx_q = dev_info.nb_tx_queues; + if (nb_rx_q == 0 && flags & RTE_PDUMP_FLAG_RX) { + RTE_LOG(ERR, PDUMP, + "number of rx queues cannot be 0\n"); + return -EINVAL; + } + if (nb_tx_q == 0 && flags & RTE_PDUMP_FLAG_TX) { + RTE_LOG(ERR, PDUMP, + "number of tx queues cannot be 0\n"); + return -EINVAL; + } + if ((nb_tx_q 
== 0 || nb_rx_q == 0) && + flags == RTE_PDUMP_FLAG_RXTX) { + RTE_LOG(ERR, PDUMP, + "both tx&rx queues must be non zero\n"); + return -EINVAL; + } + } + + /* register RX callback */ + if (flags & RTE_PDUMP_FLAG_RX) { + end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_rx_q : queue + 1; + ret = pdump_register_rx_callbacks(end_q, port, queue, ring, mp, + operation); + if (ret < 0) + return ret; + } + + /* register TX callback */ + if (flags & RTE_PDUMP_FLAG_TX) { + end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_tx_q : queue + 1; + ret = pdump_register_tx_callbacks(end_q, port, queue, ring, mp, + operation); + if (ret < 0) + return ret; + } + + return ret; +} + +static int +pdump_server(const struct rte_mp_msg *mp_msg, const void *peer) +{ + struct rte_mp_msg mp_resp; + const struct pdump_request *cli_req; + struct pdump_response *resp = (struct pdump_response *)&mp_resp.param; + + /* recv client requests */ + if (mp_msg->len_param != sizeof(*cli_req)) { + RTE_LOG(ERR, PDUMP, "failed to recv from client\n"); + resp->err_value = -EINVAL; + } else { + cli_req = (const struct pdump_request *)mp_msg->param; + resp->ver = cli_req->ver; + resp->res_op = cli_req->op; + resp->err_value = set_pdump_rxtx_cbs(cli_req); + } + + strlcpy(mp_resp.name, PDUMP_MP, RTE_MP_MAX_NAME_LEN); + mp_resp.len_param = sizeof(*resp); + mp_resp.num_fds = 0; + if (rte_mp_reply(&mp_resp, peer) < 0) { + RTE_LOG(ERR, PDUMP, "failed to send to client:%s, %s:%d\n", + strerror(rte_errno), __func__, __LINE__); + return -1; + } + + return 0; +} + +int +rte_pdump_init(const char *path __rte_unused) +{ + return rte_mp_action_register(PDUMP_MP, pdump_server); +} + +int +rte_pdump_uninit(void) +{ + rte_mp_action_unregister(PDUMP_MP); + + return 0; +} + +static int +pdump_validate_ring_mp(struct rte_ring *ring, struct rte_mempool *mp) +{ + if (ring == NULL || mp == NULL) { + RTE_LOG(ERR, PDUMP, "NULL ring or mempool are passed %s:%d\n", + __func__, __LINE__); + rte_errno = EINVAL; + return -1; + } + if (mp->flags & MEMPOOL_F_SP_PUT || mp->flags & MEMPOOL_F_SC_GET) { + RTE_LOG(ERR, PDUMP, "mempool with either SP or SC settings" + " is not valid for pdump, should have MP and MC settings\n"); + rte_errno = EINVAL; + return -1; + } + if (ring->prod.single || ring->cons.single) { + RTE_LOG(ERR, PDUMP, "ring with either SP or SC settings" + " is not valid for pdump, should have MP and MC settings\n"); + rte_errno = EINVAL; + return -1; + } + + return 0; +} + +static int +pdump_validate_flags(uint32_t flags) +{ + if (flags != RTE_PDUMP_FLAG_RX && flags != RTE_PDUMP_FLAG_TX && + flags != RTE_PDUMP_FLAG_RXTX) { + RTE_LOG(ERR, PDUMP, + "invalid flags, should be either rx/tx/rxtx\n"); + rte_errno = EINVAL; + return -1; + } + + return 0; +} + +static int +pdump_validate_port(uint16_t port, char *name) +{ + int ret = 0; + + if (port >= RTE_MAX_ETHPORTS) { + RTE_LOG(ERR, PDUMP, "Invalid port id %u, %s:%d\n", port, + __func__, __LINE__); + rte_errno = EINVAL; + return -1; + } + + ret = rte_eth_dev_get_name_by_port(port, name); + if (ret < 0) { + RTE_LOG(ERR, PDUMP, + "port id to name mapping failed for port id=%u, %s:%d\n", + port, __func__, __LINE__); + rte_errno = EINVAL; + return -1; + } + + return 0; +} + +static int +pdump_prepare_client_request(char *device, uint16_t queue, + uint32_t flags, + uint16_t operation, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter) +{ + int ret = -1; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct pdump_request *req = 
(struct pdump_request *)mp_req.param; + struct pdump_response *resp; + + req->ver = 1; + req->flags = flags; + req->op = operation; + if ((operation & ENABLE) != 0) { + snprintf(req->data.en_v1.device, + sizeof(req->data.en_v1.device), "%s", device); + req->data.en_v1.queue = queue; + req->data.en_v1.ring = ring; + req->data.en_v1.mp = mp; + req->data.en_v1.filter = filter; + } else { + snprintf(req->data.dis_v1.device, + sizeof(req->data.dis_v1.device), "%s", device); + req->data.dis_v1.queue = queue; + req->data.dis_v1.ring = NULL; + req->data.dis_v1.mp = NULL; + req->data.dis_v1.filter = NULL; + } + + strlcpy(mp_req.name, PDUMP_MP, RTE_MP_MAX_NAME_LEN); + mp_req.len_param = sizeof(*req); + mp_req.num_fds = 0; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0) { + mp_rep = &mp_reply.msgs[0]; + resp = (struct pdump_response *)mp_rep->param; + rte_errno = resp->err_value; + if (!resp->err_value) + ret = 0; + free(mp_reply.msgs); + } + + if (ret < 0) + RTE_LOG(ERR, PDUMP, + "client request for pdump enable/disable failed\n"); + return ret; +} + +int +rte_pdump_enable(uint16_t port, uint16_t queue, uint32_t flags, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter) +{ + + int ret = 0; + char name[DEVICE_ID_SIZE]; + + ret = pdump_validate_port(port, name); + if (ret < 0) + return ret; + ret = pdump_validate_ring_mp(ring, mp); + if (ret < 0) + return ret; + ret = pdump_validate_flags(flags); + if (ret < 0) + return ret; + + ret = pdump_prepare_client_request(name, queue, flags, + ENABLE, ring, mp, filter); + + return ret; +} + +int +rte_pdump_enable_by_deviceid(char *device_id, uint16_t queue, + uint32_t flags, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter) +{ + int ret = 0; + + ret = pdump_validate_ring_mp(ring, mp); + if (ret < 0) + return ret; + ret = pdump_validate_flags(flags); + if (ret < 0) + return ret; + + ret = pdump_prepare_client_request(device_id, queue, flags, + ENABLE, ring, mp, filter); + + return ret; +} + +int +rte_pdump_disable(uint16_t port, uint16_t queue, uint32_t flags) +{ + int ret = 0; + char name[DEVICE_ID_SIZE]; + + ret = pdump_validate_port(port, name); + if (ret < 0) + return ret; + ret = pdump_validate_flags(flags); + if (ret < 0) + return ret; + + ret = pdump_prepare_client_request(name, queue, flags, + DISABLE, NULL, NULL, NULL); + + return ret; +} + +int +rte_pdump_disable_by_deviceid(char *device_id, uint16_t queue, + uint32_t flags) +{ + int ret = 0; + + ret = pdump_validate_flags(flags); + if (ret < 0) + return ret; + + ret = pdump_prepare_client_request(device_id, queue, flags, + DISABLE, NULL, NULL, NULL); + + return ret; +} + +int +rte_pdump_set_socket_dir(const char *path __rte_unused, + enum rte_pdump_socktype type __rte_unused) +{ + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_pdump/rte_pdump.h b/src/spdk/dpdk/lib/librte_pdump/rte_pdump.h new file mode 100644 index 00000000..673a2b07 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pdump/rte_pdump.h @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Intel Corporation + */ + +#ifndef _RTE_PDUMP_H_ +#define _RTE_PDUMP_H_ + +/** + * @file + * RTE pdump + * + * packet dump library to provide packet capturing support on dpdk. 
+ */ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_PDUMP_ALL_QUEUES UINT16_MAX + +enum { + RTE_PDUMP_FLAG_RX = 1, /* receive direction */ + RTE_PDUMP_FLAG_TX = 2, /* transmit direction */ + /* both receive and transmit directions */ + RTE_PDUMP_FLAG_RXTX = (RTE_PDUMP_FLAG_RX|RTE_PDUMP_FLAG_TX) +}; + +enum rte_pdump_socktype { + RTE_PDUMP_SOCKET_SERVER = 1, + RTE_PDUMP_SOCKET_CLIENT = 2 +}; + +/** + * Initialize packet capturing handling + * + * Register the IPC action for communication with target (primary) process. + * + * @param path + * This parameter is going to be deprecated; it was used for specifying the + * directory path for server socket. + * + * @return + * 0 on success, -1 on error + */ +int +rte_pdump_init(const char *path); + +/** + * Un initialize packet capturing handling + * + * Unregister the IPC action for communication with target (primary) process. + * + * @return + * 0 on success, -1 on error + */ +int +rte_pdump_uninit(void); + +/** + * Enables packet capturing on given port and queue. + * + * @param port + * port on which packet capturing should be enabled. + * @param queue + * queue of a given port on which packet capturing should be enabled. + * users should pass on value UINT16_MAX to enable packet capturing on all + * queues of a given port. + * @param flags + * flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX + * on which packet capturing should be enabled for a given port and queue. + * @param ring + * ring on which captured packets will be enqueued for user. + * @param mp + * mempool on to which original packets will be mirrored or duplicated. + * @param filter + * place holder for packet filtering. + * + * @return + * 0 on success, -1 on error, rte_errno is set accordingly. + */ + +int +rte_pdump_enable(uint16_t port, uint16_t queue, uint32_t flags, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter); + +/** + * Disables packet capturing on given port and queue. + * + * @param port + * port on which packet capturing should be disabled. + * @param queue + * queue of a given port on which packet capturing should be disabled. + * users should pass on value UINT16_MAX to disable packet capturing on all + * queues of a given port. + * @param flags + * flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX + * on which packet capturing should be enabled for a given port and queue. + * + * @return + * 0 on success, -1 on error, rte_errno is set accordingly. + */ + +int +rte_pdump_disable(uint16_t port, uint16_t queue, uint32_t flags); + +/** + * Enables packet capturing on given device id and queue. + * device_id can be name or pci address of device. + * + * @param device_id + * device id on which packet capturing should be enabled. + * @param queue + * queue of a given device id on which packet capturing should be enabled. + * users should pass on value UINT16_MAX to enable packet capturing on all + * queues of a given device id. + * @param flags + * flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX + * on which packet capturing should be enabled for a given port and queue. + * @param ring + * ring on which captured packets will be enqueued for user. + * @param mp + * mempool on to which original packets will be mirrored or duplicated. + * @param filter + * place holder for packet filtering. + * + * @return + * 0 on success, -1 on error, rte_errno is set accordingly. 
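+ *
+ * A short illustrative call (the PCI address is an assumption; ring and
+ * pool are created as in the file-level example above). Note that
+ * device_id is not const-qualified, so pass a writable buffer:
+ *
+ *   char dev[] = "0000:02:00.0";
+ *   rte_pdump_enable_by_deviceid(dev, RTE_PDUMP_ALL_QUEUES,
+ *           RTE_PDUMP_FLAG_RX, ring, pool, NULL);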
+ */ + +int +rte_pdump_enable_by_deviceid(char *device_id, uint16_t queue, + uint32_t flags, + struct rte_ring *ring, + struct rte_mempool *mp, + void *filter); + +/** + * Disables packet capturing on given device_id and queue. + * device_id can be name or pci address of device. + * + * @param device_id + * pci address or name of the device on which packet capturing + * should be disabled. + * @param queue + * queue of a given device on which packet capturing should be disabled. + * users should pass on value UINT16_MAX to disable packet capturing on all + * queues of a given device id. + * @param flags + * flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX + * on which packet capturing should be enabled for a given port and queue. + * + * @return + * 0 on success, -1 on error, rte_errno is set accordingly. + */ +int +rte_pdump_disable_by_deviceid(char *device_id, uint16_t queue, + uint32_t flags); + +/** + * @deprecated + * Allows applications to set server and client socket paths. + * If specified path is null default path will be selected, i.e. + *"/var/run/" for root user and "$HOME" for non root user. + * Clients also need to call this API to set their server path if the + * server path is different from default path. + * This API is not thread-safe. + * + * @param path + * directory path for server or client socket. + * @param type + * specifies RTE_PDUMP_SOCKET_SERVER if socket path is for server. + * (or) + * specifies RTE_PDUMP_SOCKET_CLIENT if socket path is for client. + * + * @return + * 0 on success, -EINVAL on error + * + */ +__rte_deprecated int +rte_pdump_set_socket_dir(const char *path, enum rte_pdump_socktype type); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_PDUMP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_pdump/rte_pdump_version.map b/src/spdk/dpdk/lib/librte_pdump/rte_pdump_version.map new file mode 100644 index 00000000..edec99a4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pdump/rte_pdump_version.map @@ -0,0 +1,13 @@ +DPDK_16.07 { + global: + + rte_pdump_disable; + rte_pdump_disable_by_deviceid; + rte_pdump_enable; + rte_pdump_enable_by_deviceid; + rte_pdump_init; + rte_pdump_set_socket_dir; + rte_pdump_uninit; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_pipeline/Makefile b/src/spdk/dpdk/lib/librte_pipeline/Makefile new file mode 100644 index 00000000..84afe98c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2016 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_pipeline.a + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_table +LDLIBS += -lrte_port -lrte_meter -lrte_sched + +EXPORT_MAP := rte_pipeline_version.map + +LIBABIVER := 3 + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_PIPELINE) := rte_pipeline.c +SRCS-$(CONFIG_RTE_LIBRTE_PIPELINE) += rte_port_in_action.c +SRCS-$(CONFIG_RTE_LIBRTE_PIPELINE) += rte_table_action.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_PIPELINE)-include += rte_pipeline.h rte_port_in_action.h rte_table_action.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_pipeline/meson.build b/src/spdk/dpdk/lib/librte_pipeline/meson.build new file mode 100644 index 00000000..dc16ab42 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation 
+ +version = 3 +allow_experimental_apis = true +sources = files('rte_pipeline.c', 'rte_port_in_action.c', 'rte_table_action.c') +headers = files('rte_pipeline.h', 'rte_port_in_action.h', 'rte_table_action.h') +deps += ['port', 'table', 'meter', 'sched'] diff --git a/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.c b/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.c new file mode 100644 index 00000000..0cb8b804 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.c @@ -0,0 +1,1619 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2016 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_pipeline.h" + +#define RTE_TABLE_INVALID UINT32_MAX + +#ifdef RTE_PIPELINE_STATS_COLLECT + +#define RTE_PIPELINE_STATS_AH_DROP_WRITE(p, mask) \ + ({ (p)->n_pkts_ah_drop = __builtin_popcountll(mask); }) + +#define RTE_PIPELINE_STATS_AH_DROP_READ(p, counter) \ + ({ (counter) += (p)->n_pkts_ah_drop; (p)->n_pkts_ah_drop = 0; }) + +#define RTE_PIPELINE_STATS_TABLE_DROP0(p) \ + ({ (p)->pkts_drop_mask = (p)->action_mask0[RTE_PIPELINE_ACTION_DROP]; }) + +#define RTE_PIPELINE_STATS_TABLE_DROP1(p, counter) \ +({ \ + uint64_t mask = (p)->action_mask0[RTE_PIPELINE_ACTION_DROP]; \ + mask ^= (p)->pkts_drop_mask; \ + (counter) += __builtin_popcountll(mask); \ +}) + +#else + +#define RTE_PIPELINE_STATS_AH_DROP_WRITE(p, mask) +#define RTE_PIPELINE_STATS_AH_DROP_READ(p, counter) +#define RTE_PIPELINE_STATS_TABLE_DROP0(p) +#define RTE_PIPELINE_STATS_TABLE_DROP1(p, counter) + +#endif + +struct rte_port_in { + /* Input parameters */ + struct rte_port_in_ops ops; + rte_pipeline_port_in_action_handler f_action; + void *arg_ah; + uint32_t burst_size; + + /* The table to which this port is connected */ + uint32_t table_id; + + /* Handle to low-level port */ + void *h_port; + + /* List of enabled ports */ + struct rte_port_in *next; + + /* Statistics */ + uint64_t n_pkts_dropped_by_ah; +}; + +struct rte_port_out { + /* Input parameters */ + struct rte_port_out_ops ops; + rte_pipeline_port_out_action_handler f_action; + void *arg_ah; + + /* Handle to low-level port */ + void *h_port; + + /* Statistics */ + uint64_t n_pkts_dropped_by_ah; +}; + +struct rte_table { + /* Input parameters */ + struct rte_table_ops ops; + rte_pipeline_table_action_handler_hit f_action_hit; + rte_pipeline_table_action_handler_miss f_action_miss; + void *arg_ah; + struct rte_pipeline_table_entry *default_entry; + uint32_t entry_size; + + uint32_t table_next_id; + uint32_t table_next_id_valid; + + /* Handle to the low-level table object */ + void *h_table; + + /* Statistics */ + uint64_t n_pkts_dropped_by_lkp_hit_ah; + uint64_t n_pkts_dropped_by_lkp_miss_ah; + uint64_t n_pkts_dropped_lkp_hit; + uint64_t n_pkts_dropped_lkp_miss; +}; + +#define RTE_PIPELINE_MAX_NAME_SZ 124 + +struct rte_pipeline { + /* Input parameters */ + char name[RTE_PIPELINE_MAX_NAME_SZ]; + int socket_id; + uint32_t offset_port_id; + + /* Internal tables */ + struct rte_port_in ports_in[RTE_PIPELINE_PORT_IN_MAX]; + struct rte_port_out ports_out[RTE_PIPELINE_PORT_OUT_MAX]; + struct rte_table tables[RTE_PIPELINE_TABLE_MAX]; + + /* Occupancy of internal tables */ + uint32_t num_ports_in; + uint32_t num_ports_out; + uint32_t num_tables; + + /* List of enabled ports */ + uint64_t enabled_port_in_mask; + struct rte_port_in *port_in_next; + + /* Pipeline run structures */ + struct rte_mbuf *pkts[RTE_PORT_IN_BURST_SIZE_MAX]; + struct rte_pipeline_table_entry 
*entries[RTE_PORT_IN_BURST_SIZE_MAX]; + uint64_t action_mask0[RTE_PIPELINE_ACTIONS]; + uint64_t action_mask1[RTE_PIPELINE_ACTIONS]; + uint64_t pkts_mask; + uint64_t n_pkts_ah_drop; + uint64_t pkts_drop_mask; +} __rte_cache_aligned; + +static inline uint32_t +rte_mask_get_next(uint64_t mask, uint32_t pos) +{ + uint64_t mask_rot = (mask << ((63 - pos) & 0x3F)) | + (mask >> ((pos + 1) & 0x3F)); + return (__builtin_ctzll(mask_rot) - (63 - pos)) & 0x3F; +} + +static inline uint32_t +rte_mask_get_prev(uint64_t mask, uint32_t pos) +{ + uint64_t mask_rot = (mask >> (pos & 0x3F)) | + (mask << ((64 - pos) & 0x3F)); + return ((63 - __builtin_clzll(mask_rot)) + pos) & 0x3F; +} + +static void +rte_pipeline_table_free(struct rte_table *table); + +static void +rte_pipeline_port_in_free(struct rte_port_in *port); + +static void +rte_pipeline_port_out_free(struct rte_port_out *port); + +/* + * Pipeline + * + */ +static int +rte_pipeline_check_params(struct rte_pipeline_params *params) +{ + if (params == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: Incorrect value for parameter params\n", __func__); + return -EINVAL; + } + + /* name */ + if (params->name == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: Incorrect value for parameter name\n", __func__); + return -EINVAL; + } + + /* socket */ + if ((params->socket_id < 0) || + (params->socket_id >= RTE_MAX_NUMA_NODES)) { + RTE_LOG(ERR, PIPELINE, + "%s: Incorrect value for parameter socket_id\n", + __func__); + return -EINVAL; + } + + return 0; +} + +struct rte_pipeline * +rte_pipeline_create(struct rte_pipeline_params *params) +{ + struct rte_pipeline *p; + int status; + + /* Check input parameters */ + status = rte_pipeline_check_params(params); + if (status != 0) { + RTE_LOG(ERR, PIPELINE, + "%s: Pipeline params check failed (%d)\n", + __func__, status); + return NULL; + } + + /* Allocate memory for the pipeline on requested socket */ + p = rte_zmalloc_socket("PIPELINE", sizeof(struct rte_pipeline), + RTE_CACHE_LINE_SIZE, params->socket_id); + + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: Pipeline memory allocation failed\n", __func__); + return NULL; + } + + /* Save input parameters */ + snprintf(p->name, RTE_PIPELINE_MAX_NAME_SZ, "%s", params->name); + p->socket_id = params->socket_id; + p->offset_port_id = params->offset_port_id; + + /* Initialize pipeline internal data structure */ + p->num_ports_in = 0; + p->num_ports_out = 0; + p->num_tables = 0; + p->enabled_port_in_mask = 0; + p->port_in_next = NULL; + p->pkts_mask = 0; + p->n_pkts_ah_drop = 0; + + return p; +} + +int +rte_pipeline_free(struct rte_pipeline *p) +{ + uint32_t i; + + /* Check input parameters */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: rte_pipeline parameter is NULL\n", __func__); + return -EINVAL; + } + + /* Free input ports */ + for (i = 0; i < p->num_ports_in; i++) { + struct rte_port_in *port = &p->ports_in[i]; + + rte_pipeline_port_in_free(port); + } + + /* Free tables */ + for (i = 0; i < p->num_tables; i++) { + struct rte_table *table = &p->tables[i]; + + rte_pipeline_table_free(table); + } + + /* Free output ports */ + for (i = 0; i < p->num_ports_out; i++) { + struct rte_port_out *port = &p->ports_out[i]; + + rte_pipeline_port_out_free(port); + } + + /* Free pipeline memory */ + rte_free(p); + + return 0; +} + +/* + * Table + * + */ +static int +rte_table_check_params(struct rte_pipeline *p, + struct rte_pipeline_table_params *params, + uint32_t *table_id) +{ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter is NULL\n", + __func__); + return -EINVAL; + } + 
if (params == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: params parameter is NULL\n", + __func__); + return -EINVAL; + } + if (table_id == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: table_id parameter is NULL\n", + __func__); + return -EINVAL; + } + + /* ops */ + if (params->ops == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: params->ops is NULL\n", + __func__); + return -EINVAL; + } + + if (params->ops->f_create == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: f_create function pointer is NULL\n", __func__); + return -EINVAL; + } + + if (params->ops->f_lookup == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: f_lookup function pointer is NULL\n", __func__); + return -EINVAL; + } + + /* De we have room for one more table? */ + if (p->num_tables == RTE_PIPELINE_TABLE_MAX) { + RTE_LOG(ERR, PIPELINE, + "%s: Incorrect value for num_tables parameter\n", + __func__); + return -EINVAL; + } + + return 0; +} + +int +rte_pipeline_table_create(struct rte_pipeline *p, + struct rte_pipeline_table_params *params, + uint32_t *table_id) +{ + struct rte_table *table; + struct rte_pipeline_table_entry *default_entry; + void *h_table; + uint32_t entry_size, id; + int status; + + /* Check input arguments */ + status = rte_table_check_params(p, params, table_id); + if (status != 0) + return status; + + id = p->num_tables; + table = &p->tables[id]; + + /* Allocate space for the default table entry */ + entry_size = sizeof(struct rte_pipeline_table_entry) + + params->action_data_size; + default_entry = rte_zmalloc_socket( + "PIPELINE", entry_size, RTE_CACHE_LINE_SIZE, p->socket_id); + if (default_entry == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: Failed to allocate default entry\n", __func__); + return -EINVAL; + } + + /* Create the table */ + h_table = params->ops->f_create(params->arg_create, p->socket_id, + entry_size); + if (h_table == NULL) { + rte_free(default_entry); + RTE_LOG(ERR, PIPELINE, "%s: Table creation failed\n", __func__); + return -EINVAL; + } + + /* Commit current table to the pipeline */ + p->num_tables++; + *table_id = id; + + /* Save input parameters */ + memcpy(&table->ops, params->ops, sizeof(struct rte_table_ops)); + table->f_action_hit = params->f_action_hit; + table->f_action_miss = params->f_action_miss; + table->arg_ah = params->arg_ah; + table->entry_size = entry_size; + + /* Clear the lookup miss actions (to be set later through API) */ + table->default_entry = default_entry; + table->default_entry->action = RTE_PIPELINE_ACTION_DROP; + + /* Initialize table internal data structure */ + table->h_table = h_table; + table->table_next_id = 0; + table->table_next_id_valid = 0; + + return 0; +} + +void +rte_pipeline_table_free(struct rte_table *table) +{ + if (table->ops.f_free != NULL) + table->ops.f_free(table->h_table); + + rte_free(table->default_entry); +} + +int +rte_pipeline_table_default_entry_add(struct rte_pipeline *p, + uint32_t table_id, + struct rte_pipeline_table_entry *default_entry, + struct rte_pipeline_table_entry **default_entry_ptr) +{ + struct rte_table *table; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter is NULL\n", + __func__); + return -EINVAL; + } + + if (default_entry == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: default_entry parameter is NULL\n", __func__); + return -EINVAL; + } + + if (table_id >= p->num_tables) { + RTE_LOG(ERR, PIPELINE, + "%s: table_id %d out of range\n", __func__, table_id); + return -EINVAL; + } + + table = &p->tables[table_id]; + + if ((default_entry->action == RTE_PIPELINE_ACTION_TABLE) && + table->table_next_id_valid 
&& + (default_entry->table_id != table->table_next_id)) { + RTE_LOG(ERR, PIPELINE, + "%s: Tree-like topologies not allowed\n", __func__); + return -EINVAL; + } + + /* Set the lookup miss actions */ + if ((default_entry->action == RTE_PIPELINE_ACTION_TABLE) && + (table->table_next_id_valid == 0)) { + table->table_next_id = default_entry->table_id; + table->table_next_id_valid = 1; + } + + memcpy(table->default_entry, default_entry, table->entry_size); + + *default_entry_ptr = table->default_entry; + return 0; +} + +int +rte_pipeline_table_default_entry_delete(struct rte_pipeline *p, + uint32_t table_id, + struct rte_pipeline_table_entry *entry) +{ + struct rte_table *table; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: pipeline parameter is NULL\n", __func__); + return -EINVAL; + } + + if (table_id >= p->num_tables) { + RTE_LOG(ERR, PIPELINE, + "%s: table_id %d out of range\n", __func__, table_id); + return -EINVAL; + } + + table = &p->tables[table_id]; + + /* Save the current contents of the default entry */ + if (entry) + memcpy(entry, table->default_entry, table->entry_size); + + /* Clear the lookup miss actions */ + memset(table->default_entry, 0, table->entry_size); + table->default_entry->action = RTE_PIPELINE_ACTION_DROP; + + return 0; +} + +int +rte_pipeline_table_entry_add(struct rte_pipeline *p, + uint32_t table_id, + void *key, + struct rte_pipeline_table_entry *entry, + int *key_found, + struct rte_pipeline_table_entry **entry_ptr) +{ + struct rte_table *table; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter is NULL\n", + __func__); + return -EINVAL; + } + + if (key == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: key parameter is NULL\n", __func__); + return -EINVAL; + } + + if (entry == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: entry parameter is NULL\n", + __func__); + return -EINVAL; + } + + if (table_id >= p->num_tables) { + RTE_LOG(ERR, PIPELINE, + "%s: table_id %d out of range\n", __func__, table_id); + return -EINVAL; + } + + table = &p->tables[table_id]; + + if (table->ops.f_add == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: f_add function pointer NULL\n", + __func__); + return -EINVAL; + } + + if ((entry->action == RTE_PIPELINE_ACTION_TABLE) && + table->table_next_id_valid && + (entry->table_id != table->table_next_id)) { + RTE_LOG(ERR, PIPELINE, + "%s: Tree-like topologies not allowed\n", __func__); + return -EINVAL; + } + + /* Add entry */ + if ((entry->action == RTE_PIPELINE_ACTION_TABLE) && + (table->table_next_id_valid == 0)) { + table->table_next_id = entry->table_id; + table->table_next_id_valid = 1; + } + + return (table->ops.f_add)(table->h_table, key, (void *) entry, + key_found, (void **) entry_ptr); +} + +int +rte_pipeline_table_entry_delete(struct rte_pipeline *p, + uint32_t table_id, + void *key, + int *key_found, + struct rte_pipeline_table_entry *entry) +{ + struct rte_table *table; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + if (key == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: key parameter is NULL\n", + __func__); + return -EINVAL; + } + + if (table_id >= p->num_tables) { + RTE_LOG(ERR, PIPELINE, + "%s: table_id %d out of range\n", __func__, table_id); + return -EINVAL; + } + + table = &p->tables[table_id]; + + if (table->ops.f_delete == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: f_delete function pointer NULL\n", __func__); + return -EINVAL; + } + + return 
(table->ops.f_delete)(table->h_table, key, key_found, entry); +} + +int rte_pipeline_table_entry_add_bulk(struct rte_pipeline *p, + uint32_t table_id, + void **keys, + struct rte_pipeline_table_entry **entries, + uint32_t n_keys, + int *key_found, + struct rte_pipeline_table_entry **entries_ptr) +{ + struct rte_table *table; + uint32_t i; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter is NULL\n", + __func__); + return -EINVAL; + } + + if (keys == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: keys parameter is NULL\n", __func__); + return -EINVAL; + } + + if (entries == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: entries parameter is NULL\n", + __func__); + return -EINVAL; + } + + if (table_id >= p->num_tables) { + RTE_LOG(ERR, PIPELINE, + "%s: table_id %d out of range\n", __func__, table_id); + return -EINVAL; + } + + table = &p->tables[table_id]; + + if (table->ops.f_add_bulk == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: f_add_bulk function pointer NULL\n", + __func__); + return -EINVAL; + } + + for (i = 0; i < n_keys; i++) { + if ((entries[i]->action == RTE_PIPELINE_ACTION_TABLE) && + table->table_next_id_valid && + (entries[i]->table_id != table->table_next_id)) { + RTE_LOG(ERR, PIPELINE, + "%s: Tree-like topologies not allowed\n", __func__); + return -EINVAL; + } + } + + /* Add entry */ + for (i = 0; i < n_keys; i++) { + if ((entries[i]->action == RTE_PIPELINE_ACTION_TABLE) && + (table->table_next_id_valid == 0)) { + table->table_next_id = entries[i]->table_id; + table->table_next_id_valid = 1; + } + } + + return (table->ops.f_add_bulk)(table->h_table, keys, (void **) entries, + n_keys, key_found, (void **) entries_ptr); +} + +int rte_pipeline_table_entry_delete_bulk(struct rte_pipeline *p, + uint32_t table_id, + void **keys, + uint32_t n_keys, + int *key_found, + struct rte_pipeline_table_entry **entries) +{ + struct rte_table *table; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + if (keys == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: key parameter is NULL\n", + __func__); + return -EINVAL; + } + + if (table_id >= p->num_tables) { + RTE_LOG(ERR, PIPELINE, + "%s: table_id %d out of range\n", __func__, table_id); + return -EINVAL; + } + + table = &p->tables[table_id]; + + if (table->ops.f_delete_bulk == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: f_delete function pointer NULL\n", __func__); + return -EINVAL; + } + + return (table->ops.f_delete_bulk)(table->h_table, keys, n_keys, key_found, + (void **) entries); +} + +/* + * Port + * + */ +static int +rte_pipeline_port_in_check_params(struct rte_pipeline *p, + struct rte_pipeline_port_in_params *params, + uint32_t *port_id) +{ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + if (params == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: params parameter NULL\n", __func__); + return -EINVAL; + } + if (port_id == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: port_id parameter NULL\n", + __func__); + return -EINVAL; + } + + /* ops */ + if (params->ops == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: params->ops parameter NULL\n", + __func__); + return -EINVAL; + } + + if (params->ops->f_create == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: f_create function pointer NULL\n", __func__); + return -EINVAL; + } + + if (params->ops->f_rx == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: f_rx function pointer NULL\n", + __func__); + return -EINVAL; + } + + /* burst_size */ + if ((params->burst_size 
== 0) || + (params->burst_size > RTE_PORT_IN_BURST_SIZE_MAX)) { + RTE_LOG(ERR, PIPELINE, "%s: invalid value for burst_size\n", + __func__); + return -EINVAL; + } + + /* Do we have room for one more port? */ + if (p->num_ports_in == RTE_PIPELINE_PORT_IN_MAX) { + RTE_LOG(ERR, PIPELINE, + "%s: invalid value for num_ports_in\n", __func__); + return -EINVAL; + } + + return 0; +} + +static int +rte_pipeline_port_out_check_params(struct rte_pipeline *p, + struct rte_pipeline_port_out_params *params, + uint32_t *port_id) +{ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + if (params == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: params parameter NULL\n", __func__); + return -EINVAL; + } + + if (port_id == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: port_id parameter NULL\n", + __func__); + return -EINVAL; + } + + /* ops */ + if (params->ops == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: params->ops parameter NULL\n", + __func__); + return -EINVAL; + } + + if (params->ops->f_create == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: f_create function pointer NULL\n", __func__); + return -EINVAL; + } + + if (params->ops->f_tx == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: f_tx function pointer NULL\n", __func__); + return -EINVAL; + } + + if (params->ops->f_tx_bulk == NULL) { + RTE_LOG(ERR, PIPELINE, + "%s: f_tx_bulk function pointer NULL\n", __func__); + return -EINVAL; + } + + /* Do we have room for one more port? */ + if (p->num_ports_out == RTE_PIPELINE_PORT_OUT_MAX) { + RTE_LOG(ERR, PIPELINE, + "%s: invalid value for num_ports_out\n", __func__); + return -EINVAL; + } + + return 0; +} + +int +rte_pipeline_port_in_create(struct rte_pipeline *p, + struct rte_pipeline_port_in_params *params, + uint32_t *port_id) +{ + struct rte_port_in *port; + void *h_port; + uint32_t id; + int status; + + /* Check input arguments */ + status = rte_pipeline_port_in_check_params(p, params, port_id); + if (status != 0) + return status; + + id = p->num_ports_in; + port = &p->ports_in[id]; + + /* Create the port */ + h_port = params->ops->f_create(params->arg_create, p->socket_id); + if (h_port == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: Port creation failed\n", __func__); + return -EINVAL; + } + + /* Commit current table to the pipeline */ + p->num_ports_in++; + *port_id = id; + + /* Save input parameters */ + memcpy(&port->ops, params->ops, sizeof(struct rte_port_in_ops)); + port->f_action = params->f_action; + port->arg_ah = params->arg_ah; + port->burst_size = params->burst_size; + + /* Initialize port internal data structure */ + port->table_id = RTE_TABLE_INVALID; + port->h_port = h_port; + port->next = NULL; + + return 0; +} + +void +rte_pipeline_port_in_free(struct rte_port_in *port) +{ + if (port->ops.f_free != NULL) + port->ops.f_free(port->h_port); +} + +int +rte_pipeline_port_out_create(struct rte_pipeline *p, + struct rte_pipeline_port_out_params *params, + uint32_t *port_id) +{ + struct rte_port_out *port; + void *h_port; + uint32_t id; + int status; + + /* Check input arguments */ + status = rte_pipeline_port_out_check_params(p, params, port_id); + if (status != 0) + return status; + + id = p->num_ports_out; + port = &p->ports_out[id]; + + /* Create the port */ + h_port = params->ops->f_create(params->arg_create, p->socket_id); + if (h_port == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: Port creation failed\n", __func__); + return -EINVAL; + } + + /* Commit current table to the pipeline */ + p->num_ports_out++; + *port_id = id; + + /* Save input parameters */ + 
memcpy(&port->ops, params->ops, sizeof(struct rte_port_out_ops)); + port->f_action = params->f_action; + port->arg_ah = params->arg_ah; + + /* Initialize port internal data structure */ + port->h_port = h_port; + + return 0; +} + +void +rte_pipeline_port_out_free(struct rte_port_out *port) +{ + if (port->ops.f_free != NULL) + port->ops.f_free(port->h_port); +} + +int +rte_pipeline_port_in_connect_to_table(struct rte_pipeline *p, + uint32_t port_id, + uint32_t table_id) +{ + struct rte_port_in *port; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + if (port_id >= p->num_ports_in) { + RTE_LOG(ERR, PIPELINE, + "%s: port IN ID %u is out of range\n", + __func__, port_id); + return -EINVAL; + } + + if (table_id >= p->num_tables) { + RTE_LOG(ERR, PIPELINE, + "%s: Table ID %u is out of range\n", + __func__, table_id); + return -EINVAL; + } + + port = &p->ports_in[port_id]; + port->table_id = table_id; + + return 0; +} + +int +rte_pipeline_port_in_enable(struct rte_pipeline *p, uint32_t port_id) +{ + struct rte_port_in *port, *port_prev, *port_next; + uint64_t port_mask; + uint32_t port_prev_id, port_next_id; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + if (port_id >= p->num_ports_in) { + RTE_LOG(ERR, PIPELINE, + "%s: port IN ID %u is out of range\n", + __func__, port_id); + return -EINVAL; + } + + port = &p->ports_in[port_id]; + + /* Return if current input port is already enabled */ + port_mask = 1LLU << port_id; + if (p->enabled_port_in_mask & port_mask) + return 0; + + p->enabled_port_in_mask |= port_mask; + + /* Add current input port to the pipeline chain of enabled ports */ + port_prev_id = rte_mask_get_prev(p->enabled_port_in_mask, port_id); + port_next_id = rte_mask_get_next(p->enabled_port_in_mask, port_id); + + port_prev = &p->ports_in[port_prev_id]; + port_next = &p->ports_in[port_next_id]; + + port_prev->next = port; + port->next = port_next; + + /* Check if list of enabled ports was previously empty */ + if (p->enabled_port_in_mask == port_mask) + p->port_in_next = port; + + return 0; +} + +int +rte_pipeline_port_in_disable(struct rte_pipeline *p, uint32_t port_id) +{ + struct rte_port_in *port, *port_prev, *port_next; + uint64_t port_mask; + uint32_t port_prev_id, port_next_id; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + if (port_id >= p->num_ports_in) { + RTE_LOG(ERR, PIPELINE, "%s: port IN ID %u is out of range\n", + __func__, port_id); + return -EINVAL; + } + + port = &p->ports_in[port_id]; + + /* Return if current input port is already disabled */ + port_mask = 1LLU << port_id; + if ((p->enabled_port_in_mask & port_mask) == 0) + return 0; + + p->enabled_port_in_mask &= ~port_mask; + + /* Return if no other enabled ports */ + if (p->enabled_port_in_mask == 0) { + p->port_in_next = NULL; + + return 0; + } + + /* Add current input port to the pipeline chain of enabled ports */ + port_prev_id = rte_mask_get_prev(p->enabled_port_in_mask, port_id); + port_next_id = rte_mask_get_next(p->enabled_port_in_mask, port_id); + + port_prev = &p->ports_in[port_prev_id]; + port_next = &p->ports_in[port_next_id]; + + port_prev->next = port_next; + + /* Check if the port which has just been disabled is next to serve */ + if (port == p->port_in_next) + p->port_in_next = port_next; + + return 0; +} + 
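+/*
+ * Illustrative dispatch loop for the run-time API below (a sketch only, not
+ * part of this file; p is assumed to be a handle returned by
+ * rte_pipeline_create() with ports and tables already configured): once
+ * rte_pipeline_check() succeeds, a worker lcore keeps calling
+ * rte_pipeline_run() and flushes the output ports every so often.
+ *
+ *   uint64_t iter;
+ *
+ *   if (rte_pipeline_check(p) < 0)
+ *       rte_panic("pipeline consistency check failed\n");
+ *
+ *   for (iter = 0; ; iter++) {
+ *       rte_pipeline_run(p);
+ *       if ((iter & 0x3FF) == 0)
+ *           rte_pipeline_flush(p);
+ *   }
+ */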
+/* + * Pipeline run-time + * + */ +int +rte_pipeline_check(struct rte_pipeline *p) +{ + uint32_t port_in_id; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + /* Check that pipeline has at least one input port, one table and one + output port */ + if (p->num_ports_in == 0) { + RTE_LOG(ERR, PIPELINE, "%s: must have at least 1 input port\n", + __func__); + return -EINVAL; + } + if (p->num_tables == 0) { + RTE_LOG(ERR, PIPELINE, "%s: must have at least 1 table\n", + __func__); + return -EINVAL; + } + if (p->num_ports_out == 0) { + RTE_LOG(ERR, PIPELINE, "%s: must have at least 1 output port\n", + __func__); + return -EINVAL; + } + + /* Check that all input ports are connected */ + for (port_in_id = 0; port_in_id < p->num_ports_in; port_in_id++) { + struct rte_port_in *port_in = &p->ports_in[port_in_id]; + + if (port_in->table_id == RTE_TABLE_INVALID) { + RTE_LOG(ERR, PIPELINE, + "%s: Port IN ID %u is not connected\n", + __func__, port_in_id); + return -EINVAL; + } + } + + return 0; +} + +static inline void +rte_pipeline_compute_masks(struct rte_pipeline *p, uint64_t pkts_mask) +{ + p->action_mask1[RTE_PIPELINE_ACTION_DROP] = 0; + p->action_mask1[RTE_PIPELINE_ACTION_PORT] = 0; + p->action_mask1[RTE_PIPELINE_ACTION_PORT_META] = 0; + p->action_mask1[RTE_PIPELINE_ACTION_TABLE] = 0; + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < n_pkts; i++) { + uint64_t pkt_mask = 1LLU << i; + uint32_t pos = p->entries[i]->action; + + p->action_mask1[pos] |= pkt_mask; + } + } else { + uint32_t i; + + for (i = 0; i < RTE_PORT_IN_BURST_SIZE_MAX; i++) { + uint64_t pkt_mask = 1LLU << i; + uint32_t pos; + + if ((pkt_mask & pkts_mask) == 0) + continue; + + pos = p->entries[i]->action; + p->action_mask1[pos] |= pkt_mask; + } + } +} + +static inline void +rte_pipeline_action_handler_port_bulk(struct rte_pipeline *p, + uint64_t pkts_mask, uint32_t port_id) +{ + struct rte_port_out *port_out = &p->ports_out[port_id]; + + p->pkts_mask = pkts_mask; + + /* Output port user actions */ + if (port_out->f_action != NULL) { + port_out->f_action(p, p->pkts, pkts_mask, port_out->arg_ah); + + RTE_PIPELINE_STATS_AH_DROP_READ(p, + port_out->n_pkts_dropped_by_ah); + } + + /* Output port TX */ + if (p->pkts_mask != 0) + port_out->ops.f_tx_bulk(port_out->h_port, + p->pkts, + p->pkts_mask); +} + +static inline void +rte_pipeline_action_handler_port(struct rte_pipeline *p, uint64_t pkts_mask) +{ + p->pkts_mask = pkts_mask; + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = p->pkts[i]; + uint32_t port_out_id = p->entries[i]->port_id; + struct rte_port_out *port_out = + &p->ports_out[port_out_id]; + + /* Output port user actions */ + if (port_out->f_action == NULL) /* Output port TX */ + port_out->ops.f_tx(port_out->h_port, pkt); + else { + uint64_t pkt_mask = 1LLU << i; + + port_out->f_action(p, + p->pkts, + pkt_mask, + port_out->arg_ah); + + RTE_PIPELINE_STATS_AH_DROP_READ(p, + port_out->n_pkts_dropped_by_ah); + + /* Output port TX */ + if (pkt_mask & p->pkts_mask) + port_out->ops.f_tx(port_out->h_port, + pkt); + } + } + } else { + uint32_t i; + + for (i = 0; i < RTE_PORT_IN_BURST_SIZE_MAX; i++) { + uint64_t pkt_mask = 1LLU << i; + struct rte_mbuf *pkt; + struct rte_port_out *port_out; + uint32_t port_out_id; + + if ((pkt_mask & pkts_mask) 
== 0) + continue; + + pkt = p->pkts[i]; + port_out_id = p->entries[i]->port_id; + port_out = &p->ports_out[port_out_id]; + + /* Output port user actions */ + if (port_out->f_action == NULL) /* Output port TX */ + port_out->ops.f_tx(port_out->h_port, pkt); + else { + port_out->f_action(p, + p->pkts, + pkt_mask, + port_out->arg_ah); + + RTE_PIPELINE_STATS_AH_DROP_READ(p, + port_out->n_pkts_dropped_by_ah); + + /* Output port TX */ + if (pkt_mask & p->pkts_mask) + port_out->ops.f_tx(port_out->h_port, + pkt); + } + } + } +} + +static inline void +rte_pipeline_action_handler_port_meta(struct rte_pipeline *p, + uint64_t pkts_mask) +{ + p->pkts_mask = pkts_mask; + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = p->pkts[i]; + uint32_t port_out_id = + RTE_MBUF_METADATA_UINT32(pkt, + p->offset_port_id); + struct rte_port_out *port_out = &p->ports_out[ + port_out_id]; + + /* Output port user actions */ + if (port_out->f_action == NULL) /* Output port TX */ + port_out->ops.f_tx(port_out->h_port, pkt); + else { + uint64_t pkt_mask = 1LLU << i; + + port_out->f_action(p, + p->pkts, + pkt_mask, + port_out->arg_ah); + + RTE_PIPELINE_STATS_AH_DROP_READ(p, + port_out->n_pkts_dropped_by_ah); + + /* Output port TX */ + if (pkt_mask & p->pkts_mask) + port_out->ops.f_tx(port_out->h_port, + pkt); + } + } + } else { + uint32_t i; + + for (i = 0; i < RTE_PORT_IN_BURST_SIZE_MAX; i++) { + uint64_t pkt_mask = 1LLU << i; + struct rte_mbuf *pkt; + struct rte_port_out *port_out; + uint32_t port_out_id; + + if ((pkt_mask & pkts_mask) == 0) + continue; + + pkt = p->pkts[i]; + port_out_id = RTE_MBUF_METADATA_UINT32(pkt, + p->offset_port_id); + port_out = &p->ports_out[port_out_id]; + + /* Output port user actions */ + if (port_out->f_action == NULL) /* Output port TX */ + port_out->ops.f_tx(port_out->h_port, pkt); + else { + port_out->f_action(p, + p->pkts, + pkt_mask, + port_out->arg_ah); + + RTE_PIPELINE_STATS_AH_DROP_READ(p, + port_out->n_pkts_dropped_by_ah); + + /* Output port TX */ + if (pkt_mask & p->pkts_mask) + port_out->ops.f_tx(port_out->h_port, + pkt); + } + } + } +} + +static inline void +rte_pipeline_action_handler_drop(struct rte_pipeline *p, uint64_t pkts_mask) +{ + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < n_pkts; i++) + rte_pktmbuf_free(p->pkts[i]); + } else { + uint32_t i; + + for (i = 0; i < RTE_PORT_IN_BURST_SIZE_MAX; i++) { + uint64_t pkt_mask = 1LLU << i; + + if ((pkt_mask & pkts_mask) == 0) + continue; + + rte_pktmbuf_free(p->pkts[i]); + } + } +} + +int +rte_pipeline_run(struct rte_pipeline *p) +{ + struct rte_port_in *port_in = p->port_in_next; + uint32_t n_pkts, table_id; + + if (port_in == NULL) + return 0; + + /* Input port RX */ + n_pkts = port_in->ops.f_rx(port_in->h_port, p->pkts, + port_in->burst_size); + if (n_pkts == 0) { + p->port_in_next = port_in->next; + return 0; + } + + p->pkts_mask = RTE_LEN2MASK(n_pkts, uint64_t); + p->action_mask0[RTE_PIPELINE_ACTION_DROP] = 0; + p->action_mask0[RTE_PIPELINE_ACTION_PORT] = 0; + p->action_mask0[RTE_PIPELINE_ACTION_PORT_META] = 0; + p->action_mask0[RTE_PIPELINE_ACTION_TABLE] = 0; + + /* Input port user actions */ + if (port_in->f_action != NULL) { + port_in->f_action(p, p->pkts, n_pkts, port_in->arg_ah); + + RTE_PIPELINE_STATS_AH_DROP_READ(p, + port_in->n_pkts_dropped_by_ah); + } + + /* Table */ + for (table_id = port_in->table_id; p->pkts_mask 
!= 0; ) { + struct rte_table *table; + uint64_t lookup_hit_mask, lookup_miss_mask; + + /* Lookup */ + table = &p->tables[table_id]; + table->ops.f_lookup(table->h_table, p->pkts, p->pkts_mask, + &lookup_hit_mask, (void **) p->entries); + lookup_miss_mask = p->pkts_mask & (~lookup_hit_mask); + + /* Lookup miss */ + if (lookup_miss_mask != 0) { + struct rte_pipeline_table_entry *default_entry = + table->default_entry; + + p->pkts_mask = lookup_miss_mask; + + /* Table user actions */ + if (table->f_action_miss != NULL) { + table->f_action_miss(p, + p->pkts, + lookup_miss_mask, + default_entry, + table->arg_ah); + + RTE_PIPELINE_STATS_AH_DROP_READ(p, + table->n_pkts_dropped_by_lkp_miss_ah); + } + + /* Table reserved actions */ + if ((default_entry->action == RTE_PIPELINE_ACTION_PORT) && + (p->pkts_mask != 0)) + rte_pipeline_action_handler_port_bulk(p, + p->pkts_mask, + default_entry->port_id); + else { + uint32_t pos = default_entry->action; + + RTE_PIPELINE_STATS_TABLE_DROP0(p); + + p->action_mask0[pos] |= p->pkts_mask; + + RTE_PIPELINE_STATS_TABLE_DROP1(p, + table->n_pkts_dropped_lkp_miss); + } + } + + /* Lookup hit */ + if (lookup_hit_mask != 0) { + p->pkts_mask = lookup_hit_mask; + + /* Table user actions */ + if (table->f_action_hit != NULL) { + table->f_action_hit(p, + p->pkts, + lookup_hit_mask, + p->entries, + table->arg_ah); + + RTE_PIPELINE_STATS_AH_DROP_READ(p, + table->n_pkts_dropped_by_lkp_hit_ah); + } + + /* Table reserved actions */ + RTE_PIPELINE_STATS_TABLE_DROP0(p); + rte_pipeline_compute_masks(p, p->pkts_mask); + p->action_mask0[RTE_PIPELINE_ACTION_DROP] |= + p->action_mask1[ + RTE_PIPELINE_ACTION_DROP]; + p->action_mask0[RTE_PIPELINE_ACTION_PORT] |= + p->action_mask1[ + RTE_PIPELINE_ACTION_PORT]; + p->action_mask0[RTE_PIPELINE_ACTION_PORT_META] |= + p->action_mask1[ + RTE_PIPELINE_ACTION_PORT_META]; + p->action_mask0[RTE_PIPELINE_ACTION_TABLE] |= + p->action_mask1[ + RTE_PIPELINE_ACTION_TABLE]; + + RTE_PIPELINE_STATS_TABLE_DROP1(p, + table->n_pkts_dropped_lkp_hit); + } + + /* Prepare for next iteration */ + p->pkts_mask = p->action_mask0[RTE_PIPELINE_ACTION_TABLE]; + table_id = table->table_next_id; + p->action_mask0[RTE_PIPELINE_ACTION_TABLE] = 0; + } + + /* Table reserved action PORT */ + rte_pipeline_action_handler_port(p, + p->action_mask0[RTE_PIPELINE_ACTION_PORT]); + + /* Table reserved action PORT META */ + rte_pipeline_action_handler_port_meta(p, + p->action_mask0[RTE_PIPELINE_ACTION_PORT_META]); + + /* Table reserved action DROP */ + rte_pipeline_action_handler_drop(p, + p->action_mask0[RTE_PIPELINE_ACTION_DROP]); + + /* Pick candidate for next port IN to serve */ + p->port_in_next = port_in->next; + + return (int) n_pkts; +} + +int +rte_pipeline_flush(struct rte_pipeline *p) +{ + uint32_t port_id; + + /* Check input arguments */ + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + for (port_id = 0; port_id < p->num_ports_out; port_id++) { + struct rte_port_out *port = &p->ports_out[port_id]; + + if (port->ops.f_flush != NULL) + port->ops.f_flush(port->h_port); + } + + return 0; +} + +int +rte_pipeline_port_out_packet_insert(struct rte_pipeline *p, + uint32_t port_id, struct rte_mbuf *pkt) +{ + struct rte_port_out *port_out = &p->ports_out[port_id]; + + port_out->ops.f_tx(port_out->h_port, pkt); /* Output port TX */ + + return 0; +} + +int rte_pipeline_ah_packet_hijack(struct rte_pipeline *p, + uint64_t pkts_mask) +{ + pkts_mask &= p->pkts_mask; + p->pkts_mask &= ~pkts_mask; + + return 0; +} + +int 
rte_pipeline_ah_packet_drop(struct rte_pipeline *p, + uint64_t pkts_mask) +{ + pkts_mask &= p->pkts_mask; + p->pkts_mask &= ~pkts_mask; + p->action_mask0[RTE_PIPELINE_ACTION_DROP] |= pkts_mask; + + RTE_PIPELINE_STATS_AH_DROP_WRITE(p, pkts_mask); + return 0; +} + +int rte_pipeline_port_in_stats_read(struct rte_pipeline *p, uint32_t port_id, + struct rte_pipeline_port_in_stats *stats, int clear) +{ + struct rte_port_in *port; + int retval; + + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + if (port_id >= p->num_ports_in) { + RTE_LOG(ERR, PIPELINE, + "%s: port IN ID %u is out of range\n", + __func__, port_id); + return -EINVAL; + } + + port = &p->ports_in[port_id]; + + if (port->ops.f_stats != NULL) { + retval = port->ops.f_stats(port->h_port, &stats->stats, clear); + if (retval) + return retval; + } else if (stats != NULL) + memset(&stats->stats, 0, sizeof(stats->stats)); + + if (stats != NULL) + stats->n_pkts_dropped_by_ah = port->n_pkts_dropped_by_ah; + + if (clear != 0) + port->n_pkts_dropped_by_ah = 0; + + return 0; +} + +int rte_pipeline_port_out_stats_read(struct rte_pipeline *p, uint32_t port_id, + struct rte_pipeline_port_out_stats *stats, int clear) +{ + struct rte_port_out *port; + int retval; + + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", __func__); + return -EINVAL; + } + + if (port_id >= p->num_ports_out) { + RTE_LOG(ERR, PIPELINE, + "%s: port OUT ID %u is out of range\n", __func__, port_id); + return -EINVAL; + } + + port = &p->ports_out[port_id]; + if (port->ops.f_stats != NULL) { + retval = port->ops.f_stats(port->h_port, &stats->stats, clear); + if (retval != 0) + return retval; + } else if (stats != NULL) + memset(&stats->stats, 0, sizeof(stats->stats)); + + if (stats != NULL) + stats->n_pkts_dropped_by_ah = port->n_pkts_dropped_by_ah; + + if (clear != 0) + port->n_pkts_dropped_by_ah = 0; + + return 0; +} + +int rte_pipeline_table_stats_read(struct rte_pipeline *p, uint32_t table_id, + struct rte_pipeline_table_stats *stats, int clear) +{ + struct rte_table *table; + int retval; + + if (p == NULL) { + RTE_LOG(ERR, PIPELINE, "%s: pipeline parameter NULL\n", + __func__); + return -EINVAL; + } + + if (table_id >= p->num_tables) { + RTE_LOG(ERR, PIPELINE, + "%s: table %u is out of range\n", __func__, table_id); + return -EINVAL; + } + + table = &p->tables[table_id]; + if (table->ops.f_stats != NULL) { + retval = table->ops.f_stats(table->h_table, &stats->stats, clear); + if (retval != 0) + return retval; + } else if (stats != NULL) + memset(&stats->stats, 0, sizeof(stats->stats)); + + if (stats != NULL) { + stats->n_pkts_dropped_by_lkp_hit_ah = + table->n_pkts_dropped_by_lkp_hit_ah; + stats->n_pkts_dropped_by_lkp_miss_ah = + table->n_pkts_dropped_by_lkp_miss_ah; + stats->n_pkts_dropped_lkp_hit = table->n_pkts_dropped_lkp_hit; + stats->n_pkts_dropped_lkp_miss = table->n_pkts_dropped_lkp_miss; + } + + if (clear != 0) { + table->n_pkts_dropped_by_lkp_hit_ah = 0; + table->n_pkts_dropped_by_lkp_miss_ah = 0; + table->n_pkts_dropped_lkp_hit = 0; + table->n_pkts_dropped_lkp_miss = 0; + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.h b/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.h new file mode 100644 index 00000000..3cfb6868 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline.h @@ -0,0 +1,848 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2016 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PIPELINE_H__ +#define 
__INCLUDE_RTE_PIPELINE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Pipeline + * + * This tool is part of the DPDK Packet Framework tool suite and provides + * a standard methodology (logically similar to OpenFlow) for rapid development + * of complex packet processing pipelines out of ports, tables and actions. + * + * Basic operation. A pipeline is constructed by connecting its input + * ports to its output ports through a chain of lookup tables. As result of + * lookup operation into the current table, one of the table entries (or the + * default table entry, in case of lookup miss) is identified to provide the + * actions to be executed on the current packet and the associated action + * meta-data. The behavior of user actions is defined through the configurable + * table action handler, while the reserved actions define the next hop for the + * current packet (either another table, an output port or packet drop) and are + * handled transparently by the framework. + * + * Initialization and run-time flows. Once all the pipeline elements + * (input ports, tables, output ports) have been created, input ports connected + * to tables, table action handlers configured, tables populated with the + * initial set of entries (actions and action meta-data) and input ports + * enabled, the pipeline runs automatically, pushing packets from input ports + * to tables and output ports. At each table, the identified user actions are + * being executed, resulting in action meta-data (stored in the table entry) + * and packet meta-data (stored with the packet descriptor) being updated. The + * pipeline tables can have further updates and input ports can be disabled or + * enabled later on as required. + * + * Multi-core scaling. Typically, each CPU core will run its own + * pipeline instance. Complex application-level pipelines can be implemented by + * interconnecting multiple CPU core-level pipelines in tree-like topologies, + * as the same port devices (e.g. SW rings) can serve as output ports for the + * pipeline running on CPU core A, as well as input ports for the pipeline + * running on CPU core B. This approach enables the application development + * using the pipeline (CPU cores connected serially), cluster/run-to-completion + * (CPU cores connected in parallel) or mixed (pipeline of CPU core clusters) + * programming models. + * + * Thread safety. It is possible to have multiple pipelines running on + * the same CPU core, but it is not allowed (for thread safety reasons) to have + * multiple CPU cores running the same pipeline instance. + * + ***/ + +#include + +#include +#include +#include + +struct rte_mbuf; + +/* + * Pipeline + * + */ +/** Opaque data type for pipeline */ +struct rte_pipeline; + +/** Parameters for pipeline creation */ +struct rte_pipeline_params { + /** Pipeline name */ + const char *name; + + /** CPU socket ID where memory for the pipeline and its elements (ports + and tables) should be allocated */ + int socket_id; + + /** Offset within packet meta-data to port_id to be used by action + "Send packet to output port read from packet meta-data". Has to be + 4-byte aligned. */ + uint32_t offset_port_id; +}; + +/** Pipeline port in stats. */ +struct rte_pipeline_port_in_stats { + /** Port in stats. */ + struct rte_port_in_stats stats; + + /** Number of packets dropped by action handler. */ + uint64_t n_pkts_dropped_by_ah; + +}; + +/** Pipeline port out stats. */ +struct rte_pipeline_port_out_stats { + /** Port out stats. 
*/ + struct rte_port_out_stats stats; + + /** Number of packets dropped by action handler. */ + uint64_t n_pkts_dropped_by_ah; +}; + +/** Pipeline table stats. */ +struct rte_pipeline_table_stats { + /** Table stats. */ + struct rte_table_stats stats; + + /** Number of packets dropped by lookup hit action handler. */ + uint64_t n_pkts_dropped_by_lkp_hit_ah; + + /** Number of packets dropped by lookup miss action handler. */ + uint64_t n_pkts_dropped_by_lkp_miss_ah; + + /** Number of packets dropped by pipeline in behalf of this + * table based on action specified in table entry. */ + uint64_t n_pkts_dropped_lkp_hit; + + /** Number of packets dropped by pipeline in behalf of this + * table based on action specified in table entry. */ + uint64_t n_pkts_dropped_lkp_miss; +}; + +/** + * Pipeline create + * + * @param params + * Parameters for pipeline creation + * @return + * Handle to pipeline instance on success or NULL otherwise + */ +struct rte_pipeline *rte_pipeline_create(struct rte_pipeline_params *params); + +/** + * Pipeline free + * + * @param p + * Handle to pipeline instance + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_free(struct rte_pipeline *p); + +/** + * Pipeline consistency check + * + * @param p + * Handle to pipeline instance + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_check(struct rte_pipeline *p); + +/** + * Pipeline run + * + * @param p + * Handle to pipeline instance + * @return + * Number of packets read and processed + */ +int rte_pipeline_run(struct rte_pipeline *p); + +/** + * Pipeline flush + * + * @param p + * Handle to pipeline instance + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_flush(struct rte_pipeline *p); + +/* + * Actions + * + */ +/** Reserved actions */ +enum rte_pipeline_action { + /** Drop the packet */ + RTE_PIPELINE_ACTION_DROP = 0, + + /** Send packet to output port */ + RTE_PIPELINE_ACTION_PORT, + + /** Send packet to output port read from packet meta-data */ + RTE_PIPELINE_ACTION_PORT_META, + + /** Send packet to table */ + RTE_PIPELINE_ACTION_TABLE, + + /** Number of reserved actions */ + RTE_PIPELINE_ACTIONS +}; + +/* + * Table + * + */ +/** Maximum number of tables allowed for any given pipeline instance. The + value of this parameter cannot be changed. */ +#define RTE_PIPELINE_TABLE_MAX 64 + +/** + * Head format for the table entry of any pipeline table. For any given + * pipeline table, all table entries should have the same size and format. For + * any given pipeline table, the table entry has to start with a head of this + * structure, which contains the reserved actions and their associated + * meta-data, and then optionally continues with user actions and their + * associated meta-data. As all the currently defined reserved actions are + * mutually exclusive, only one reserved action can be set per table entry. + */ +struct rte_pipeline_table_entry { + /** Reserved action */ + enum rte_pipeline_action action; + + RTE_STD_C11 + union { + /** Output port ID (meta-data for "Send packet to output port" + action) */ + uint32_t port_id; + /** Table ID (meta-data for "Send packet to table" action) */ + uint32_t table_id; + }; + /** Start of table entry area for user defined actions and meta-data */ + __extension__ uint8_t action_data[0]; +}; + +/** + * Pipeline table action handler on lookup hit + * + * The action handler can decide to drop packets by resetting the associated + * packet bit in the pkts_mask parameter. 
In this case, the action handler is + * required not to free the packet buffer, which will be freed eventually by + * the pipeline. + * + * @param p + * Handle to pipeline instance + * @param pkts + * Burst of input packets specified as array of up to 64 pointers to struct + * rte_mbuf + * @param pkts_mask + * 64-bit bitmask specifying which packets in the input burst are valid. When + * pkts_mask bit n is set, then element n of pkts array is pointing to a + * valid packet and element n of entries array is pointing to a valid table + * entry associated with the packet, with the association typically done by + * the table lookup operation. Otherwise, element n of pkts array and element + * n of entries array will not be accessed. + * @param entries + * Set of table entries specified as array of up to 64 pointers to struct + * rte_pipeline_table_entry + * @param arg + * Opaque parameter registered by the user at the pipeline table creation + * time + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_pipeline_table_action_handler_hit)( + struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_pipeline_table_entry **entries, + void *arg); + +/** + * Pipeline table action handler on lookup miss + * + * The action handler can decide to drop packets by resetting the associated + * packet bit in the pkts_mask parameter. In this case, the action handler is + * required not to free the packet buffer, which will be freed eventually by + * the pipeline. + * + * @param p + * Handle to pipeline instance + * @param pkts + * Burst of input packets specified as array of up to 64 pointers to struct + * rte_mbuf + * @param pkts_mask + * 64-bit bitmask specifying which packets in the input burst are valid. When + * pkts_mask bit n is set, then element n of pkts array is pointing to a + * valid packet. Otherwise, element n of pkts array will not be accessed. + * @param entry + * Single table entry associated with all the valid packets from the input + * burst, specified as pointer to struct rte_pipeline_table_entry. + * This entry is the pipeline table default entry that is associated by the + * table lookup operation with the input packets that have resulted in lookup + * miss. + * @param arg + * Opaque parameter registered by the user at the pipeline table creation + * time + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_pipeline_table_action_handler_miss)( + struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_pipeline_table_entry *entry, + void *arg); + +/** Parameters for pipeline table creation. Action handlers have to be either + both enabled or both disabled (they can be disabled by setting them to + NULL). 
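+
+    For reference, a minimal parameter block (a sketch; it assumes the stub
+    table type exported by librte_table as rte_table_stub_ops and a pipeline
+    handle p created earlier; a stub table holds no entries, so only the
+    default entry applies):
+
+        struct rte_pipeline_table_params tp = {
+            .ops = &rte_table_stub_ops,
+            .arg_create = NULL,
+            .f_action_hit = NULL,
+            .f_action_miss = NULL,
+            .arg_ah = NULL,
+            .action_data_size = 0,
+        };
+        uint32_t table_id;
+        rte_pipeline_table_create(p, &tp, &table_id);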
*/ +struct rte_pipeline_table_params { + /** Table operations (specific to each table type) */ + struct rte_table_ops *ops; + /** Opaque param to be passed to the table create operation when + invoked */ + void *arg_create; + /** Callback function to execute the user actions on input packets in + case of lookup hit */ + rte_pipeline_table_action_handler_hit f_action_hit; + /** Callback function to execute the user actions on input packets in + case of lookup miss */ + rte_pipeline_table_action_handler_miss f_action_miss; + + /** Opaque parameter to be passed to lookup hit and/or lookup miss + action handlers when invoked */ + void *arg_ah; + /** Memory size to be reserved per table entry for storing the user + actions and their meta-data */ + uint32_t action_data_size; +}; + +/** + * Pipeline table create + * + * @param p + * Handle to pipeline instance + * @param params + * Parameters for pipeline table creation + * @param table_id + * Table ID. Valid only within the scope of table IDs of the current + * pipeline. Only returned after a successful invocation. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_table_create(struct rte_pipeline *p, + struct rte_pipeline_table_params *params, + uint32_t *table_id); + +/** + * Pipeline table default entry add + * + * The contents of the table default entry is updated with the provided actions + * and meta-data. When the default entry is not configured (by using this + * function), the built-in default entry has the action "Drop" and meta-data + * set to all-zeros. + * + * @param p + * Handle to pipeline instance + * @param table_id + * Table ID (returned by previous invocation of pipeline table create) + * @param default_entry + * New contents for the table default entry + * @param default_entry_ptr + * On successful invocation, pointer to the default table entry which can be + * used for further read-write accesses to this table entry. This pointer + * is valid until the default entry is deleted or re-added. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_table_default_entry_add(struct rte_pipeline *p, + uint32_t table_id, + struct rte_pipeline_table_entry *default_entry, + struct rte_pipeline_table_entry **default_entry_ptr); + +/** + * Pipeline table default entry delete + * + * The new contents of the table default entry is set to reserved action "Drop + * the packet" with meta-data cleared (i.e. set to all-zeros). 
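+ *
+ * After the delete, lookup misses drop packets again; a new default entry
+ * can be installed with rte_pipeline_table_default_entry_add(), e.g. (an
+ * illustrative sketch, with port_out_id assumed to be an output port
+ * created earlier):
+ *
+ *   struct rte_pipeline_table_entry def = {
+ *       .action = RTE_PIPELINE_ACTION_PORT,
+ *       .port_id = port_out_id,
+ *   };
+ *   struct rte_pipeline_table_entry *def_ptr;
+ *   rte_pipeline_table_default_entry_add(p, table_id, &def, &def_ptr);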
+ * + * @param p + * Handle to pipeline instance + * @param table_id + * Table ID (returned by previous invocation of pipeline table create) + * @param entry + * On successful invocation, when entry points to a valid buffer, the + * previous contents of the table default entry (as it was just before the + * delete operation) is copied to this buffer + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_table_default_entry_delete(struct rte_pipeline *p, + uint32_t table_id, + struct rte_pipeline_table_entry *entry); + +/** + * Pipeline table entry add + * + * @param p + * Handle to pipeline instance + * @param table_id + * Table ID (returned by previous invocation of pipeline table create) + * @param key + * Table entry key + * @param entry + * New contents for the table entry identified by key + * @param key_found + * On successful invocation, set to TRUE (value different than 0) if key was + * already present in the table before the add operation and to FALSE (value + * 0) if not + * @param entry_ptr + * On successful invocation, pointer to the table entry associated with key. + * This can be used for further read-write accesses to this table entry and + * is valid until the key is deleted from the table or re-added (usually for + * associating different actions and/or action meta-data to the current key) + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_table_entry_add(struct rte_pipeline *p, + uint32_t table_id, + void *key, + struct rte_pipeline_table_entry *entry, + int *key_found, + struct rte_pipeline_table_entry **entry_ptr); + +/** + * Pipeline table entry delete + * + * @param p + * Handle to pipeline instance + * @param table_id + * Table ID (returned by previous invocation of pipeline table create) + * @param key + * Table entry key + * @param key_found + * On successful invocation, set to TRUE (value different than 0) if key was + * found in the table before the delete operation and to FALSE (value 0) if + * not + * @param entry + * On successful invocation, when key is found in the table and entry points + * to a valid buffer, the table entry contents (as it was before the delete + * was performed) is copied to this buffer + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_table_entry_delete(struct rte_pipeline *p, + uint32_t table_id, + void *key, + int *key_found, + struct rte_pipeline_table_entry *entry); + +/** + * Pipeline table entry add bulk + * + * @param p + * Handle to pipeline instance + * @param table_id + * Table ID (returned by previous invocation of pipeline table create) + * @param keys + * Array containing table entry keys + * @param entries + * Array containing new contents for every table entry identified by key + * @param n_keys + * Number of keys to add + * @param key_found + * On successful invocation, key_found for every item in the array is set to + * TRUE (value different than 0) if key was already present in the table + * before the add operation and to FALSE (value 0) if not + * @param entries_ptr + * On successful invocation, array *entries_ptr stores pointer to every table + * entry associated with key. 
This can be used for further read-write accesses + * to this table entry and is valid until the key is deleted from the table or + * re-added (usually for associating different actions and/or action meta-data + * to the current key) + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_table_entry_add_bulk(struct rte_pipeline *p, + uint32_t table_id, + void **keys, + struct rte_pipeline_table_entry **entries, + uint32_t n_keys, + int *key_found, + struct rte_pipeline_table_entry **entries_ptr); + +/** + * Pipeline table entry delete bulk + * + * @param p + * Handle to pipeline instance + * @param table_id + * Table ID (returned by previous invocation of pipeline table create) + * @param keys + * Array containing table entry keys + * @param n_keys + * Number of keys to delete + * @param key_found + * On successful invocation, key_found for every item in the array is set to + * TRUE (value different from 0) if the key was found in the table before the + * delete operation and to FALSE (value 0) if not + * @param entries + * If the entries pointer is NULL, it is ignored for every entry found. + * Otherwise, after a successful invocation, if a specific key is found in + * the table and the corresponding element of the entries array points to a + * valid buffer, the table entry contents (as they were before the delete was + * performed) are copied to this buffer. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_table_entry_delete_bulk(struct rte_pipeline *p, + uint32_t table_id, + void **keys, + uint32_t n_keys, + int *key_found, + struct rte_pipeline_table_entry **entries); + +/** + * Read pipeline table stats. + * + * This function reads the table statistics identified by *table_id* of the + * given pipeline *p*. + * + * @param p + * Handle to pipeline instance. + * @param table_id + * Table ID for which the statistics will be returned. + * @param stats + * Statistics buffer. + * @param clear + * If not 0, the statistics are cleared after reading. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_table_stats_read(struct rte_pipeline *p, uint32_t table_id, + struct rte_pipeline_table_stats *stats, int clear); + +/* + * Port IN + * + */ +/** Maximum number of input ports allowed for any given pipeline instance. The + value of this parameter cannot be changed. */ +#define RTE_PIPELINE_PORT_IN_MAX 64 + +/** + * Pipeline input port action handler + * + * The action handler can decide to drop packets from the input burst by + * calling rte_pipeline_ah_packet_drop(). In this case, the action handler is + * required not to free the packet buffer, which will be freed eventually by + * the pipeline. + * + * @param p + * Handle to pipeline instance + * @param pkts + * Burst of input packets specified as array of up to 64 pointers to struct + * rte_mbuf + * @param n + * Number of packets in the input burst. This parameter specifies that + * elements 0 to (n-1) of the pkts array are valid. + * @param arg + * Opaque parameter registered by the user at the pipeline input port + * creation time + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_pipeline_port_in_action_handler)( + struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint32_t n, + void *arg); + +/** Parameters for pipeline input port creation */ +struct rte_pipeline_port_in_params { + /** Input port operations (specific to each port type) */ + struct rte_port_in_ops *ops; + /** Opaque parameter to be passed to the create operation when invoked */ + void *arg_create; + + /** Callback function to execute the user actions on input packets.
+ Disabled if set to NULL. */ + rte_pipeline_port_in_action_handler f_action; + /** Opaque parameter to be passed to the action handler when invoked */ + void *arg_ah; + + /** Recommended burst size for the RX operation (in number of packets) */ + uint32_t burst_size; +}; + +/** + * Pipeline input port create + * + * @param p + * Handle to pipeline instance + * @param params + * Parameters for pipeline input port creation + * @param port_id + * Input port ID. Valid only within the scope of input port IDs of the + * current pipeline. Only returned after a successful invocation. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_port_in_create(struct rte_pipeline *p, + struct rte_pipeline_port_in_params *params, + uint32_t *port_id); + +/** + * Pipeline input port connect to table + * + * @param p + * Handle to pipeline instance + * @param port_id + * Port ID (returned by previous invocation of pipeline input port create) + * @param table_id + * Table ID (returned by previous invocation of pipeline table create) + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_port_in_connect_to_table(struct rte_pipeline *p, + uint32_t port_id, + uint32_t table_id); + +/** + * Pipeline input port enable + * + * @param p + * Handle to pipeline instance + * @param port_id + * Port ID (returned by previous invocation of pipeline input port create) + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_port_in_enable(struct rte_pipeline *p, + uint32_t port_id); + +/** + * Pipeline input port disable + * + * @param p + * Handle to pipeline instance + * @param port_id + * Port ID (returned by previous invocation of pipeline input port create) + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_port_in_disable(struct rte_pipeline *p, + uint32_t port_id); + +/** + * Read pipeline port in stats. + * + * This function reads the input port statistics identified by *port_id* of + * the given pipeline *p*. + * + * @param p + * Handle to pipeline instance. + * @param port_id + * Port ID for which the statistics will be returned. + * @param stats + * Statistics buffer. + * @param clear + * If not 0, the statistics are cleared after reading. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_port_in_stats_read(struct rte_pipeline *p, uint32_t port_id, + struct rte_pipeline_port_in_stats *stats, int clear); + +/* + * Port OUT + * + */ +/** Maximum number of output ports allowed for any given pipeline instance. The + value of this parameter cannot be changed. */ +#define RTE_PIPELINE_PORT_OUT_MAX 64 + +/** + * Pipeline output port action handler + * + * The action handler can decide to drop packets by resetting the associated + * packet bit in the pkts_mask parameter. In this case, the action handler is + * required not to free the packet buffer, which will be freed eventually by + * the pipeline. + * + * @param p + * Handle to pipeline instance + * @param pkts + * Burst of input packets specified as array of up to 64 pointers to struct + * rte_mbuf + * @param pkts_mask + * 64-bit bitmask specifying which packets in the input burst are valid. When + * pkts_mask bit n is set, then element n of pkts array is pointing to a + * valid packet. Otherwise, element n of pkts array will not be accessed.
+ * @param arg + * Opaque parameter registered by the user at the pipeline output port + * creation time + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_pipeline_port_out_action_handler)( + struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + void *arg); + +/** Parameters for pipeline output port creation. The action handler can be +disabled by setting it to NULL. */ +struct rte_pipeline_port_out_params { + /** Output port operations (specific to each port type) */ + struct rte_port_out_ops *ops; + /** Opaque parameter to be passed to the create operation when invoked */ + void *arg_create; + + /** Callback function to execute the user actions on the burst of input + packets */ + rte_pipeline_port_out_action_handler f_action; + /** Opaque parameter to be passed to the action handler when invoked */ + void *arg_ah; +}; + +/** + * Pipeline output port create + * + * @param p + * Handle to pipeline instance + * @param params + * Parameters for pipeline output port creation + * @param port_id + * Output port ID. Valid only within the scope of output port IDs of the + * current pipeline. Only returned after a successful invocation. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_port_out_create(struct rte_pipeline *p, + struct rte_pipeline_port_out_params *params, + uint32_t *port_id); + +/** + * Read pipeline port out stats. + * + * This function reads the output port statistics identified by *port_id* of + * the given pipeline *p*. + * + * @param p + * Handle to pipeline instance. + * @param port_id + * Port ID for which the statistics will be returned. + * @param stats + * Statistics buffer. + * @param clear + * If not 0, the statistics are cleared after reading. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_port_out_stats_read(struct rte_pipeline *p, uint32_t port_id, + struct rte_pipeline_port_out_stats *stats, int clear); + +/* + * Functions to be called as part of the port IN/OUT or table action handlers + * + */ +/** + * Action handler packet insert to output port + * + * This function can be called by any input/output port or table action handler + * to send a packet out through one of the pipeline output ports. This packet is + * generated by the action handler, i.e. this packet is not part of the burst of + * packets read from one of the pipeline input ports and currently processed by + * the pipeline (this packet is not an element of the pkts array input parameter + * of the action handler). + * + * @param p + * Handle to pipeline instance + * @param port_id + * Output port ID (returned by previous invocation of pipeline output port + * create) to send the packet specified by pkt + * @param pkt + * New packet generated by the action handler + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_port_out_packet_insert(struct rte_pipeline *p, + uint32_t port_id, + struct rte_mbuf *pkt); + +#define rte_pipeline_ah_port_out_packet_insert \ + rte_pipeline_port_out_packet_insert + +/** + * Action handler packet hijack + * + * This function can be called by any input/output port or table action handler + * to hijack selected packets from the burst of packets read from one of the + * pipeline input ports and currently processed by the pipeline.
The hijacked + * packets are removed from any further pipeline processing, with the action + * handler now having the full ownership for these packets. + * + * The action handler can further send the hijacked packets out through any + * pipeline output port by calling the rte_pipeline_ah_port_out_packet_insert() + * function. The action handler can also drop these packets by calling the + * rte_pktmbuf_free() function, although a better alternative is provided by + * the action handler using the rte_pipeline_ah_packet_drop() function. + * + * @param p + * Handle to pipeline instance + * @param pkts_mask + * 64-bit bitmask specifying which of the packets handed over for processing + * to the action handler is to be hijacked by the action handler. When + * pkts_mask bit n is set, then element n of the pkts array (input argument to + * the action handler) is hijacked. + * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_ah_packet_hijack(struct rte_pipeline *p, + uint64_t pkts_mask); + +/** + * Action handler packet drop + * + * This function is called by the pipeline action handlers (port in/out, table) + * to drop the packets selected using packet mask. + * + * This function can be called by any input/output port or table action handler + * to drop selected packets from the burst of packets read from one of the + * pipeline input ports and currently processed by the pipeline. The dropped + * packets are removed from any further pipeline processing and the packet + * buffers are eventually freed to their buffer pool. + * + * This function updates the drop statistics counters correctly, therefore the + * recommended approach for dropping packets by the action handlers is to call + * this function as opposed to the action handler hijacking the packets first + * and then dropping them invisibly to the pipeline (by using the + * rte_pktmbuf_free() function). + * + * @param p + * Handle to pipeline instance + * @param pkts_mask + * 64-bit bitmask specifying which of the packets handed over for processing + * to the action handler is to be dropped by the action handler. When + * pkts_mask bit n is set, then element n of the pkts array (input argument to + * the action handler) is dropped. 
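+ *
+ * As an illustration (a hedged sketch, not part of the original API; the
+ * handler name is made up), a table miss action handler that drops every
+ * packet handed to it, while keeping the drop counters consistent, could be:
+ *
+ * @code
+ * static int
+ * drop_all_miss_ah(struct rte_pipeline *p, struct rte_mbuf **pkts,
+ *	uint64_t pkts_mask, struct rte_pipeline_table_entry *entry, void *arg)
+ * {
+ *	(void) pkts;
+ *	(void) entry;
+ *	(void) arg;
+ *
+ *	return rte_pipeline_ah_packet_drop(p, pkts_mask);
+ * }
+ * @endcode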
+ * @return + * 0 on success, error code otherwise + */ +int rte_pipeline_ah_packet_drop(struct rte_pipeline *p, + uint64_t pkts_mask); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline_version.map b/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline_version.map new file mode 100644 index 00000000..d820b22f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/rte_pipeline_version.map @@ -0,0 +1,75 @@ +DPDK_2.0 { + global: + + rte_pipeline_check; + rte_pipeline_create; + rte_pipeline_flush; + rte_pipeline_free; + rte_pipeline_port_in_connect_to_table; + rte_pipeline_port_in_create; + rte_pipeline_port_in_disable; + rte_pipeline_port_in_enable; + rte_pipeline_port_out_create; + rte_pipeline_port_out_packet_insert; + rte_pipeline_run; + rte_pipeline_table_create; + rte_pipeline_table_default_entry_add; + rte_pipeline_table_default_entry_delete; + rte_pipeline_table_entry_add; + rte_pipeline_table_entry_delete; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_pipeline_port_in_stats_read; + rte_pipeline_port_out_stats_read; + rte_pipeline_table_stats_read; + +} DPDK_2.0; + +DPDK_2.2 { + global: + + rte_pipeline_table_entry_add_bulk; + rte_pipeline_table_entry_delete_bulk; + +} DPDK_2.1; + +DPDK_16.04 { + global: + + rte_pipeline_ah_packet_hijack; + rte_pipeline_ah_packet_drop; + +} DPDK_2.2; + +EXPERIMENTAL { + global: + + rte_port_in_action_apply; + rte_port_in_action_create; + rte_port_in_action_free; + rte_port_in_action_params_get; + rte_port_in_action_profile_action_register; + rte_port_in_action_profile_create; + rte_port_in_action_profile_free; + rte_port_in_action_profile_freeze; + rte_table_action_apply; + rte_table_action_create; + rte_table_action_dscp_table_update; + rte_table_action_free; + rte_table_action_meter_profile_add; + rte_table_action_meter_profile_delete; + rte_table_action_meter_read; + rte_table_action_profile_action_register; + rte_table_action_profile_create; + rte_table_action_profile_free; + rte_table_action_profile_freeze; + rte_table_action_table_params_get; + rte_table_action_stats_read; + rte_table_action_time_read; + rte_table_action_ttl_read; +}; diff --git a/src/spdk/dpdk/lib/librte_pipeline/rte_port_in_action.c b/src/spdk/dpdk/lib/librte_pipeline/rte_port_in_action.c new file mode 100644 index 00000000..e3b00df8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/rte_port_in_action.c @@ -0,0 +1,531 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include + +#include "rte_port_in_action.h" + +/** + * RTE_PORT_IN_ACTION_FLTR + */ +static int +fltr_cfg_check(struct rte_port_in_action_fltr_config *cfg) +{ + if (cfg == NULL) + return -1; + + return 0; +} + +struct fltr_data { + uint32_t port_id; +}; + +static void +fltr_init(struct fltr_data *data, + struct rte_port_in_action_fltr_config *cfg) +{ + data->port_id = cfg->port_id; +} + +static int +fltr_apply(struct fltr_data *data, + struct rte_port_in_action_fltr_params *p) +{ + /* Check input arguments */ + if (p == NULL) + return -1; + + data->port_id = p->port_id; + + return 0; +} + +/** + * RTE_PORT_IN_ACTION_LB + */ +static int +lb_cfg_check(struct rte_port_in_action_lb_config *cfg) +{ + if ((cfg == NULL) || + (cfg->key_size < RTE_PORT_IN_ACTION_LB_KEY_SIZE_MIN) || + (cfg->key_size > RTE_PORT_IN_ACTION_LB_KEY_SIZE_MAX) || + (!rte_is_power_of_2(cfg->key_size)) || + (cfg->f_hash == NULL)) + return -1; + + return 0; +} + +struct lb_data { + uint32_t 
port_id[RTE_PORT_IN_ACTION_LB_TABLE_SIZE]; +}; + +static void +lb_init(struct lb_data *data, + struct rte_port_in_action_lb_config *cfg) +{ + memcpy(data->port_id, cfg->port_id, sizeof(cfg->port_id)); +} + +static int +lb_apply(struct lb_data *data, + struct rte_port_in_action_lb_params *p) +{ + /* Check input arguments */ + if (p == NULL) + return -1; + + memcpy(data->port_id, p->port_id, sizeof(p->port_id)); + + return 0; +} + +/** + * Action profile + */ +static int +action_valid(enum rte_port_in_action_type action) +{ + switch (action) { + case RTE_PORT_IN_ACTION_FLTR: + case RTE_PORT_IN_ACTION_LB: + return 1; + default: + return 0; + } +} + +#define RTE_PORT_IN_ACTION_MAX 64 + +struct ap_config { + uint64_t action_mask; + struct rte_port_in_action_fltr_config fltr; + struct rte_port_in_action_lb_config lb; +}; + +static size_t +action_cfg_size(enum rte_port_in_action_type action) +{ + switch (action) { + case RTE_PORT_IN_ACTION_FLTR: + return sizeof(struct rte_port_in_action_fltr_config); + case RTE_PORT_IN_ACTION_LB: + return sizeof(struct rte_port_in_action_lb_config); + default: + return 0; + } +} + +static void* +action_cfg_get(struct ap_config *ap_config, + enum rte_port_in_action_type type) +{ + switch (type) { + case RTE_PORT_IN_ACTION_FLTR: + return &ap_config->fltr; + + case RTE_PORT_IN_ACTION_LB: + return &ap_config->lb; + + default: + return NULL; + } +} + +static void +action_cfg_set(struct ap_config *ap_config, + enum rte_port_in_action_type type, + void *action_cfg) +{ + void *dst = action_cfg_get(ap_config, type); + + if (dst) + memcpy(dst, action_cfg, action_cfg_size(type)); + + ap_config->action_mask |= 1LLU << type; +} + +struct ap_data { + size_t offset[RTE_PORT_IN_ACTION_MAX]; + size_t total_size; +}; + +static size_t +action_data_size(enum rte_port_in_action_type action, + struct ap_config *ap_config __rte_unused) +{ + switch (action) { + case RTE_PORT_IN_ACTION_FLTR: + return sizeof(struct fltr_data); + + case RTE_PORT_IN_ACTION_LB: + return sizeof(struct lb_data); + + default: + return 0; + } +} + +static void +action_data_offset_set(struct ap_data *ap_data, + struct ap_config *ap_config) +{ + uint64_t action_mask = ap_config->action_mask; + size_t offset; + uint32_t action; + + memset(ap_data->offset, 0, sizeof(ap_data->offset)); + + offset = 0; + for (action = 0; action < RTE_PORT_IN_ACTION_MAX; action++) + if (action_mask & (1LLU << action)) { + ap_data->offset[action] = offset; + offset += action_data_size((enum rte_port_in_action_type)action, + ap_config); + } + + ap_data->total_size = offset; +} + +struct rte_port_in_action_profile { + struct ap_config cfg; + struct ap_data data; + int frozen; +}; + +struct rte_port_in_action_profile * +rte_port_in_action_profile_create(uint32_t socket_id) +{ + struct rte_port_in_action_profile *ap; + + /* Memory allocation */ + ap = rte_zmalloc_socket(NULL, + sizeof(struct rte_port_in_action_profile), + RTE_CACHE_LINE_SIZE, + socket_id); + if (ap == NULL) + return NULL; + + return ap; +} + +int +rte_port_in_action_profile_action_register(struct rte_port_in_action_profile *profile, + enum rte_port_in_action_type type, + void *action_config) +{ + int status; + + /* Check input arguments */ + if ((profile == NULL) || + profile->frozen || + (action_valid(type) == 0) || + (profile->cfg.action_mask & (1LLU << type)) || + ((action_cfg_size(type) == 0) && action_config) || + (action_cfg_size(type) && (action_config == NULL))) + return -EINVAL; + + switch (type) { + case RTE_PORT_IN_ACTION_FLTR: + status = 
fltr_cfg_check(action_config); + break; + + case RTE_PORT_IN_ACTION_LB: + status = lb_cfg_check(action_config); + break; + + default: + status = 0; + break; + } + + if (status) + return status; + + /* Action enable */ + action_cfg_set(&profile->cfg, type, action_config); + + return 0; +} + +int +rte_port_in_action_profile_freeze(struct rte_port_in_action_profile *profile) +{ + if (profile->frozen) + return -EBUSY; + + action_data_offset_set(&profile->data, &profile->cfg); + profile->frozen = 1; + + return 0; +} + +int +rte_port_in_action_profile_free(struct rte_port_in_action_profile *profile) +{ + if (profile == NULL) + return 0; + + free(profile); + return 0; +} + +/** + * Action + */ +struct rte_port_in_action { + struct ap_config cfg; + struct ap_data data; + uint8_t memory[0] __rte_cache_aligned; +}; + +static __rte_always_inline void * +action_data_get(struct rte_port_in_action *action, + enum rte_port_in_action_type type) +{ + size_t offset = action->data.offset[type]; + + return &action->memory[offset]; +} + +static void +action_data_init(struct rte_port_in_action *action, + enum rte_port_in_action_type type) +{ + void *data = action_data_get(action, type); + + switch (type) { + case RTE_PORT_IN_ACTION_FLTR: + fltr_init(data, &action->cfg.fltr); + return; + + case RTE_PORT_IN_ACTION_LB: + lb_init(data, &action->cfg.lb); + return; + + default: + return; + } +} + +struct rte_port_in_action * +rte_port_in_action_create(struct rte_port_in_action_profile *profile, + uint32_t socket_id) +{ + struct rte_port_in_action *action; + size_t size; + uint32_t i; + + /* Check input arguments */ + if ((profile == NULL) || + (profile->frozen == 0)) + return NULL; + + /* Memory allocation */ + size = sizeof(struct rte_port_in_action) + profile->data.total_size; + size = RTE_CACHE_LINE_ROUNDUP(size); + + action = rte_zmalloc_socket(NULL, + size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (action == NULL) + return NULL; + + /* Initialization */ + memcpy(&action->cfg, &profile->cfg, sizeof(profile->cfg)); + memcpy(&action->data, &profile->data, sizeof(profile->data)); + + for (i = 0; i < RTE_PORT_IN_ACTION_MAX; i++) + if (action->cfg.action_mask & (1LLU << i)) + action_data_init(action, + (enum rte_port_in_action_type)i); + + return action; +} + +int +rte_port_in_action_apply(struct rte_port_in_action *action, + enum rte_port_in_action_type type, + void *action_params) +{ + void *action_data; + + /* Check input arguments */ + if ((action == NULL) || + (action_valid(type) == 0) || + ((action->cfg.action_mask & (1LLU << type)) == 0) || + (action_params == NULL)) + return -EINVAL; + + /* Data update */ + action_data = action_data_get(action, type); + + switch (type) { + case RTE_PORT_IN_ACTION_FLTR: + return fltr_apply(action_data, + action_params); + + case RTE_PORT_IN_ACTION_LB: + return lb_apply(action_data, + action_params); + + default: + return -EINVAL; + } +} + +static int +ah_filter_on_match(struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint32_t n_pkts, + void *arg) +{ + struct rte_port_in_action *action = arg; + struct rte_port_in_action_fltr_config *cfg = &action->cfg.fltr; + uint64_t *key_mask = (uint64_t *) cfg->key_mask; + uint64_t *key = (uint64_t *) cfg->key; + uint32_t key_offset = cfg->key_offset; + struct fltr_data *data = action_data_get(action, + RTE_PORT_IN_ACTION_FLTR); + uint32_t i; + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = pkts[i]; + uint64_t *pkt_key = RTE_MBUF_METADATA_UINT64_PTR(pkt, + key_offset); + + uint64_t xor0 = (pkt_key[0] & key_mask[0]) ^ key[0]; + 
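+		/* 16-byte key compare: the packet is a filter hit only when both
+		 * masked 64-bit halves of its key equal the configured key, i.e.
+		 * when (xor0 | xor1) == 0; matching packets are hijacked and sent
+		 * to the configured output port below.
+		 */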
uint64_t xor1 = (pkt_key[1] & key_mask[1]) ^ key[1]; + uint64_t or = xor0 | xor1; + + if (or == 0) { + rte_pipeline_ah_packet_hijack(p, 1LLU << i); + rte_pipeline_port_out_packet_insert(p, + data->port_id, pkt); + } + } + + return 0; +} + +static int +ah_filter_on_mismatch(struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint32_t n_pkts, + void *arg) +{ + struct rte_port_in_action *action = arg; + struct rte_port_in_action_fltr_config *cfg = &action->cfg.fltr; + uint64_t *key_mask = (uint64_t *) cfg->key_mask; + uint64_t *key = (uint64_t *) cfg->key; + uint32_t key_offset = cfg->key_offset; + struct fltr_data *data = action_data_get(action, + RTE_PORT_IN_ACTION_FLTR); + uint32_t i; + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = pkts[i]; + uint64_t *pkt_key = RTE_MBUF_METADATA_UINT64_PTR(pkt, + key_offset); + + uint64_t xor0 = (pkt_key[0] & key_mask[0]) ^ key[0]; + uint64_t xor1 = (pkt_key[1] & key_mask[1]) ^ key[1]; + uint64_t or = xor0 | xor1; + + if (or) { + rte_pipeline_ah_packet_hijack(p, 1LLU << i); + rte_pipeline_port_out_packet_insert(p, + data->port_id, pkt); + } + } + + return 0; +} + +static int +ah_lb(struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint32_t n_pkts, + void *arg) +{ + struct rte_port_in_action *action = arg; + struct rte_port_in_action_lb_config *cfg = &action->cfg.lb; + struct lb_data *data = action_data_get(action, RTE_PORT_IN_ACTION_LB); + uint64_t pkt_mask = RTE_LEN2MASK(n_pkts, uint64_t); + uint32_t i; + + rte_pipeline_ah_packet_hijack(p, pkt_mask); + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = pkts[i]; + uint8_t *pkt_key = RTE_MBUF_METADATA_UINT8_PTR(pkt, + cfg->key_offset); + + uint64_t digest = cfg->f_hash(pkt_key, + cfg->key_mask, + cfg->key_size, + cfg->seed); + uint64_t pos = digest & (RTE_PORT_IN_ACTION_LB_TABLE_SIZE - 1); + uint32_t port_id = data->port_id[pos]; + + rte_pipeline_port_out_packet_insert(p, port_id, pkt); + } + + return 0; +} + +static rte_pipeline_port_in_action_handler +ah_selector(struct rte_port_in_action *action) +{ + if (action->cfg.action_mask == 0) + return NULL; + + if (action->cfg.action_mask == 1LLU << RTE_PORT_IN_ACTION_FLTR) + return (action->cfg.fltr.filter_on_match) ? + ah_filter_on_match : ah_filter_on_mismatch; + + if (action->cfg.action_mask == 1LLU << RTE_PORT_IN_ACTION_LB) + return ah_lb; + + return NULL; +} + +int +rte_port_in_action_params_get(struct rte_port_in_action *action, + struct rte_pipeline_port_in_params *params) +{ + rte_pipeline_port_in_action_handler f_action; + + /* Check input arguments */ + if ((action == NULL) || + (params == NULL)) + return -EINVAL; + + f_action = ah_selector(action); + + /* Fill in params */ + params->f_action = f_action; + params->arg_ah = (f_action) ? action : NULL; + + return 0; +} + +int +rte_port_in_action_free(struct rte_port_in_action *action) +{ + if (action == NULL) + return 0; + + rte_free(action); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_pipeline/rte_port_in_action.h b/src/spdk/dpdk/lib/librte_pipeline/rte_port_in_action.h new file mode 100644 index 00000000..0a85e4e0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/rte_port_in_action.h @@ -0,0 +1,301 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_IN_ACTION_H__ +#define __INCLUDE_RTE_PORT_IN_ACTION_H__ + +/** + * @file + * RTE Pipeline Input Port Actions + * + * This API provides a common set of actions for pipeline input ports to speed + * up application development. 
+ * + * Each pipeline input port can be assigned an action handler to be executed + * on every input packet during the pipeline execution. The pipeline library + * allows the user to define his own input port actions by providing customized + * input port action handler. While the user can still follow this process, this + * API is intended to provide a quicker development alternative for a set of + * predefined actions. + * + * The typical steps to use this API are: + * - Define an input port action profile. This is a configuration template that + * can potentially be shared by multiple input ports from the same or + * different pipelines, with different input ports from the same pipeline + * able to use different action profiles. For every input port using a given + * action profile, the profile defines the set of actions and the action + * configuration to be executed by the input port. API functions: + * rte_port_in_action_profile_create(), + * rte_port_in_action_profile_action_register(), + * rte_port_in_action_profile_freeze(). + * + * - Instantiate the input port action profile to create input port action + * objects. Each pipeline input port has its own action object. + * API functions: rte_port_in_action_create(). + * + * - Use the input port action object to generate the input port action handler + * invoked by the pipeline. API functions: + * rte_port_in_action_params_get(). + * + * - Use the input port action object to generate the internal data structures + * used by the input port action handler based on given action parameters. + * API functions: rte_port_in_action_apply(). + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include + +#include "rte_pipeline.h" + +/** Input port actions. */ +enum rte_port_in_action_type { + /** Filter selected input packets. */ + RTE_PORT_IN_ACTION_FLTR = 0, + + /** Load balance. */ + RTE_PORT_IN_ACTION_LB, +}; + +/** + * RTE_PORT_IN_ACTION_FLTR + */ +/** Filter key size (number of bytes) */ +#define RTE_PORT_IN_ACTION_FLTR_KEY_SIZE 16 + +/** Filter action configuration (per action profile). */ +struct rte_port_in_action_fltr_config { + /** Key offset within the input packet buffer. Offset 0 points to the + * first byte of the MBUF structure. + */ + uint32_t key_offset; + + /** Key mask. */ + uint8_t key_mask[RTE_PORT_IN_ACTION_FLTR_KEY_SIZE]; + + /** Key value. */ + uint8_t key[RTE_PORT_IN_ACTION_FLTR_KEY_SIZE]; + + /** When non-zero, all the input packets that match the *key* (with the + * *key_mask* applied) are sent to the pipeline output port *port_id*. + * When zero, all the input packets that do NOT match the *key* (with + * *key_mask* applied) are sent to the pipeline output port *port_id*. + */ + int filter_on_match; + + /** Pipeline output port ID to send the filtered input packets to. + * Can be updated later. + * + * @see struct rte_port_in_action_fltr_params + */ + uint32_t port_id; +}; + +/** Filter action parameters (per action). */ +struct rte_port_in_action_fltr_params { + /** Pipeline output port ID to send the filtered input packets to. */ + uint32_t port_id; +}; + +/** + * RTE_PORT_IN_ACTION_LB + */ +/** Load balance key size min (number of bytes). */ +#define RTE_PORT_IN_ACTION_LB_KEY_SIZE_MIN 8 + +/** Load balance key size max (number of bytes). */ +#define RTE_PORT_IN_ACTION_LB_KEY_SIZE_MAX 64 + +/** Load balance table size. 
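+ * The load balance action handler indexes this table with
+ * (hash value & (RTE_PORT_IN_ACTION_LB_TABLE_SIZE - 1)), so the size is
+ * expected to remain a power of two.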
*/ +#define RTE_PORT_IN_ACTION_LB_TABLE_SIZE 16 + +/** Load balance action configuration (per action profile). */ +struct rte_port_in_action_lb_config { + /** Key size (number of bytes). */ + uint32_t key_size; + + /** Key offset within the input packet buffer. Offset 0 points to the + * first byte of the MBUF structure. + */ + uint32_t key_offset; + + /** Key mask(*key_size* bytes are valid). */ + uint8_t key_mask[RTE_PORT_IN_ACTION_LB_KEY_SIZE_MAX]; + + /** Hash function. */ + rte_table_hash_op_hash f_hash; + + /** Seed value for *f_hash*. */ + uint64_t seed; + + /** Table defining the weight of each pipeline output port. The weights + * are set in 1/RTE_PORT_IN_ACTION_LB_TABLE_SIZE increments. To assign a + * weight of N/RTE_PORT_IN_ACTION_LB_TABLE_SIZE to a given output port + * (0 <= N <= RTE_PORT_IN_ACTION_LB_TABLE_SIZE), the output port needs + * to show up exactly N times in this table. Can be updated later. + * + * @see struct rte_port_in_action_lb_params + */ + uint32_t port_id[RTE_PORT_IN_ACTION_LB_TABLE_SIZE]; +}; + +/** Load balance action parameters (per action). */ +struct rte_port_in_action_lb_params { + /** Table defining the weight of each pipeline output port. The weights + * are set in 1/RTE_PORT_IN_ACTION_LB_TABLE_SIZE increments. To assign a + * weight of N/RTE_PORT_IN_ACTION_LB_TABLE_SIZE to a given output port + * (0 <= N <= RTE_PORT_IN_ACTION_LB_TABLE_SIZE), the output port needs + * to show up exactly N times in this table. + */ + uint32_t port_id[RTE_PORT_IN_ACTION_LB_TABLE_SIZE]; +}; + +/** + * Input port action profile. + */ +struct rte_port_in_action_profile; + +/** + * Input port action profile create. + * + * @param[in] socket_id + * CPU socket ID for the internal data structures memory allocation. + * @return + * Input port action profile handle on success, NULL otherwise. + */ +struct rte_port_in_action_profile * __rte_experimental +rte_port_in_action_profile_create(uint32_t socket_id); + +/** + * Input port action profile free. + * + * @param[in] profile + * Input port action profile handle (needs to be valid). + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_port_in_action_profile_free(struct rte_port_in_action_profile *profile); + +/** + * Input port action profile action register. + * + * @param[in] profile + * Input port action profile handle (needs to be valid and not in frozen + * state). + * @param[in] type + * Specific input port action to be registered for *profile*. + * @param[in] action_config + * Configuration for the *type* action. + * If struct rte_port_in_action_*type*_config is defined, it needs to point to + * a valid instance of this structure, otherwise it needs to be set to NULL. + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_port_in_action_profile_action_register( + struct rte_port_in_action_profile *profile, + enum rte_port_in_action_type type, + void *action_config); + +/** + * Input port action profile freeze. + * + * Once this function is called successfully, the given profile enters the + * frozen state with the following immediate effects: no more actions can be + * registered for this profile, so the profile can be instantiated to create + * input port action objects. + * + * @param[in] profile + * Input port profile action handle (needs to be valid and not in frozen + * state). + * @return + * Zero on success, non-zero error code otherwise. 
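+ *
+ * A minimal usage sketch (hedged; the socket ID and filter settings are
+ * illustrative and error handling is omitted):
+ *
+ * @code
+ * struct rte_port_in_action_fltr_config fltr_cfg = {
+ *	.key_offset = 0,
+ *	.filter_on_match = 1,
+ *	.port_id = 0,
+ * };
+ * struct rte_port_in_action_profile *ap;
+ *
+ * ap = rte_port_in_action_profile_create(0);
+ * rte_port_in_action_profile_action_register(ap,
+ *	RTE_PORT_IN_ACTION_FLTR, &fltr_cfg);
+ * rte_port_in_action_profile_freeze(ap);
+ * @endcode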
+ * + * @see rte_port_in_action_create() + */ +int __rte_experimental +rte_port_in_action_profile_freeze(struct rte_port_in_action_profile *profile); + +/** + * Input port action. + */ +struct rte_port_in_action; + +/** + * Input port action create. + * + * Instantiates the given input port action profile to create an input port + * action object. + * + * @param[in] profile + * Input port profile action handle (needs to be valid and in frozen state). + * @param[in] socket_id + * CPU socket ID where the internal data structures required by the new input + * port action object should be allocated. + * @return + * Handle to input port action object on success, NULL on error. + */ +struct rte_port_in_action * __rte_experimental +rte_port_in_action_create(struct rte_port_in_action_profile *profile, + uint32_t socket_id); + +/** + * Input port action free. + * + * @param[in] action + * Handle to input port action object (needs to be valid). + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_port_in_action_free(struct rte_port_in_action *action); + +/** + * Input port params get. + * + * @param[in] action + * Handle to input port action object (needs to be valid). + * @param[inout] params + * Pipeline input port parameters (needs to be pre-allocated). + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_port_in_action_params_get(struct rte_port_in_action *action, + struct rte_pipeline_port_in_params *params); + +/** + * Input port action apply. + * + * @param[in] action + * Handle to input port action object (needs to be valid). + * @param[in] type + * Specific input port action previously registered for the input port action + * profile of the *action* object. + * @param[in] action_params + * Parameters for the *type* action. + * If struct rte_port_in_action_*type*_params is defined, it needs to point to + * a valid instance of this structure, otherwise it needs to be set to NULL. + * @return + * Zero on success, non-zero error code otherwise. 
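+ *
+ * For example (a hedged sketch; the port ID is illustrative), re-pointing a
+ * previously registered filter action at pipeline output port 3:
+ *
+ * @code
+ * struct rte_port_in_action_fltr_params fltr_params = {
+ *	.port_id = 3,
+ * };
+ *
+ * rte_port_in_action_apply(action, RTE_PORT_IN_ACTION_FLTR, &fltr_params);
+ * @endcode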
+ */ +int __rte_experimental +rte_port_in_action_apply(struct rte_port_in_action *action, + enum rte_port_in_action_type type, + void *action_params); + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_PORT_IN_ACTION_H__ */ diff --git a/src/spdk/dpdk/lib/librte_pipeline/rte_table_action.c b/src/spdk/dpdk/lib/librte_pipeline/rte_table_action.c new file mode 100644 index 00000000..83ffa5de --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/rte_table_action.c @@ -0,0 +1,2386 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_table_action.h" + +#define rte_htons rte_cpu_to_be_16 +#define rte_htonl rte_cpu_to_be_32 + +#define rte_ntohs rte_be_to_cpu_16 +#define rte_ntohl rte_be_to_cpu_32 + +/** + * RTE_TABLE_ACTION_FWD + */ +#define fwd_data rte_pipeline_table_entry + +static int +fwd_apply(struct fwd_data *data, + struct rte_table_action_fwd_params *p) +{ + data->action = p->action; + + if (p->action == RTE_PIPELINE_ACTION_PORT) + data->port_id = p->id; + + if (p->action == RTE_PIPELINE_ACTION_TABLE) + data->table_id = p->id; + + return 0; +} + +/** + * RTE_TABLE_ACTION_LB + */ +static int +lb_cfg_check(struct rte_table_action_lb_config *cfg) +{ + if ((cfg == NULL) || + (cfg->key_size < RTE_TABLE_ACTION_LB_KEY_SIZE_MIN) || + (cfg->key_size > RTE_TABLE_ACTION_LB_KEY_SIZE_MAX) || + (!rte_is_power_of_2(cfg->key_size)) || + (cfg->f_hash == NULL)) + return -1; + + return 0; +} + +struct lb_data { + uint32_t out[RTE_TABLE_ACTION_LB_TABLE_SIZE]; +} __attribute__((__packed__)); + +static int +lb_apply(struct lb_data *data, + struct rte_table_action_lb_params *p) +{ + memcpy(data->out, p->out, sizeof(data->out)); + + return 0; +} + +static __rte_always_inline void +pkt_work_lb(struct rte_mbuf *mbuf, + struct lb_data *data, + struct rte_table_action_lb_config *cfg) +{ + uint8_t *pkt_key = RTE_MBUF_METADATA_UINT8_PTR(mbuf, cfg->key_offset); + uint32_t *out = RTE_MBUF_METADATA_UINT32_PTR(mbuf, cfg->out_offset); + uint64_t digest, pos; + uint32_t out_val; + + digest = cfg->f_hash(pkt_key, + cfg->key_mask, + cfg->key_size, + cfg->seed); + pos = digest & (RTE_TABLE_ACTION_LB_TABLE_SIZE - 1); + out_val = data->out[pos]; + + *out = out_val; +} + +/** + * RTE_TABLE_ACTION_MTR + */ +static int +mtr_cfg_check(struct rte_table_action_mtr_config *mtr) +{ + if ((mtr->alg == RTE_TABLE_ACTION_METER_SRTCM) || + ((mtr->n_tc != 1) && (mtr->n_tc != 4)) || + (mtr->n_bytes_enabled != 0)) + return -ENOTSUP; + return 0; +} + +#define MBUF_SCHED_QUEUE_TC_COLOR(queue, tc, color) \ + ((uint16_t)((((uint64_t)(queue)) & 0x3) | \ + ((((uint64_t)(tc)) & 0x3) << 2) | \ + ((((uint64_t)(color)) & 0x3) << 4))) + +#define MBUF_SCHED_COLOR(sched, color) \ + (((sched) & (~0x30LLU)) | ((color) << 4)) + +struct mtr_trtcm_data { + struct rte_meter_trtcm trtcm; + uint64_t stats[e_RTE_METER_COLORS]; +} __attribute__((__packed__)); + +#define MTR_TRTCM_DATA_METER_PROFILE_ID_GET(data) \ + (((data)->stats[e_RTE_METER_GREEN] & 0xF8LLU) >> 3) + +static void +mtr_trtcm_data_meter_profile_id_set(struct mtr_trtcm_data *data, + uint32_t profile_id) +{ + data->stats[e_RTE_METER_GREEN] &= ~0xF8LLU; + data->stats[e_RTE_METER_GREEN] |= (profile_id % 32) << 3; +} + +#define MTR_TRTCM_DATA_POLICER_ACTION_DROP_GET(data, color)\ + (((data)->stats[(color)] & 4LLU) >> 2) + +#define MTR_TRTCM_DATA_POLICER_ACTION_COLOR_GET(data, color)\ + ((enum 
rte_meter_color)((data)->stats[(color)] & 3LLU)) + +static void +mtr_trtcm_data_policer_action_set(struct mtr_trtcm_data *data, + enum rte_meter_color color, + enum rte_table_action_policer action) +{ + if (action == RTE_TABLE_ACTION_POLICER_DROP) { + data->stats[color] |= 4LLU; + } else { + data->stats[color] &= ~7LLU; + data->stats[color] |= color & 3LLU; + } +} + +static uint64_t +mtr_trtcm_data_stats_get(struct mtr_trtcm_data *data, + enum rte_meter_color color) +{ + return data->stats[color] >> 8; +} + +static void +mtr_trtcm_data_stats_reset(struct mtr_trtcm_data *data, + enum rte_meter_color color) +{ + data->stats[color] &= 0xFFLU; +} + +#define MTR_TRTCM_DATA_STATS_INC(data, color) \ + ((data)->stats[(color)] += (1LLU << 8)) + +static size_t +mtr_data_size(struct rte_table_action_mtr_config *mtr) +{ + return mtr->n_tc * sizeof(struct mtr_trtcm_data); +} + +struct dscp_table_entry_data { + enum rte_meter_color color; + uint16_t tc; + uint16_t queue_tc_color; +}; + +struct dscp_table_data { + struct dscp_table_entry_data entry[64]; +}; + +struct meter_profile_data { + struct rte_meter_trtcm_profile profile; + uint32_t profile_id; + int valid; +}; + +static struct meter_profile_data * +meter_profile_data_find(struct meter_profile_data *mp, + uint32_t mp_size, + uint32_t profile_id) +{ + uint32_t i; + + for (i = 0; i < mp_size; i++) { + struct meter_profile_data *mp_data = &mp[i]; + + if (mp_data->valid && (mp_data->profile_id == profile_id)) + return mp_data; + } + + return NULL; +} + +static struct meter_profile_data * +meter_profile_data_find_unused(struct meter_profile_data *mp, + uint32_t mp_size) +{ + uint32_t i; + + for (i = 0; i < mp_size; i++) { + struct meter_profile_data *mp_data = &mp[i]; + + if (!mp_data->valid) + return mp_data; + } + + return NULL; +} + +static int +mtr_apply_check(struct rte_table_action_mtr_params *p, + struct rte_table_action_mtr_config *cfg, + struct meter_profile_data *mp, + uint32_t mp_size) +{ + uint32_t i; + + if (p->tc_mask > RTE_LEN2MASK(cfg->n_tc, uint32_t)) + return -EINVAL; + + for (i = 0; i < RTE_TABLE_ACTION_TC_MAX; i++) { + struct rte_table_action_mtr_tc_params *p_tc = &p->mtr[i]; + struct meter_profile_data *mp_data; + + if ((p->tc_mask & (1LLU << i)) == 0) + continue; + + mp_data = meter_profile_data_find(mp, + mp_size, + p_tc->meter_profile_id); + if (!mp_data) + return -EINVAL; + } + + return 0; +} + +static int +mtr_apply(struct mtr_trtcm_data *data, + struct rte_table_action_mtr_params *p, + struct rte_table_action_mtr_config *cfg, + struct meter_profile_data *mp, + uint32_t mp_size) +{ + uint32_t i; + int status; + + /* Check input arguments */ + status = mtr_apply_check(p, cfg, mp, mp_size); + if (status) + return status; + + /* Apply */ + for (i = 0; i < RTE_TABLE_ACTION_TC_MAX; i++) { + struct rte_table_action_mtr_tc_params *p_tc = &p->mtr[i]; + struct mtr_trtcm_data *data_tc = &data[i]; + struct meter_profile_data *mp_data; + + if ((p->tc_mask & (1LLU << i)) == 0) + continue; + + /* Find profile */ + mp_data = meter_profile_data_find(mp, + mp_size, + p_tc->meter_profile_id); + if (!mp_data) + return -EINVAL; + + memset(data_tc, 0, sizeof(*data_tc)); + + /* Meter object */ + status = rte_meter_trtcm_config(&data_tc->trtcm, + &mp_data->profile); + if (status) + return status; + + /* Meter profile */ + mtr_trtcm_data_meter_profile_id_set(data_tc, + mp_data - mp); + + /* Policer actions */ + mtr_trtcm_data_policer_action_set(data_tc, + e_RTE_METER_GREEN, + p_tc->policer[e_RTE_METER_GREEN]); + + 
mtr_trtcm_data_policer_action_set(data_tc, + e_RTE_METER_YELLOW, + p_tc->policer[e_RTE_METER_YELLOW]); + + mtr_trtcm_data_policer_action_set(data_tc, + e_RTE_METER_RED, + p_tc->policer[e_RTE_METER_RED]); + } + + return 0; +} + +static __rte_always_inline uint64_t +pkt_work_mtr(struct rte_mbuf *mbuf, + struct mtr_trtcm_data *data, + struct dscp_table_data *dscp_table, + struct meter_profile_data *mp, + uint64_t time, + uint32_t dscp, + uint16_t total_length) +{ + uint64_t drop_mask, sched; + uint64_t *sched_ptr = (uint64_t *) &mbuf->hash.sched; + struct dscp_table_entry_data *dscp_entry = &dscp_table->entry[dscp]; + enum rte_meter_color color_in, color_meter, color_policer; + uint32_t tc, mp_id; + + tc = dscp_entry->tc; + color_in = dscp_entry->color; + data += tc; + mp_id = MTR_TRTCM_DATA_METER_PROFILE_ID_GET(data); + sched = *sched_ptr; + + /* Meter */ + color_meter = rte_meter_trtcm_color_aware_check( + &data->trtcm, + &mp[mp_id].profile, + time, + total_length, + color_in); + + /* Stats */ + MTR_TRTCM_DATA_STATS_INC(data, color_meter); + + /* Police */ + drop_mask = MTR_TRTCM_DATA_POLICER_ACTION_DROP_GET(data, color_meter); + color_policer = + MTR_TRTCM_DATA_POLICER_ACTION_COLOR_GET(data, color_meter); + *sched_ptr = MBUF_SCHED_COLOR(sched, color_policer); + + return drop_mask; +} + +/** + * RTE_TABLE_ACTION_TM + */ +static int +tm_cfg_check(struct rte_table_action_tm_config *tm) +{ + if ((tm->n_subports_per_port == 0) || + (rte_is_power_of_2(tm->n_subports_per_port) == 0) || + (tm->n_subports_per_port > UINT16_MAX) || + (tm->n_pipes_per_subport == 0) || + (rte_is_power_of_2(tm->n_pipes_per_subport) == 0)) + return -ENOTSUP; + + return 0; +} + +struct tm_data { + uint16_t queue_tc_color; + uint16_t subport; + uint32_t pipe; +} __attribute__((__packed__)); + +static int +tm_apply_check(struct rte_table_action_tm_params *p, + struct rte_table_action_tm_config *cfg) +{ + if ((p->subport_id >= cfg->n_subports_per_port) || + (p->pipe_id >= cfg->n_pipes_per_subport)) + return -EINVAL; + + return 0; +} + +static int +tm_apply(struct tm_data *data, + struct rte_table_action_tm_params *p, + struct rte_table_action_tm_config *cfg) +{ + int status; + + /* Check input arguments */ + status = tm_apply_check(p, cfg); + if (status) + return status; + + /* Apply */ + data->queue_tc_color = 0; + data->subport = (uint16_t) p->subport_id; + data->pipe = p->pipe_id; + + return 0; +} + +static __rte_always_inline void +pkt_work_tm(struct rte_mbuf *mbuf, + struct tm_data *data, + struct dscp_table_data *dscp_table, + uint32_t dscp) +{ + struct dscp_table_entry_data *dscp_entry = &dscp_table->entry[dscp]; + struct tm_data *sched_ptr = (struct tm_data *) &mbuf->hash.sched; + struct tm_data sched; + + sched = *data; + sched.queue_tc_color = dscp_entry->queue_tc_color; + *sched_ptr = sched; +} + +/** + * RTE_TABLE_ACTION_ENCAP + */ +static int +encap_valid(enum rte_table_action_encap_type encap) +{ + switch (encap) { + case RTE_TABLE_ACTION_ENCAP_ETHER: + case RTE_TABLE_ACTION_ENCAP_VLAN: + case RTE_TABLE_ACTION_ENCAP_QINQ: + case RTE_TABLE_ACTION_ENCAP_MPLS: + case RTE_TABLE_ACTION_ENCAP_PPPOE: + return 1; + default: + return 0; + } +} + +static int +encap_cfg_check(struct rte_table_action_encap_config *encap) +{ + if ((encap->encap_mask == 0) || + (__builtin_popcountll(encap->encap_mask) != 1)) + return -ENOTSUP; + + return 0; +} + +struct encap_ether_data { + struct ether_hdr ether; +} __attribute__((__packed__)); + +#define VLAN(pcp, dei, vid) \ + ((uint16_t)((((uint64_t)(pcp)) & 0x7LLU) << 13) | \ + 
((((uint64_t)(dei)) & 0x1LLU) << 12) | \ + (((uint64_t)(vid)) & 0xFFFLLU)) \ + +struct encap_vlan_data { + struct ether_hdr ether; + struct vlan_hdr vlan; +} __attribute__((__packed__)); + +struct encap_qinq_data { + struct ether_hdr ether; + struct vlan_hdr svlan; + struct vlan_hdr cvlan; +} __attribute__((__packed__)); + +#define ETHER_TYPE_MPLS_UNICAST 0x8847 + +#define ETHER_TYPE_MPLS_MULTICAST 0x8848 + +#define MPLS(label, tc, s, ttl) \ + ((uint32_t)(((((uint64_t)(label)) & 0xFFFFFLLU) << 12) |\ + ((((uint64_t)(tc)) & 0x7LLU) << 9) | \ + ((((uint64_t)(s)) & 0x1LLU) << 8) | \ + (((uint64_t)(ttl)) & 0xFFLLU))) + +struct encap_mpls_data { + struct ether_hdr ether; + uint32_t mpls[RTE_TABLE_ACTION_MPLS_LABELS_MAX]; + uint32_t mpls_count; +} __attribute__((__packed__)); + +#define ETHER_TYPE_PPPOE_SESSION 0x8864 + +#define PPP_PROTOCOL_IP 0x0021 + +struct pppoe_ppp_hdr { + uint16_t ver_type_code; + uint16_t session_id; + uint16_t length; + uint16_t protocol; +} __attribute__((__packed__)); + +struct encap_pppoe_data { + struct ether_hdr ether; + struct pppoe_ppp_hdr pppoe_ppp; +} __attribute__((__packed__)); + +static size_t +encap_data_size(struct rte_table_action_encap_config *encap) +{ + switch (encap->encap_mask) { + case 1LLU << RTE_TABLE_ACTION_ENCAP_ETHER: + return sizeof(struct encap_ether_data); + + case 1LLU << RTE_TABLE_ACTION_ENCAP_VLAN: + return sizeof(struct encap_vlan_data); + + case 1LLU << RTE_TABLE_ACTION_ENCAP_QINQ: + return sizeof(struct encap_qinq_data); + + case 1LLU << RTE_TABLE_ACTION_ENCAP_MPLS: + return sizeof(struct encap_mpls_data); + + case 1LLU << RTE_TABLE_ACTION_ENCAP_PPPOE: + return sizeof(struct encap_pppoe_data); + + default: + return 0; + } +} + +static int +encap_apply_check(struct rte_table_action_encap_params *p, + struct rte_table_action_encap_config *cfg) +{ + if ((encap_valid(p->type) == 0) || + ((cfg->encap_mask & (1LLU << p->type)) == 0)) + return -EINVAL; + + switch (p->type) { + case RTE_TABLE_ACTION_ENCAP_ETHER: + return 0; + + case RTE_TABLE_ACTION_ENCAP_VLAN: + return 0; + + case RTE_TABLE_ACTION_ENCAP_QINQ: + return 0; + + case RTE_TABLE_ACTION_ENCAP_MPLS: + if ((p->mpls.mpls_count == 0) || + (p->mpls.mpls_count > RTE_TABLE_ACTION_MPLS_LABELS_MAX)) + return -EINVAL; + + return 0; + + case RTE_TABLE_ACTION_ENCAP_PPPOE: + return 0; + + default: + return -EINVAL; + } +} + +static int +encap_ether_apply(void *data, + struct rte_table_action_encap_params *p, + struct rte_table_action_common_config *common_cfg) +{ + struct encap_ether_data *d = data; + uint16_t ethertype = (common_cfg->ip_version) ? + ETHER_TYPE_IPv4 : + ETHER_TYPE_IPv6; + + /* Ethernet */ + ether_addr_copy(&p->ether.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->ether.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ethertype); + + return 0; +} + +static int +encap_vlan_apply(void *data, + struct rte_table_action_encap_params *p, + struct rte_table_action_common_config *common_cfg) +{ + struct encap_vlan_data *d = data; + uint16_t ethertype = (common_cfg->ip_version) ? 
+ ETHER_TYPE_IPv4 : + ETHER_TYPE_IPv6; + + /* Ethernet */ + ether_addr_copy(&p->vlan.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->vlan.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ETHER_TYPE_VLAN); + + /* VLAN */ + d->vlan.vlan_tci = rte_htons(VLAN(p->vlan.vlan.pcp, + p->vlan.vlan.dei, + p->vlan.vlan.vid)); + d->vlan.eth_proto = rte_htons(ethertype); + + return 0; +} + +static int +encap_qinq_apply(void *data, + struct rte_table_action_encap_params *p, + struct rte_table_action_common_config *common_cfg) +{ + struct encap_qinq_data *d = data; + uint16_t ethertype = (common_cfg->ip_version) ? + ETHER_TYPE_IPv4 : + ETHER_TYPE_IPv6; + + /* Ethernet */ + ether_addr_copy(&p->qinq.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->qinq.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ETHER_TYPE_QINQ); + + /* SVLAN */ + d->svlan.vlan_tci = rte_htons(VLAN(p->qinq.svlan.pcp, + p->qinq.svlan.dei, + p->qinq.svlan.vid)); + d->svlan.eth_proto = rte_htons(ETHER_TYPE_VLAN); + + /* CVLAN */ + d->cvlan.vlan_tci = rte_htons(VLAN(p->qinq.cvlan.pcp, + p->qinq.cvlan.dei, + p->qinq.cvlan.vid)); + d->cvlan.eth_proto = rte_htons(ethertype); + + return 0; +} + +static int +encap_mpls_apply(void *data, + struct rte_table_action_encap_params *p) +{ + struct encap_mpls_data *d = data; + uint16_t ethertype = (p->mpls.unicast) ? + ETHER_TYPE_MPLS_UNICAST : + ETHER_TYPE_MPLS_MULTICAST; + uint32_t i; + + /* Ethernet */ + ether_addr_copy(&p->mpls.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->mpls.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ethertype); + + /* MPLS */ + for (i = 0; i < p->mpls.mpls_count - 1; i++) + d->mpls[i] = rte_htonl(MPLS(p->mpls.mpls[i].label, + p->mpls.mpls[i].tc, + 0, + p->mpls.mpls[i].ttl)); + + d->mpls[i] = rte_htonl(MPLS(p->mpls.mpls[i].label, + p->mpls.mpls[i].tc, + 1, + p->mpls.mpls[i].ttl)); + + d->mpls_count = p->mpls.mpls_count; + return 0; +} + +static int +encap_pppoe_apply(void *data, + struct rte_table_action_encap_params *p) +{ + struct encap_pppoe_data *d = data; + + /* Ethernet */ + ether_addr_copy(&p->pppoe.ether.da, &d->ether.d_addr); + ether_addr_copy(&p->pppoe.ether.sa, &d->ether.s_addr); + d->ether.ether_type = rte_htons(ETHER_TYPE_PPPOE_SESSION); + + /* PPPoE and PPP*/ + d->pppoe_ppp.ver_type_code = rte_htons(0x1100); + d->pppoe_ppp.session_id = rte_htons(p->pppoe.pppoe.session_id); + d->pppoe_ppp.length = 0; /* not pre-computed */ + d->pppoe_ppp.protocol = rte_htons(PPP_PROTOCOL_IP); + + return 0; +} + +static int +encap_apply(void *data, + struct rte_table_action_encap_params *p, + struct rte_table_action_encap_config *cfg, + struct rte_table_action_common_config *common_cfg) +{ + int status; + + /* Check input arguments */ + status = encap_apply_check(p, cfg); + if (status) + return status; + + switch (p->type) { + case RTE_TABLE_ACTION_ENCAP_ETHER: + return encap_ether_apply(data, p, common_cfg); + + case RTE_TABLE_ACTION_ENCAP_VLAN: + return encap_vlan_apply(data, p, common_cfg); + + case RTE_TABLE_ACTION_ENCAP_QINQ: + return encap_qinq_apply(data, p, common_cfg); + + case RTE_TABLE_ACTION_ENCAP_MPLS: + return encap_mpls_apply(data, p); + + case RTE_TABLE_ACTION_ENCAP_PPPOE: + return encap_pppoe_apply(data, p); + + default: + return -EINVAL; + } +} + +static __rte_always_inline void * +encap(void *dst, const void *src, size_t n) +{ + dst = ((uint8_t *) dst) - n; + return rte_memcpy(dst, src, n); +} + +static __rte_always_inline void +pkt_work_encap(struct rte_mbuf *mbuf, + void *data, + struct 
rte_table_action_encap_config *cfg, + void *ip, + uint16_t total_length, + uint32_t ip_offset) +{ + switch (cfg->encap_mask) { + case 1LLU << RTE_TABLE_ACTION_ENCAP_ETHER: + encap(ip, data, sizeof(struct encap_ether_data)); + mbuf->data_off = ip_offset - (sizeof(struct rte_mbuf) + + sizeof(struct encap_ether_data)); + mbuf->pkt_len = mbuf->data_len = total_length + + sizeof(struct encap_ether_data); + break; + + case 1LLU << RTE_TABLE_ACTION_ENCAP_VLAN: + encap(ip, data, sizeof(struct encap_vlan_data)); + mbuf->data_off = ip_offset - (sizeof(struct rte_mbuf) + + sizeof(struct encap_vlan_data)); + mbuf->pkt_len = mbuf->data_len = total_length + + sizeof(struct encap_vlan_data); + break; + + case 1LLU << RTE_TABLE_ACTION_ENCAP_QINQ: + encap(ip, data, sizeof(struct encap_qinq_data)); + mbuf->data_off = ip_offset - (sizeof(struct rte_mbuf) + + sizeof(struct encap_qinq_data)); + mbuf->pkt_len = mbuf->data_len = total_length + + sizeof(struct encap_qinq_data); + break; + + case 1LLU << RTE_TABLE_ACTION_ENCAP_MPLS: + { + struct encap_mpls_data *mpls = data; + size_t size = sizeof(struct ether_hdr) + + mpls->mpls_count * 4; + + encap(ip, data, size); + mbuf->data_off = ip_offset - (sizeof(struct rte_mbuf) + size); + mbuf->pkt_len = mbuf->data_len = total_length + size; + break; + } + + case 1LLU << RTE_TABLE_ACTION_ENCAP_PPPOE: + { + struct encap_pppoe_data *pppoe = + encap(ip, data, sizeof(struct encap_pppoe_data)); + pppoe->pppoe_ppp.length = rte_htons(total_length + 2); + mbuf->data_off = ip_offset - (sizeof(struct rte_mbuf) + + sizeof(struct encap_pppoe_data)); + mbuf->pkt_len = mbuf->data_len = total_length + + sizeof(struct encap_pppoe_data); + break; + } + + default: + break; + } +} + +/** + * RTE_TABLE_ACTION_NAT + */ +static int +nat_cfg_check(struct rte_table_action_nat_config *nat) +{ + if ((nat->proto != 0x06) && + (nat->proto != 0x11)) + return -ENOTSUP; + + return 0; +} + +struct nat_ipv4_data { + uint32_t addr; + uint16_t port; +} __attribute__((__packed__)); + +struct nat_ipv6_data { + uint8_t addr[16]; + uint16_t port; +} __attribute__((__packed__)); + +static size_t +nat_data_size(struct rte_table_action_nat_config *nat __rte_unused, + struct rte_table_action_common_config *common) +{ + int ip_version = common->ip_version; + + return (ip_version) ? 
+ sizeof(struct nat_ipv4_data) : + sizeof(struct nat_ipv6_data); +} + +static int +nat_apply_check(struct rte_table_action_nat_params *p, + struct rte_table_action_common_config *cfg) +{ + if ((p->ip_version && (cfg->ip_version == 0)) || + ((p->ip_version == 0) && cfg->ip_version)) + return -EINVAL; + + return 0; +} + +static int +nat_apply(void *data, + struct rte_table_action_nat_params *p, + struct rte_table_action_common_config *cfg) +{ + int status; + + /* Check input arguments */ + status = nat_apply_check(p, cfg); + if (status) + return status; + + /* Apply */ + if (p->ip_version) { + struct nat_ipv4_data *d = data; + + d->addr = rte_htonl(p->addr.ipv4); + d->port = rte_htons(p->port); + } else { + struct nat_ipv6_data *d = data; + + memcpy(d->addr, p->addr.ipv6, sizeof(d->addr)); + d->port = rte_htons(p->port); + } + + return 0; +} + +static __rte_always_inline uint16_t +nat_ipv4_checksum_update(uint16_t cksum0, + uint32_t ip0, + uint32_t ip1) +{ + int32_t cksum1; + + cksum1 = cksum0; + cksum1 = ~cksum1 & 0xFFFF; + + /* Subtract ip0 (one's complement logic) */ + cksum1 -= (ip0 >> 16) + (ip0 & 0xFFFF); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + + /* Add ip1 (one's complement logic) */ + cksum1 += (ip1 >> 16) + (ip1 & 0xFFFF); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + + return (uint16_t)(~cksum1); +} + +static __rte_always_inline uint16_t +nat_ipv4_tcp_udp_checksum_update(uint16_t cksum0, + uint32_t ip0, + uint32_t ip1, + uint16_t port0, + uint16_t port1) +{ + int32_t cksum1; + + cksum1 = cksum0; + cksum1 = ~cksum1 & 0xFFFF; + + /* Subtract ip0 and port 0 (one's complement logic) */ + cksum1 -= (ip0 >> 16) + (ip0 & 0xFFFF) + port0; + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + + /* Add ip1 and port1 (one's complement logic) */ + cksum1 += (ip1 >> 16) + (ip1 & 0xFFFF) + port1; + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + + return (uint16_t)(~cksum1); +} + +static __rte_always_inline uint16_t +nat_ipv6_tcp_udp_checksum_update(uint16_t cksum0, + uint16_t *ip0, + uint16_t *ip1, + uint16_t port0, + uint16_t port1) +{ + int32_t cksum1; + + cksum1 = cksum0; + cksum1 = ~cksum1 & 0xFFFF; + + /* Subtract ip0 and port 0 (one's complement logic) */ + cksum1 -= ip0[0] + ip0[1] + ip0[2] + ip0[3] + + ip0[4] + ip0[5] + ip0[6] + ip0[7] + port0; + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + + /* Add ip1 and port1 (one's complement logic) */ + cksum1 += ip1[0] + ip1[1] + ip1[2] + ip1[3] + + ip1[4] + ip1[5] + ip1[6] + ip1[7] + port1; + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + cksum1 = (cksum1 & 0xFFFF) + (cksum1 >> 16); + + return (uint16_t)(~cksum1); +} + +static __rte_always_inline void +pkt_ipv4_work_nat(struct ipv4_hdr *ip, + struct nat_ipv4_data *data, + struct rte_table_action_nat_config *cfg) +{ + if (cfg->source_nat) { + if (cfg->proto == 0x6) { + struct tcp_hdr *tcp = (struct tcp_hdr *) &ip[1]; + uint16_t ip_cksum, tcp_cksum; + + ip_cksum = nat_ipv4_checksum_update(ip->hdr_checksum, + ip->src_addr, + data->addr); + + tcp_cksum = nat_ipv4_tcp_udp_checksum_update(tcp->cksum, + ip->src_addr, + data->addr, + tcp->src_port, + data->port); + + ip->src_addr = data->addr; + ip->hdr_checksum = ip_cksum; + tcp->src_port = data->port; + tcp->cksum = tcp_cksum; + } else { + struct udp_hdr *udp = (struct udp_hdr *) &ip[1]; + uint16_t ip_cksum, 
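/*
 * A minimal standalone sketch (not from the upstream sources) of the
 * incremental one's-complement checksum update that the
 * nat_ipv4_checksum_update() and nat_ipv4_tcp_udp_checksum_update() helpers
 * above perform: instead of summing the whole header again, the old field
 * value is removed from the existing checksum and the new value is added
 * (RFC 1624 form, applied per 16-bit word). All demo_* names, the header
 * bytes and the rewritten address below are made-up demo values.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Full Internet checksum over a buffer of big-endian 16-bit words. */
static uint16_t demo_cksum(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	for (; len > 1; p += 2, len -= 2)
		sum += ((uint32_t)p[0] << 8) | p[1];
	if (len)
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Incremental update: 16-bit word old16 is replaced by new16 (RFC 1624). */
static uint16_t demo_cksum_update(uint16_t cksum, uint16_t old16, uint16_t new16)
{
	uint32_t sum = (uint16_t)~cksum;

	sum += (uint32_t)(uint16_t)~old16 + new16;
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* 20-byte IPv4 header; the checksum field (bytes 10-11) starts at 0. */
	uint8_t hdr[20] = {
		0x45, 0x00, 0x00, 0x3C, 0x1C, 0x46, 0x40, 0x00,
		0x40, 0x06, 0x00, 0x00, 0xC0, 0xA8, 0x00, 0x01,
		0xC0, 0xA8, 0x00, 0xC7,
	};
	uint16_t cksum = demo_cksum(hdr, sizeof(hdr));

	/* Source NAT rewrites 192.168.0.1 -> 10.0.0.5 (two 16-bit words). */
	cksum = demo_cksum_update(cksum, 0xC0A8, 0x0A00);
	cksum = demo_cksum_update(cksum, 0x0001, 0x0005);
	hdr[12] = 0x0A; hdr[13] = 0x00; hdr[14] = 0x00; hdr[15] = 0x05;

	/* The incremental result matches a full recomputation. */
	assert(cksum == demo_cksum(hdr, sizeof(hdr)));
	printf("updated checksum: 0x%04X\n", cksum);
	return 0;
}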
udp_cksum; + + ip_cksum = nat_ipv4_checksum_update(ip->hdr_checksum, + ip->src_addr, + data->addr); + + udp_cksum = nat_ipv4_tcp_udp_checksum_update(udp->dgram_cksum, + ip->src_addr, + data->addr, + udp->src_port, + data->port); + + ip->src_addr = data->addr; + ip->hdr_checksum = ip_cksum; + udp->src_port = data->port; + if (udp->dgram_cksum) + udp->dgram_cksum = udp_cksum; + } + } else { + if (cfg->proto == 0x6) { + struct tcp_hdr *tcp = (struct tcp_hdr *) &ip[1]; + uint16_t ip_cksum, tcp_cksum; + + ip_cksum = nat_ipv4_checksum_update(ip->hdr_checksum, + ip->dst_addr, + data->addr); + + tcp_cksum = nat_ipv4_tcp_udp_checksum_update(tcp->cksum, + ip->dst_addr, + data->addr, + tcp->dst_port, + data->port); + + ip->dst_addr = data->addr; + ip->hdr_checksum = ip_cksum; + tcp->dst_port = data->port; + tcp->cksum = tcp_cksum; + } else { + struct udp_hdr *udp = (struct udp_hdr *) &ip[1]; + uint16_t ip_cksum, udp_cksum; + + ip_cksum = nat_ipv4_checksum_update(ip->hdr_checksum, + ip->dst_addr, + data->addr); + + udp_cksum = nat_ipv4_tcp_udp_checksum_update(udp->dgram_cksum, + ip->dst_addr, + data->addr, + udp->dst_port, + data->port); + + ip->dst_addr = data->addr; + ip->hdr_checksum = ip_cksum; + udp->dst_port = data->port; + if (udp->dgram_cksum) + udp->dgram_cksum = udp_cksum; + } + } +} + +static __rte_always_inline void +pkt_ipv6_work_nat(struct ipv6_hdr *ip, + struct nat_ipv6_data *data, + struct rte_table_action_nat_config *cfg) +{ + if (cfg->source_nat) { + if (cfg->proto == 0x6) { + struct tcp_hdr *tcp = (struct tcp_hdr *) &ip[1]; + uint16_t tcp_cksum; + + tcp_cksum = nat_ipv6_tcp_udp_checksum_update(tcp->cksum, + (uint16_t *)ip->src_addr, + (uint16_t *)data->addr, + tcp->src_port, + data->port); + + rte_memcpy(ip->src_addr, data->addr, 16); + tcp->src_port = data->port; + tcp->cksum = tcp_cksum; + } else { + struct udp_hdr *udp = (struct udp_hdr *) &ip[1]; + uint16_t udp_cksum; + + udp_cksum = nat_ipv6_tcp_udp_checksum_update(udp->dgram_cksum, + (uint16_t *)ip->src_addr, + (uint16_t *)data->addr, + udp->src_port, + data->port); + + rte_memcpy(ip->src_addr, data->addr, 16); + udp->src_port = data->port; + udp->dgram_cksum = udp_cksum; + } + } else { + if (cfg->proto == 0x6) { + struct tcp_hdr *tcp = (struct tcp_hdr *) &ip[1]; + uint16_t tcp_cksum; + + tcp_cksum = nat_ipv6_tcp_udp_checksum_update(tcp->cksum, + (uint16_t *)ip->dst_addr, + (uint16_t *)data->addr, + tcp->dst_port, + data->port); + + rte_memcpy(ip->dst_addr, data->addr, 16); + tcp->dst_port = data->port; + tcp->cksum = tcp_cksum; + } else { + struct udp_hdr *udp = (struct udp_hdr *) &ip[1]; + uint16_t udp_cksum; + + udp_cksum = nat_ipv6_tcp_udp_checksum_update(udp->dgram_cksum, + (uint16_t *)ip->dst_addr, + (uint16_t *)data->addr, + udp->dst_port, + data->port); + + rte_memcpy(ip->dst_addr, data->addr, 16); + udp->dst_port = data->port; + udp->dgram_cksum = udp_cksum; + } + } +} + +/** + * RTE_TABLE_ACTION_TTL + */ +static int +ttl_cfg_check(struct rte_table_action_ttl_config *ttl) +{ + if (ttl->drop == 0) + return -ENOTSUP; + + return 0; +} + +struct ttl_data { + uint32_t n_packets; +} __attribute__((__packed__)); + +#define TTL_INIT(data, decrement) \ + ((data)->n_packets = (decrement) ? 
1 : 0) + +#define TTL_DEC_GET(data) \ + ((uint8_t)((data)->n_packets & 1)) + +#define TTL_STATS_RESET(data) \ + ((data)->n_packets = ((data)->n_packets & 1)) + +#define TTL_STATS_READ(data) \ + ((data)->n_packets >> 1) + +#define TTL_STATS_ADD(data, value) \ + ((data)->n_packets = \ + (((((data)->n_packets >> 1) + (value)) << 1) | \ + ((data)->n_packets & 1))) + +static int +ttl_apply(void *data, + struct rte_table_action_ttl_params *p) +{ + struct ttl_data *d = data; + + TTL_INIT(d, p->decrement); + + return 0; +} + +static __rte_always_inline uint64_t +pkt_ipv4_work_ttl(struct ipv4_hdr *ip, + struct ttl_data *data) +{ + uint32_t drop; + uint16_t cksum = ip->hdr_checksum; + uint8_t ttl = ip->time_to_live; + uint8_t ttl_diff = TTL_DEC_GET(data); + + cksum += ttl_diff; + ttl -= ttl_diff; + + ip->hdr_checksum = cksum; + ip->time_to_live = ttl; + + drop = (ttl == 0) ? 1 : 0; + TTL_STATS_ADD(data, drop); + + return drop; +} + +static __rte_always_inline uint64_t +pkt_ipv6_work_ttl(struct ipv6_hdr *ip, + struct ttl_data *data) +{ + uint32_t drop; + uint8_t ttl = ip->hop_limits; + uint8_t ttl_diff = TTL_DEC_GET(data); + + ttl -= ttl_diff; + + ip->hop_limits = ttl; + + drop = (ttl == 0) ? 1 : 0; + TTL_STATS_ADD(data, drop); + + return drop; +} + +/** + * RTE_TABLE_ACTION_STATS + */ +static int +stats_cfg_check(struct rte_table_action_stats_config *stats) +{ + if ((stats->n_packets_enabled == 0) && (stats->n_bytes_enabled == 0)) + return -EINVAL; + + return 0; +} + +struct stats_data { + uint64_t n_packets; + uint64_t n_bytes; +} __attribute__((__packed__)); + +static int +stats_apply(struct stats_data *data, + struct rte_table_action_stats_params *p) +{ + data->n_packets = p->n_packets; + data->n_bytes = p->n_bytes; + + return 0; +} + +static __rte_always_inline void +pkt_work_stats(struct stats_data *data, + uint16_t total_length) +{ + data->n_packets++; + data->n_bytes += total_length; +} + +/** + * RTE_TABLE_ACTION_TIME + */ +struct time_data { + uint64_t time; +} __attribute__((__packed__)); + +static int +time_apply(struct time_data *data, + struct rte_table_action_time_params *p) +{ + data->time = p->time; + return 0; +} + +static __rte_always_inline void +pkt_work_time(struct time_data *data, + uint64_t time) +{ + data->time = time; +} + +/** + * Action profile + */ +static int +action_valid(enum rte_table_action_type action) +{ + switch (action) { + case RTE_TABLE_ACTION_FWD: + case RTE_TABLE_ACTION_LB: + case RTE_TABLE_ACTION_MTR: + case RTE_TABLE_ACTION_TM: + case RTE_TABLE_ACTION_ENCAP: + case RTE_TABLE_ACTION_NAT: + case RTE_TABLE_ACTION_TTL: + case RTE_TABLE_ACTION_STATS: + case RTE_TABLE_ACTION_TIME: + return 1; + default: + return 0; + } +} + + +#define RTE_TABLE_ACTION_MAX 64 + +struct ap_config { + uint64_t action_mask; + struct rte_table_action_common_config common; + struct rte_table_action_lb_config lb; + struct rte_table_action_mtr_config mtr; + struct rte_table_action_tm_config tm; + struct rte_table_action_encap_config encap; + struct rte_table_action_nat_config nat; + struct rte_table_action_ttl_config ttl; + struct rte_table_action_stats_config stats; +}; + +static size_t +action_cfg_size(enum rte_table_action_type action) +{ + switch (action) { + case RTE_TABLE_ACTION_LB: + return sizeof(struct rte_table_action_lb_config); + case RTE_TABLE_ACTION_MTR: + return sizeof(struct rte_table_action_mtr_config); + case RTE_TABLE_ACTION_TM: + return sizeof(struct rte_table_action_tm_config); + case RTE_TABLE_ACTION_ENCAP: + return sizeof(struct rte_table_action_encap_config); + 
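/*
 * A standalone sketch (not from the upstream sources) of the bit packing
 * used by the TTL action above: ttl_data::n_packets keeps the per-rule
 * "decrement TTL" flag in bit 0 (TTL_INIT/TTL_DEC_GET) and the count of
 * dropped packets in the remaining bits (TTL_STATS_ADD/READ/RESET). The
 * demo_* names are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

struct demo_ttl_data {
	uint32_t n_packets; /* bit 0: decrement flag; bits 1..31: drop count */
};

static void demo_ttl_init(struct demo_ttl_data *d, int decrement)
{
	d->n_packets = decrement ? 1 : 0;
}

static uint8_t demo_ttl_dec_get(const struct demo_ttl_data *d)
{
	return (uint8_t)(d->n_packets & 1);
}

static void demo_ttl_stats_add(struct demo_ttl_data *d, uint32_t drops)
{
	d->n_packets = ((((d->n_packets >> 1) + drops) << 1) |
			(d->n_packets & 1));
}

static uint32_t demo_ttl_stats_read(const struct demo_ttl_data *d)
{
	return d->n_packets >> 1;
}

int main(void)
{
	struct demo_ttl_data d;

	demo_ttl_init(&d, 1);      /* this rule decrements the TTL/HL field */
	demo_ttl_stats_add(&d, 1); /* one packet whose TTL reached zero */
	demo_ttl_stats_add(&d, 1); /* and another one */

	assert(demo_ttl_dec_get(&d) == 1);
	assert(demo_ttl_stats_read(&d) == 2);
	return 0;
}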
case RTE_TABLE_ACTION_NAT: + return sizeof(struct rte_table_action_nat_config); + case RTE_TABLE_ACTION_TTL: + return sizeof(struct rte_table_action_ttl_config); + case RTE_TABLE_ACTION_STATS: + return sizeof(struct rte_table_action_stats_config); + default: + return 0; + } +} + +static void* +action_cfg_get(struct ap_config *ap_config, + enum rte_table_action_type type) +{ + switch (type) { + case RTE_TABLE_ACTION_LB: + return &ap_config->lb; + + case RTE_TABLE_ACTION_MTR: + return &ap_config->mtr; + + case RTE_TABLE_ACTION_TM: + return &ap_config->tm; + + case RTE_TABLE_ACTION_ENCAP: + return &ap_config->encap; + + case RTE_TABLE_ACTION_NAT: + return &ap_config->nat; + + case RTE_TABLE_ACTION_TTL: + return &ap_config->ttl; + + case RTE_TABLE_ACTION_STATS: + return &ap_config->stats; + + default: + return NULL; + } +} + +static void +action_cfg_set(struct ap_config *ap_config, + enum rte_table_action_type type, + void *action_cfg) +{ + void *dst = action_cfg_get(ap_config, type); + + if (dst) + memcpy(dst, action_cfg, action_cfg_size(type)); + + ap_config->action_mask |= 1LLU << type; +} + +struct ap_data { + size_t offset[RTE_TABLE_ACTION_MAX]; + size_t total_size; +}; + +static size_t +action_data_size(enum rte_table_action_type action, + struct ap_config *ap_config) +{ + switch (action) { + case RTE_TABLE_ACTION_FWD: + return sizeof(struct fwd_data); + + case RTE_TABLE_ACTION_LB: + return sizeof(struct lb_data); + + case RTE_TABLE_ACTION_MTR: + return mtr_data_size(&ap_config->mtr); + + case RTE_TABLE_ACTION_TM: + return sizeof(struct tm_data); + + case RTE_TABLE_ACTION_ENCAP: + return encap_data_size(&ap_config->encap); + + case RTE_TABLE_ACTION_NAT: + return nat_data_size(&ap_config->nat, + &ap_config->common); + + case RTE_TABLE_ACTION_TTL: + return sizeof(struct ttl_data); + + case RTE_TABLE_ACTION_STATS: + return sizeof(struct stats_data); + + case RTE_TABLE_ACTION_TIME: + return sizeof(struct time_data); + + default: + return 0; + } +} + + +static void +action_data_offset_set(struct ap_data *ap_data, + struct ap_config *ap_config) +{ + uint64_t action_mask = ap_config->action_mask; + size_t offset; + uint32_t action; + + memset(ap_data->offset, 0, sizeof(ap_data->offset)); + + offset = 0; + for (action = 0; action < RTE_TABLE_ACTION_MAX; action++) + if (action_mask & (1LLU << action)) { + ap_data->offset[action] = offset; + offset += action_data_size((enum rte_table_action_type)action, + ap_config); + } + + ap_data->total_size = offset; +} + +struct rte_table_action_profile { + struct ap_config cfg; + struct ap_data data; + int frozen; +}; + +struct rte_table_action_profile * +rte_table_action_profile_create(struct rte_table_action_common_config *common) +{ + struct rte_table_action_profile *ap; + + /* Check input arguments */ + if (common == NULL) + return NULL; + + /* Memory allocation */ + ap = calloc(1, sizeof(struct rte_table_action_profile)); + if (ap == NULL) + return NULL; + + /* Initialization */ + memcpy(&ap->cfg.common, common, sizeof(*common)); + + return ap; +} + + +int +rte_table_action_profile_action_register(struct rte_table_action_profile *profile, + enum rte_table_action_type type, + void *action_config) +{ + int status; + + /* Check input arguments */ + if ((profile == NULL) || + profile->frozen || + (action_valid(type) == 0) || + (profile->cfg.action_mask & (1LLU << type)) || + ((action_cfg_size(type) == 0) && action_config) || + (action_cfg_size(type) && (action_config == NULL))) + return -EINVAL; + + switch (type) { + case RTE_TABLE_ACTION_LB: + status = 
lb_cfg_check(action_config); + break; + + case RTE_TABLE_ACTION_MTR: + status = mtr_cfg_check(action_config); + break; + + case RTE_TABLE_ACTION_TM: + status = tm_cfg_check(action_config); + break; + + case RTE_TABLE_ACTION_ENCAP: + status = encap_cfg_check(action_config); + break; + + case RTE_TABLE_ACTION_NAT: + status = nat_cfg_check(action_config); + break; + + case RTE_TABLE_ACTION_TTL: + status = ttl_cfg_check(action_config); + break; + + case RTE_TABLE_ACTION_STATS: + status = stats_cfg_check(action_config); + break; + + default: + status = 0; + break; + } + + if (status) + return status; + + /* Action enable */ + action_cfg_set(&profile->cfg, type, action_config); + + return 0; +} + +int +rte_table_action_profile_freeze(struct rte_table_action_profile *profile) +{ + if (profile->frozen) + return -EBUSY; + + profile->cfg.action_mask |= 1LLU << RTE_TABLE_ACTION_FWD; + action_data_offset_set(&profile->data, &profile->cfg); + profile->frozen = 1; + + return 0; +} + +int +rte_table_action_profile_free(struct rte_table_action_profile *profile) +{ + if (profile == NULL) + return 0; + + free(profile); + return 0; +} + +/** + * Action + */ +#define METER_PROFILES_MAX 32 + +struct rte_table_action { + struct ap_config cfg; + struct ap_data data; + struct dscp_table_data dscp_table; + struct meter_profile_data mp[METER_PROFILES_MAX]; +}; + +struct rte_table_action * +rte_table_action_create(struct rte_table_action_profile *profile, + uint32_t socket_id) +{ + struct rte_table_action *action; + + /* Check input arguments */ + if ((profile == NULL) || + (profile->frozen == 0)) + return NULL; + + /* Memory allocation */ + action = rte_zmalloc_socket(NULL, + sizeof(struct rte_table_action), + RTE_CACHE_LINE_SIZE, + socket_id); + if (action == NULL) + return NULL; + + /* Initialization */ + memcpy(&action->cfg, &profile->cfg, sizeof(profile->cfg)); + memcpy(&action->data, &profile->data, sizeof(profile->data)); + + return action; +} + +static __rte_always_inline void * +action_data_get(void *data, + struct rte_table_action *action, + enum rte_table_action_type type) +{ + size_t offset = action->data.offset[type]; + uint8_t *data_bytes = data; + + return &data_bytes[offset]; +} + +int +rte_table_action_apply(struct rte_table_action *action, + void *data, + enum rte_table_action_type type, + void *action_params) +{ + void *action_data; + + /* Check input arguments */ + if ((action == NULL) || + (data == NULL) || + (action_valid(type) == 0) || + ((action->cfg.action_mask & (1LLU << type)) == 0) || + (action_params == NULL)) + return -EINVAL; + + /* Data update */ + action_data = action_data_get(data, action, type); + + switch (type) { + case RTE_TABLE_ACTION_FWD: + return fwd_apply(action_data, + action_params); + + case RTE_TABLE_ACTION_LB: + return lb_apply(action_data, + action_params); + + case RTE_TABLE_ACTION_MTR: + return mtr_apply(action_data, + action_params, + &action->cfg.mtr, + action->mp, + RTE_DIM(action->mp)); + + case RTE_TABLE_ACTION_TM: + return tm_apply(action_data, + action_params, + &action->cfg.tm); + + case RTE_TABLE_ACTION_ENCAP: + return encap_apply(action_data, + action_params, + &action->cfg.encap, + &action->cfg.common); + + case RTE_TABLE_ACTION_NAT: + return nat_apply(action_data, + action_params, + &action->cfg.common); + + case RTE_TABLE_ACTION_TTL: + return ttl_apply(action_data, + action_params); + + case RTE_TABLE_ACTION_STATS: + return stats_apply(action_data, + action_params); + + case RTE_TABLE_ACTION_TIME: + return time_apply(action_data, + action_params); + + 
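/*
 * A hedged usage sketch (not from the upstream sources) of
 * rte_table_action_apply(): before a rule is added to the pipeline table,
 * the application fills the rule's action data once per registered action.
 * The assumptions here are that 'entry' points to a rule-data buffer at
 * least as large as reported via rte_table_action_table_params_get(), that
 * 'act' was created from a profile with FWD and STATS registered, and that
 * output port 0 is just an example value.
 */
#include <rte_pipeline.h>
#include <rte_table_action.h>

static int
demo_fill_rule_data(struct rte_table_action *act,
	struct rte_pipeline_table_entry *entry)
{
	struct rte_table_action_fwd_params fwd = {
		.action = RTE_PIPELINE_ACTION_PORT,
		.id = 0, /* output port ID; application specific */
	};
	struct rte_table_action_stats_params stats = {
		.n_packets = 0,
		.n_bytes = 0,
	};
	int status;

	status = rte_table_action_apply(act, entry,
		RTE_TABLE_ACTION_FWD, &fwd);
	if (status)
		return status;

	return rte_table_action_apply(act, entry,
		RTE_TABLE_ACTION_STATS, &stats);
}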
default: + return -EINVAL; + } +} + +int +rte_table_action_dscp_table_update(struct rte_table_action *action, + uint64_t dscp_mask, + struct rte_table_action_dscp_table *table) +{ + uint32_t i; + + /* Check input arguments */ + if ((action == NULL) || + ((action->cfg.action_mask & ((1LLU << RTE_TABLE_ACTION_MTR) | + (1LLU << RTE_TABLE_ACTION_TM))) == 0) || + (dscp_mask == 0) || + (table == NULL)) + return -EINVAL; + + for (i = 0; i < RTE_DIM(table->entry); i++) { + struct dscp_table_entry_data *data = + &action->dscp_table.entry[i]; + struct rte_table_action_dscp_table_entry *entry = + &table->entry[i]; + uint16_t queue_tc_color = + MBUF_SCHED_QUEUE_TC_COLOR(entry->tc_queue_id, + entry->tc_id, + entry->color); + + if ((dscp_mask & (1LLU << i)) == 0) + continue; + + data->color = entry->color; + data->tc = entry->tc_id; + data->queue_tc_color = queue_tc_color; + } + + return 0; +} + +int +rte_table_action_meter_profile_add(struct rte_table_action *action, + uint32_t meter_profile_id, + struct rte_table_action_meter_profile *profile) +{ + struct meter_profile_data *mp_data; + uint32_t status; + + /* Check input arguments */ + if ((action == NULL) || + ((action->cfg.action_mask & (1LLU << RTE_TABLE_ACTION_MTR)) == 0) || + (profile == NULL)) + return -EINVAL; + + if (profile->alg != RTE_TABLE_ACTION_METER_TRTCM) + return -ENOTSUP; + + mp_data = meter_profile_data_find(action->mp, + RTE_DIM(action->mp), + meter_profile_id); + if (mp_data) + return -EEXIST; + + mp_data = meter_profile_data_find_unused(action->mp, + RTE_DIM(action->mp)); + if (!mp_data) + return -ENOSPC; + + /* Install new profile */ + status = rte_meter_trtcm_profile_config(&mp_data->profile, + &profile->trtcm); + if (status) + return status; + + mp_data->profile_id = meter_profile_id; + mp_data->valid = 1; + + return 0; +} + +int +rte_table_action_meter_profile_delete(struct rte_table_action *action, + uint32_t meter_profile_id) +{ + struct meter_profile_data *mp_data; + + /* Check input arguments */ + if ((action == NULL) || + ((action->cfg.action_mask & (1LLU << RTE_TABLE_ACTION_MTR)) == 0)) + return -EINVAL; + + mp_data = meter_profile_data_find(action->mp, + RTE_DIM(action->mp), + meter_profile_id); + if (!mp_data) + return 0; + + /* Uninstall profile */ + mp_data->valid = 0; + + return 0; +} + +int +rte_table_action_meter_read(struct rte_table_action *action, + void *data, + uint32_t tc_mask, + struct rte_table_action_mtr_counters *stats, + int clear) +{ + struct mtr_trtcm_data *mtr_data; + uint32_t i; + + /* Check input arguments */ + if ((action == NULL) || + ((action->cfg.action_mask & (1LLU << RTE_TABLE_ACTION_MTR)) == 0) || + (data == NULL) || + (tc_mask > RTE_LEN2MASK(action->cfg.mtr.n_tc, uint32_t))) + return -EINVAL; + + mtr_data = action_data_get(data, action, RTE_TABLE_ACTION_MTR); + + /* Read */ + if (stats) { + for (i = 0; i < RTE_TABLE_ACTION_TC_MAX; i++) { + struct rte_table_action_mtr_counters_tc *dst = + &stats->stats[i]; + struct mtr_trtcm_data *src = &mtr_data[i]; + + if ((tc_mask & (1 << i)) == 0) + continue; + + dst->n_packets[e_RTE_METER_GREEN] = + mtr_trtcm_data_stats_get(src, e_RTE_METER_GREEN); + + dst->n_packets[e_RTE_METER_YELLOW] = + mtr_trtcm_data_stats_get(src, e_RTE_METER_YELLOW); + + dst->n_packets[e_RTE_METER_RED] = + mtr_trtcm_data_stats_get(src, e_RTE_METER_RED); + + dst->n_packets_valid = 1; + dst->n_bytes_valid = 0; + } + + stats->tc_mask = tc_mask; + } + + /* Clear */ + if (clear) + for (i = 0; i < RTE_TABLE_ACTION_TC_MAX; i++) { + struct mtr_trtcm_data *src = &mtr_data[i]; + + if 
((tc_mask & (1 << i)) == 0) + continue; + + mtr_trtcm_data_stats_reset(src, e_RTE_METER_GREEN); + mtr_trtcm_data_stats_reset(src, e_RTE_METER_YELLOW); + mtr_trtcm_data_stats_reset(src, e_RTE_METER_RED); + } + + + return 0; +} + +int +rte_table_action_ttl_read(struct rte_table_action *action, + void *data, + struct rte_table_action_ttl_counters *stats, + int clear) +{ + struct ttl_data *ttl_data; + + /* Check input arguments */ + if ((action == NULL) || + ((action->cfg.action_mask & + (1LLU << RTE_TABLE_ACTION_TTL)) == 0) || + (data == NULL)) + return -EINVAL; + + ttl_data = action_data_get(data, action, RTE_TABLE_ACTION_TTL); + + /* Read */ + if (stats) + stats->n_packets = TTL_STATS_READ(ttl_data); + + /* Clear */ + if (clear) + TTL_STATS_RESET(ttl_data); + + return 0; +} + +int +rte_table_action_stats_read(struct rte_table_action *action, + void *data, + struct rte_table_action_stats_counters *stats, + int clear) +{ + struct stats_data *stats_data; + + /* Check input arguments */ + if ((action == NULL) || + ((action->cfg.action_mask & + (1LLU << RTE_TABLE_ACTION_STATS)) == 0) || + (data == NULL)) + return -EINVAL; + + stats_data = action_data_get(data, action, + RTE_TABLE_ACTION_STATS); + + /* Read */ + if (stats) { + stats->n_packets = stats_data->n_packets; + stats->n_bytes = stats_data->n_bytes; + stats->n_packets_valid = 1; + stats->n_bytes_valid = 1; + } + + /* Clear */ + if (clear) { + stats_data->n_packets = 0; + stats_data->n_bytes = 0; + } + + return 0; +} + +int +rte_table_action_time_read(struct rte_table_action *action, + void *data, + uint64_t *timestamp) +{ + struct time_data *time_data; + + /* Check input arguments */ + if ((action == NULL) || + ((action->cfg.action_mask & + (1LLU << RTE_TABLE_ACTION_TIME)) == 0) || + (data == NULL) || + (timestamp == NULL)) + return -EINVAL; + + time_data = action_data_get(data, action, RTE_TABLE_ACTION_TIME); + + /* Read */ + *timestamp = time_data->time; + + return 0; +} + +static __rte_always_inline uint64_t +pkt_work(struct rte_mbuf *mbuf, + struct rte_pipeline_table_entry *table_entry, + uint64_t time, + struct rte_table_action *action, + struct ap_config *cfg) +{ + uint64_t drop_mask = 0; + + uint32_t ip_offset = action->cfg.common.ip_offset; + void *ip = RTE_MBUF_METADATA_UINT32_PTR(mbuf, ip_offset); + + uint32_t dscp; + uint16_t total_length; + + if (cfg->common.ip_version) { + struct ipv4_hdr *hdr = ip; + + dscp = hdr->type_of_service >> 2; + total_length = rte_ntohs(hdr->total_length); + } else { + struct ipv6_hdr *hdr = ip; + + dscp = (rte_ntohl(hdr->vtc_flow) & 0x0F600000) >> 18; + total_length = + rte_ntohs(hdr->payload_len) + sizeof(struct ipv6_hdr); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_LB)) { + void *data = + action_data_get(table_entry, action, RTE_TABLE_ACTION_LB); + + pkt_work_lb(mbuf, + data, + &cfg->lb); + } + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_MTR)) { + void *data = + action_data_get(table_entry, action, RTE_TABLE_ACTION_MTR); + + drop_mask |= pkt_work_mtr(mbuf, + data, + &action->dscp_table, + action->mp, + time, + dscp, + total_length); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TM)) { + void *data = + action_data_get(table_entry, action, RTE_TABLE_ACTION_TM); + + pkt_work_tm(mbuf, + data, + &action->dscp_table, + dscp); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_ENCAP)) { + void *data = + action_data_get(table_entry, action, RTE_TABLE_ACTION_ENCAP); + + pkt_work_encap(mbuf, + data, + &cfg->encap, + ip, + total_length, + ip_offset); + } + + if 
(cfg->action_mask & (1LLU << RTE_TABLE_ACTION_NAT)) { + void *data = + action_data_get(table_entry, action, RTE_TABLE_ACTION_NAT); + + if (cfg->common.ip_version) + pkt_ipv4_work_nat(ip, data, &cfg->nat); + else + pkt_ipv6_work_nat(ip, data, &cfg->nat); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TTL)) { + void *data = + action_data_get(table_entry, action, RTE_TABLE_ACTION_TTL); + + if (cfg->common.ip_version) + drop_mask |= pkt_ipv4_work_ttl(ip, data); + else + drop_mask |= pkt_ipv6_work_ttl(ip, data); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_STATS)) { + void *data = + action_data_get(table_entry, action, RTE_TABLE_ACTION_STATS); + + pkt_work_stats(data, total_length); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TIME)) { + void *data = + action_data_get(table_entry, action, RTE_TABLE_ACTION_TIME); + + pkt_work_time(data, time); + } + + return drop_mask; +} + +static __rte_always_inline uint64_t +pkt4_work(struct rte_mbuf **mbufs, + struct rte_pipeline_table_entry **table_entries, + uint64_t time, + struct rte_table_action *action, + struct ap_config *cfg) +{ + uint64_t drop_mask0 = 0; + uint64_t drop_mask1 = 0; + uint64_t drop_mask2 = 0; + uint64_t drop_mask3 = 0; + + struct rte_mbuf *mbuf0 = mbufs[0]; + struct rte_mbuf *mbuf1 = mbufs[1]; + struct rte_mbuf *mbuf2 = mbufs[2]; + struct rte_mbuf *mbuf3 = mbufs[3]; + + struct rte_pipeline_table_entry *table_entry0 = table_entries[0]; + struct rte_pipeline_table_entry *table_entry1 = table_entries[1]; + struct rte_pipeline_table_entry *table_entry2 = table_entries[2]; + struct rte_pipeline_table_entry *table_entry3 = table_entries[3]; + + uint32_t ip_offset = action->cfg.common.ip_offset; + void *ip0 = RTE_MBUF_METADATA_UINT32_PTR(mbuf0, ip_offset); + void *ip1 = RTE_MBUF_METADATA_UINT32_PTR(mbuf1, ip_offset); + void *ip2 = RTE_MBUF_METADATA_UINT32_PTR(mbuf2, ip_offset); + void *ip3 = RTE_MBUF_METADATA_UINT32_PTR(mbuf3, ip_offset); + + uint32_t dscp0, dscp1, dscp2, dscp3; + uint16_t total_length0, total_length1, total_length2, total_length3; + + if (cfg->common.ip_version) { + struct ipv4_hdr *hdr0 = ip0; + struct ipv4_hdr *hdr1 = ip1; + struct ipv4_hdr *hdr2 = ip2; + struct ipv4_hdr *hdr3 = ip3; + + dscp0 = hdr0->type_of_service >> 2; + dscp1 = hdr1->type_of_service >> 2; + dscp2 = hdr2->type_of_service >> 2; + dscp3 = hdr3->type_of_service >> 2; + + total_length0 = rte_ntohs(hdr0->total_length); + total_length1 = rte_ntohs(hdr1->total_length); + total_length2 = rte_ntohs(hdr2->total_length); + total_length3 = rte_ntohs(hdr3->total_length); + } else { + struct ipv6_hdr *hdr0 = ip0; + struct ipv6_hdr *hdr1 = ip1; + struct ipv6_hdr *hdr2 = ip2; + struct ipv6_hdr *hdr3 = ip3; + + dscp0 = (rte_ntohl(hdr0->vtc_flow) & 0x0F600000) >> 18; + dscp1 = (rte_ntohl(hdr1->vtc_flow) & 0x0F600000) >> 18; + dscp2 = (rte_ntohl(hdr2->vtc_flow) & 0x0F600000) >> 18; + dscp3 = (rte_ntohl(hdr3->vtc_flow) & 0x0F600000) >> 18; + + total_length0 = + rte_ntohs(hdr0->payload_len) + sizeof(struct ipv6_hdr); + total_length1 = + rte_ntohs(hdr1->payload_len) + sizeof(struct ipv6_hdr); + total_length2 = + rte_ntohs(hdr2->payload_len) + sizeof(struct ipv6_hdr); + total_length3 = + rte_ntohs(hdr3->payload_len) + sizeof(struct ipv6_hdr); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_LB)) { + void *data0 = + action_data_get(table_entry0, action, RTE_TABLE_ACTION_LB); + void *data1 = + action_data_get(table_entry1, action, RTE_TABLE_ACTION_LB); + void *data2 = + action_data_get(table_entry2, action, RTE_TABLE_ACTION_LB); + 
void *data3 = + action_data_get(table_entry3, action, RTE_TABLE_ACTION_LB); + + pkt_work_lb(mbuf0, + data0, + &cfg->lb); + + pkt_work_lb(mbuf1, + data1, + &cfg->lb); + + pkt_work_lb(mbuf2, + data2, + &cfg->lb); + + pkt_work_lb(mbuf3, + data3, + &cfg->lb); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_MTR)) { + void *data0 = + action_data_get(table_entry0, action, RTE_TABLE_ACTION_MTR); + void *data1 = + action_data_get(table_entry1, action, RTE_TABLE_ACTION_MTR); + void *data2 = + action_data_get(table_entry2, action, RTE_TABLE_ACTION_MTR); + void *data3 = + action_data_get(table_entry3, action, RTE_TABLE_ACTION_MTR); + + drop_mask0 |= pkt_work_mtr(mbuf0, + data0, + &action->dscp_table, + action->mp, + time, + dscp0, + total_length0); + + drop_mask1 |= pkt_work_mtr(mbuf1, + data1, + &action->dscp_table, + action->mp, + time, + dscp1, + total_length1); + + drop_mask2 |= pkt_work_mtr(mbuf2, + data2, + &action->dscp_table, + action->mp, + time, + dscp2, + total_length2); + + drop_mask3 |= pkt_work_mtr(mbuf3, + data3, + &action->dscp_table, + action->mp, + time, + dscp3, + total_length3); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TM)) { + void *data0 = + action_data_get(table_entry0, action, RTE_TABLE_ACTION_TM); + void *data1 = + action_data_get(table_entry1, action, RTE_TABLE_ACTION_TM); + void *data2 = + action_data_get(table_entry2, action, RTE_TABLE_ACTION_TM); + void *data3 = + action_data_get(table_entry3, action, RTE_TABLE_ACTION_TM); + + pkt_work_tm(mbuf0, + data0, + &action->dscp_table, + dscp0); + + pkt_work_tm(mbuf1, + data1, + &action->dscp_table, + dscp1); + + pkt_work_tm(mbuf2, + data2, + &action->dscp_table, + dscp2); + + pkt_work_tm(mbuf3, + data3, + &action->dscp_table, + dscp3); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_ENCAP)) { + void *data0 = + action_data_get(table_entry0, action, RTE_TABLE_ACTION_ENCAP); + void *data1 = + action_data_get(table_entry1, action, RTE_TABLE_ACTION_ENCAP); + void *data2 = + action_data_get(table_entry2, action, RTE_TABLE_ACTION_ENCAP); + void *data3 = + action_data_get(table_entry3, action, RTE_TABLE_ACTION_ENCAP); + + pkt_work_encap(mbuf0, + data0, + &cfg->encap, + ip0, + total_length0, + ip_offset); + + pkt_work_encap(mbuf1, + data1, + &cfg->encap, + ip1, + total_length1, + ip_offset); + + pkt_work_encap(mbuf2, + data2, + &cfg->encap, + ip2, + total_length2, + ip_offset); + + pkt_work_encap(mbuf3, + data3, + &cfg->encap, + ip3, + total_length3, + ip_offset); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_NAT)) { + void *data0 = + action_data_get(table_entry0, action, RTE_TABLE_ACTION_NAT); + void *data1 = + action_data_get(table_entry1, action, RTE_TABLE_ACTION_NAT); + void *data2 = + action_data_get(table_entry2, action, RTE_TABLE_ACTION_NAT); + void *data3 = + action_data_get(table_entry3, action, RTE_TABLE_ACTION_NAT); + + if (cfg->common.ip_version) { + pkt_ipv4_work_nat(ip0, data0, &cfg->nat); + pkt_ipv4_work_nat(ip1, data1, &cfg->nat); + pkt_ipv4_work_nat(ip2, data2, &cfg->nat); + pkt_ipv4_work_nat(ip3, data3, &cfg->nat); + } else { + pkt_ipv6_work_nat(ip0, data0, &cfg->nat); + pkt_ipv6_work_nat(ip1, data1, &cfg->nat); + pkt_ipv6_work_nat(ip2, data2, &cfg->nat); + pkt_ipv6_work_nat(ip3, data3, &cfg->nat); + } + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TTL)) { + void *data0 = + action_data_get(table_entry0, action, RTE_TABLE_ACTION_TTL); + void *data1 = + action_data_get(table_entry1, action, RTE_TABLE_ACTION_TTL); + void *data2 = + action_data_get(table_entry2, action, 
RTE_TABLE_ACTION_TTL); + void *data3 = + action_data_get(table_entry3, action, RTE_TABLE_ACTION_TTL); + + if (cfg->common.ip_version) { + drop_mask0 |= pkt_ipv4_work_ttl(ip0, data0); + drop_mask1 |= pkt_ipv4_work_ttl(ip1, data1); + drop_mask2 |= pkt_ipv4_work_ttl(ip2, data2); + drop_mask3 |= pkt_ipv4_work_ttl(ip3, data3); + } else { + drop_mask0 |= pkt_ipv6_work_ttl(ip0, data0); + drop_mask1 |= pkt_ipv6_work_ttl(ip1, data1); + drop_mask2 |= pkt_ipv6_work_ttl(ip2, data2); + drop_mask3 |= pkt_ipv6_work_ttl(ip3, data3); + } + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_STATS)) { + void *data0 = + action_data_get(table_entry0, action, RTE_TABLE_ACTION_STATS); + void *data1 = + action_data_get(table_entry1, action, RTE_TABLE_ACTION_STATS); + void *data2 = + action_data_get(table_entry2, action, RTE_TABLE_ACTION_STATS); + void *data3 = + action_data_get(table_entry3, action, RTE_TABLE_ACTION_STATS); + + pkt_work_stats(data0, total_length0); + pkt_work_stats(data1, total_length1); + pkt_work_stats(data2, total_length2); + pkt_work_stats(data3, total_length3); + } + + if (cfg->action_mask & (1LLU << RTE_TABLE_ACTION_TIME)) { + void *data0 = + action_data_get(table_entry0, action, RTE_TABLE_ACTION_TIME); + void *data1 = + action_data_get(table_entry1, action, RTE_TABLE_ACTION_TIME); + void *data2 = + action_data_get(table_entry2, action, RTE_TABLE_ACTION_TIME); + void *data3 = + action_data_get(table_entry3, action, RTE_TABLE_ACTION_TIME); + + pkt_work_time(data0, time); + pkt_work_time(data1, time); + pkt_work_time(data2, time); + pkt_work_time(data3, time); + } + + return drop_mask0 | + (drop_mask1 << 1) | + (drop_mask2 << 2) | + (drop_mask3 << 3); +} + +static __rte_always_inline int +ah(struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_pipeline_table_entry **entries, + struct rte_table_action *action, + struct ap_config *cfg) +{ + uint64_t pkts_drop_mask = 0; + uint64_t time = 0; + + if (cfg->action_mask & ((1LLU << RTE_TABLE_ACTION_MTR) | + (1LLU << RTE_TABLE_ACTION_TIME))) + time = rte_rdtsc(); + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < (n_pkts & (~0x3LLU)); i += 4) { + uint64_t drop_mask; + + drop_mask = pkt4_work(&pkts[i], + &entries[i], + time, + action, + cfg); + + pkts_drop_mask |= drop_mask << i; + } + + for ( ; i < n_pkts; i++) { + uint64_t drop_mask; + + drop_mask = pkt_work(pkts[i], + entries[i], + time, + action, + cfg); + + pkts_drop_mask |= drop_mask << i; + } + } else + for ( ; pkts_mask; ) { + uint32_t pos = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pos; + uint64_t drop_mask; + + drop_mask = pkt_work(pkts[pos], + entries[pos], + time, + action, + cfg); + + pkts_mask &= ~pkt_mask; + pkts_drop_mask |= drop_mask << pos; + } + + rte_pipeline_ah_packet_drop(p, pkts_drop_mask); + + return 0; +} + +static int +ah_default(struct rte_pipeline *p, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_pipeline_table_entry **entries, + void *arg) +{ + struct rte_table_action *action = arg; + + return ah(p, + pkts, + pkts_mask, + entries, + action, + &action->cfg); +} + +static rte_pipeline_table_action_handler_hit +ah_selector(struct rte_table_action *action) +{ + if (action->cfg.action_mask == (1LLU << RTE_TABLE_ACTION_FWD)) + return NULL; + + return ah_default; +} + +int +rte_table_action_table_params_get(struct rte_table_action *action, + struct rte_pipeline_table_params *params) +{ + rte_pipeline_table_action_handler_hit 
f_action_hit; + uint32_t total_size; + + /* Check input arguments */ + if ((action == NULL) || + (params == NULL)) + return -EINVAL; + + f_action_hit = ah_selector(action); + total_size = rte_align32pow2(action->data.total_size); + + /* Fill in params */ + params->f_action_hit = f_action_hit; + params->f_action_miss = NULL; + params->arg_ah = (f_action_hit) ? action : NULL; + params->action_data_size = total_size - + sizeof(struct rte_pipeline_table_entry); + + return 0; +} + +int +rte_table_action_free(struct rte_table_action *action) +{ + if (action == NULL) + return 0; + + rte_free(action); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_pipeline/rte_table_action.h b/src/spdk/dpdk/lib/librte_pipeline/rte_table_action.h new file mode 100644 index 00000000..c7f751aa --- /dev/null +++ b/src/spdk/dpdk/lib/librte_pipeline/rte_table_action.h @@ -0,0 +1,905 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_ACTION_H__ +#define __INCLUDE_RTE_TABLE_ACTION_H__ + +/** + * @file + * RTE Pipeline Table Actions + * + * This API provides a common set of actions for pipeline tables to speed up + * application development. + * + * Each match-action rule added to a pipeline table has associated data that + * stores the action context. This data is input to the table action handler + * called for every input packet that hits the rule as part of the table lookup + * during the pipeline execution. The pipeline library allows the user to define + * his own table actions by providing customized table action handlers (table + * lookup) and complete freedom of setting the rules and their data (table rule + * add/delete). While the user can still follow this process, this API is + * intended to provide a quicker development alternative for a set of predefined + * actions. + * + * The typical steps to use this API are: + * - Define a table action profile. This is a configuration template that can + * potentially be shared by multiple tables from the same or different + * pipelines, with different tables from the same pipeline likely to use + * different action profiles. For every table using a given action profile, + * the profile defines the set of actions and the action configuration to be + * implemented for all the table rules. API functions: + * rte_table_action_profile_create(), + * rte_table_action_profile_action_register(), + * rte_table_action_profile_freeze(). + * + * - Instantiate the table action profile to create table action objects. Each + * pipeline table has its own table action object. API functions: + * rte_table_action_create(). + * + * - Use the table action object to generate the pipeline table action handlers + * (invoked by the pipeline table lookup operation). API functions: + * rte_table_action_table_params_get(). + * + * - Use the table action object to generate the rule data (for the pipeline + * table rule add operation) based on given action parameters. API functions: + * rte_table_action_apply(). + * + * - Use the table action object to read action data (e.g. stats counters) for + * any given rule. API functions: rte_table_action_XYZ_read(). + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include +#include + +#include "rte_pipeline.h" + +/** Table actions. */ +enum rte_table_action_type { + /** Forward to next pipeline table, output port or drop. 
*/ + RTE_TABLE_ACTION_FWD = 0, + + /** Load balance. */ + RTE_TABLE_ACTION_LB, + + /** Traffic Metering and Policing. */ + RTE_TABLE_ACTION_MTR, + + /** Traffic Management. */ + RTE_TABLE_ACTION_TM, + + /** Packet encapsulations. */ + RTE_TABLE_ACTION_ENCAP, + + /** Network Address Translation (NAT). */ + RTE_TABLE_ACTION_NAT, + + /** Time to Live (TTL) update. */ + RTE_TABLE_ACTION_TTL, + + /** Statistics. */ + RTE_TABLE_ACTION_STATS, + + /** Timestamp. */ + RTE_TABLE_ACTION_TIME, +}; + +/** Common action configuration (per table action profile). */ +struct rte_table_action_common_config { + /** Input packet Internet Protocol (IP) version. Non-zero for IPv4, zero + * for IPv6. + */ + int ip_version; + + /** IP header offset within the input packet buffer. Offset 0 points to + * the first byte of the MBUF structure. + */ + uint32_t ip_offset; +}; + +/** + * RTE_TABLE_ACTION_FWD + */ +/** Forward action parameters (per table rule). */ +struct rte_table_action_fwd_params { + /** Forward action. */ + enum rte_pipeline_action action; + + /** Pipeline table ID or output port ID. */ + uint32_t id; +}; + +/** + * RTE_TABLE_ACTION_LB + */ +/** Load balance key size min (number of bytes). */ +#define RTE_TABLE_ACTION_LB_KEY_SIZE_MIN 8 + +/** Load balance key size max (number of bytes). */ +#define RTE_TABLE_ACTION_LB_KEY_SIZE_MAX 64 + +/** Load balance table size. */ +#define RTE_TABLE_ACTION_LB_TABLE_SIZE 8 + +/** Load balance action configuration (per table action profile). */ +struct rte_table_action_lb_config { + /** Key size (number of bytes). */ + uint32_t key_size; + + /** Key offset within the input packet buffer. Offset 0 points to the + * first byte of the MBUF structure. + */ + uint32_t key_offset; + + /** Key mask (*key_size* bytes are valid). */ + uint8_t key_mask[RTE_TABLE_ACTION_LB_KEY_SIZE_MAX]; + + /** Hash function. */ + rte_table_hash_op_hash f_hash; + + /** Seed value for *f_hash*. */ + uint64_t seed; + + /** Output value offset within the input packet buffer. Offset 0 points + * to the first byte of the MBUF structure. + */ + uint32_t out_offset; +}; + +/** Load balance action parameters (per table rule). */ +struct rte_table_action_lb_params { + /** Table defining the output values and their weights. The weights are + * set in 1/RTE_TABLE_ACTION_LB_TABLE_SIZE increments. To assign a + * weight of N/RTE_TABLE_ACTION_LB_TABLE_SIZE to a given output value + * (0 <= N <= RTE_TABLE_ACTION_LB_TABLE_SIZE), the same output value + * needs to show up exactly N times in this table. + */ + uint32_t out[RTE_TABLE_ACTION_LB_TABLE_SIZE]; +}; + +/** + * RTE_TABLE_ACTION_MTR + */ +/** Max number of traffic classes (TCs). */ +#define RTE_TABLE_ACTION_TC_MAX 4 + +/** Max number of queues per traffic class. */ +#define RTE_TABLE_ACTION_TC_QUEUE_MAX 4 + +/** Differentiated Services Code Point (DSCP) translation table entry. */ +struct rte_table_action_dscp_table_entry { + /** Traffic class. Used by the meter or the traffic management actions. + * Has to be strictly smaller than *RTE_TABLE_ACTION_TC_MAX*. Traffic + * class 0 is the highest priority. + */ + uint32_t tc_id; + + /** Traffic class queue. Used by the traffic management action. Has to + * be strictly smaller than *RTE_TABLE_ACTION_TC_QUEUE_MAX*. + */ + uint32_t tc_queue_id; + + /** Packet color. Used by the meter action as the packet input color + * for the color aware mode of the traffic metering algorithm. + */ + enum rte_meter_color color; +}; + +/** DSCP translation table. 
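/*
 * An illustrative per-rule load balance parameter block (not from the
 * upstream sources). With RTE_TABLE_ACTION_LB_TABLE_SIZE equal to 8, output
 * weights are encoded by repetition: value 0 appears four times (weight
 * 4/8), values 1 and 2 appear twice each (weight 2/8). The output values
 * themselves are application-specific placeholders.
 */
#include <rte_table_action.h>

static const struct rte_table_action_lb_params demo_lb_rule = {
	.out = { 0, 0, 0, 0, 1, 1, 2, 2 },
};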
*/ +struct rte_table_action_dscp_table { + /** Array of DSCP table entries */ + struct rte_table_action_dscp_table_entry entry[64]; +}; + +/** Supported traffic metering algorithms. */ +enum rte_table_action_meter_algorithm { + /** Single Rate Three Color Marker (srTCM) - IETF RFC 2697. */ + RTE_TABLE_ACTION_METER_SRTCM, + + /** Two Rate Three Color Marker (trTCM) - IETF RFC 2698. */ + RTE_TABLE_ACTION_METER_TRTCM, +}; + +/** Traffic metering profile (configuration template). */ +struct rte_table_action_meter_profile { + /** Traffic metering algorithm. */ + enum rte_table_action_meter_algorithm alg; + + RTE_STD_C11 + union { + /** Only valid when *alg* is set to srTCM - IETF RFC 2697. */ + struct rte_meter_srtcm_params srtcm; + + /** Only valid when *alg* is set to trTCM - IETF RFC 2698. */ + struct rte_meter_trtcm_params trtcm; + }; +}; + +/** Policer actions. */ +enum rte_table_action_policer { + /** Recolor the packet as green. */ + RTE_TABLE_ACTION_POLICER_COLOR_GREEN = 0, + + /** Recolor the packet as yellow. */ + RTE_TABLE_ACTION_POLICER_COLOR_YELLOW, + + /** Recolor the packet as red. */ + RTE_TABLE_ACTION_POLICER_COLOR_RED, + + /** Drop the packet. */ + RTE_TABLE_ACTION_POLICER_DROP, + + /** Number of policer actions. */ + RTE_TABLE_ACTION_POLICER_MAX +}; + +/** Meter action configuration per traffic class. */ +struct rte_table_action_mtr_tc_params { + /** Meter profile ID. */ + uint32_t meter_profile_id; + + /** Policer actions. */ + enum rte_table_action_policer policer[e_RTE_METER_COLORS]; +}; + +/** Meter action statistics counters per traffic class. */ +struct rte_table_action_mtr_counters_tc { + /** Number of packets per color at the output of the traffic metering + * and before the policer actions are executed. Only valid when + * *n_packets_valid* is non-zero. + */ + uint64_t n_packets[e_RTE_METER_COLORS]; + + /** Number of packet bytes per color at the output of the traffic + * metering and before the policer actions are executed. Only valid when + * *n_bytes_valid* is non-zero. + */ + uint64_t n_bytes[e_RTE_METER_COLORS]; + + /** When non-zero, the *n_packets* field is valid. */ + int n_packets_valid; + + /** When non-zero, the *n_bytes* field is valid. */ + int n_bytes_valid; +}; + +/** Meter action configuration (per table action profile). */ +struct rte_table_action_mtr_config { + /** Meter algorithm. */ + enum rte_table_action_meter_algorithm alg; + + /** Number of traffic classes. Each traffic class has its own traffic + * meter and policer instances. Needs to be equal to either 1 or to + * *RTE_TABLE_ACTION_TC_MAX*. + */ + uint32_t n_tc; + + /** When non-zero, the *n_packets* meter stats counter is enabled, + * otherwise it is disabled. + * + * @see struct rte_table_action_mtr_counters_tc + */ + int n_packets_enabled; + + /** When non-zero, the *n_bytes* meter stats counter is enabled, + * otherwise it is disabled. + * + * @see struct rte_table_action_mtr_counters_tc + */ + int n_bytes_enabled; +}; + +/** Meter action parameters (per table rule). */ +struct rte_table_action_mtr_params { + /** Traffic meter and policer parameters for each of the *tc_mask* + * traffic classes. + */ + struct rte_table_action_mtr_tc_params mtr[RTE_TABLE_ACTION_TC_MAX]; + + /** Bit mask defining which traffic class parameters are valid in *mtr*. + * If bit N is set in *tc_mask*, then parameters for traffic class N are + * valid in *mtr*. + */ + uint32_t tc_mask; +}; + +/** Meter action statistics counters (per table rule). 
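/*
 * A hedged sketch (not from the upstream sources) of a trTCM meter setup:
 * register the meter action on a (not yet frozen) profile, then, once a
 * table action object 'act' has been created from it, install one meter
 * profile. The rates, burst sizes and profile ID 0 are made-up values; note
 * that rte_table_action_meter_profile_add() in the .c file above accepts
 * RTE_TABLE_ACTION_METER_TRTCM profiles only.
 */
#include <rte_meter.h>
#include <rte_table_action.h>

static int
demo_meter_setup(struct rte_table_action_profile *profile,
	struct rte_table_action *act)
{
	struct rte_table_action_mtr_config mtr_cfg = {
		.alg = RTE_TABLE_ACTION_METER_TRTCM,
		.n_tc = 1,
		.n_packets_enabled = 1,
		.n_bytes_enabled = 0,
	};
	struct rte_table_action_meter_profile mp = {
		.alg = RTE_TABLE_ACTION_METER_TRTCM,
		.trtcm = {
			.cir = 1250000, /* committed rate, bytes/sec */
			.pir = 2500000, /* peak rate, bytes/sec */
			.cbs = 2048,    /* committed burst size, bytes */
			.pbs = 2048,    /* peak burst size, bytes */
		},
	};
	int status;

	status = rte_table_action_profile_action_register(profile,
		RTE_TABLE_ACTION_MTR, &mtr_cfg);
	if (status)
		return status;

	/* ... after rte_table_action_profile_freeze() and
	 * rte_table_action_create() have produced 'act':
	 */
	return rte_table_action_meter_profile_add(act, 0, &mp);
}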
*/ +struct rte_table_action_mtr_counters { + /** Stats counters for each of the *tc_mask* traffic classes. */ + struct rte_table_action_mtr_counters_tc stats[RTE_TABLE_ACTION_TC_MAX]; + + /** Bit mask defining which traffic class parameters are valid in *mtr*. + * If bit N is set in *tc_mask*, then parameters for traffic class N are + * valid in *mtr*. + */ + uint32_t tc_mask; +}; + +/** + * RTE_TABLE_ACTION_TM + */ +/** Traffic management action configuration (per table action profile). */ +struct rte_table_action_tm_config { + /** Number of subports per port. */ + uint32_t n_subports_per_port; + + /** Number of pipes per subport. */ + uint32_t n_pipes_per_subport; +}; + +/** Traffic management action parameters (per table rule). */ +struct rte_table_action_tm_params { + /** Subport ID. */ + uint32_t subport_id; + + /** Pipe ID. */ + uint32_t pipe_id; +}; + +/** + * RTE_TABLE_ACTION_ENCAP + */ +/** Supported packet encapsulation types. */ +enum rte_table_action_encap_type { + /** IP -> { Ether | IP } */ + RTE_TABLE_ACTION_ENCAP_ETHER = 0, + + /** IP -> { Ether | VLAN | IP } */ + RTE_TABLE_ACTION_ENCAP_VLAN, + + /** IP -> { Ether | S-VLAN | C-VLAN | IP } */ + RTE_TABLE_ACTION_ENCAP_QINQ, + + /** IP -> { Ether | MPLS | IP } */ + RTE_TABLE_ACTION_ENCAP_MPLS, + + /** IP -> { Ether | PPPoE | PPP | IP } */ + RTE_TABLE_ACTION_ENCAP_PPPOE, +}; + +/** Pre-computed Ethernet header fields for encapsulation action. */ +struct rte_table_action_ether_hdr { + struct ether_addr da; /**< Destination address. */ + struct ether_addr sa; /**< Source address. */ +}; + +/** Pre-computed VLAN header fields for encapsulation action. */ +struct rte_table_action_vlan_hdr { + uint8_t pcp; /**< Priority Code Point (PCP). */ + uint8_t dei; /**< Drop Eligibility Indicator (DEI). */ + uint16_t vid; /**< VLAN Identifier (VID). */ +}; + +/** Pre-computed MPLS header fields for encapsulation action. */ +struct rte_table_action_mpls_hdr { + uint32_t label; /**< Label. */ + uint8_t tc; /**< Traffic Class (TC). */ + uint8_t ttl; /**< Time to Live (TTL). */ +}; + +/** Pre-computed PPPoE header fields for encapsulation action. */ +struct rte_table_action_pppoe_hdr { + uint16_t session_id; /**< Session ID. */ +}; + +/** Ether encap parameters. */ +struct rte_table_action_encap_ether_params { + struct rte_table_action_ether_hdr ether; /**< Ethernet header. */ +}; + +/** VLAN encap parameters. */ +struct rte_table_action_encap_vlan_params { + struct rte_table_action_ether_hdr ether; /**< Ethernet header. */ + struct rte_table_action_vlan_hdr vlan; /**< VLAN header. */ +}; + +/** QinQ encap parameters. */ +struct rte_table_action_encap_qinq_params { + struct rte_table_action_ether_hdr ether; /**< Ethernet header. */ + struct rte_table_action_vlan_hdr svlan; /**< Service VLAN header. */ + struct rte_table_action_vlan_hdr cvlan; /**< Customer VLAN header. */ +}; + +/** Max number of MPLS labels per output packet for MPLS encapsulation. */ +#ifndef RTE_TABLE_ACTION_MPLS_LABELS_MAX +#define RTE_TABLE_ACTION_MPLS_LABELS_MAX 4 +#endif + +/** MPLS encap parameters. */ +struct rte_table_action_encap_mpls_params { + /** Ethernet header. */ + struct rte_table_action_ether_hdr ether; + + /** MPLS header. */ + struct rte_table_action_mpls_hdr mpls[RTE_TABLE_ACTION_MPLS_LABELS_MAX]; + + /** Number of MPLS labels in MPLS header. */ + uint32_t mpls_count; + + /** Non-zero for MPLS unicast, zero for MPLS multicast. */ + int unicast; +}; + +/** PPPoE encap parameters. 
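/*
 * An illustrative VLAN encapsulation setup (not from the upstream sources):
 * the per-profile configuration enables only the VLAN encapsulation, and
 * the per-rule parameters carry the pre-computed Ethernet and VLAN header
 * fields. The MAC addresses and VLAN ID 100 are placeholder values.
 */
#include <rte_ether.h>
#include <rte_table_action.h>

static const struct rte_table_action_encap_config demo_encap_cfg = {
	.encap_mask = 1LLU << RTE_TABLE_ACTION_ENCAP_VLAN,
};

static const struct rte_table_action_encap_params demo_encap_vlan_rule = {
	.type = RTE_TABLE_ACTION_ENCAP_VLAN,
	.vlan = {
		.ether = {
			.da = { .addr_bytes = {0x02, 0, 0, 0, 0, 0x01} },
			.sa = { .addr_bytes = {0x02, 0, 0, 0, 0, 0x02} },
		},
		.vlan = {
			.pcp = 0,
			.dei = 0,
			.vid = 100,
		},
	},
};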
*/ +struct rte_table_action_encap_pppoe_params { + struct rte_table_action_ether_hdr ether; /**< Ethernet header. */ + struct rte_table_action_pppoe_hdr pppoe; /**< PPPoE/PPP headers. */ +}; + +/** Encap action configuration (per table action profile). */ +struct rte_table_action_encap_config { + /** Bit mask defining the set of packet encapsulations enabled for the + * current table action profile. If bit (1 << N) is set in *encap_mask*, + * then packet encapsulation N is enabled, otherwise it is disabled. + * + * @see enum rte_table_action_encap_type + */ + uint64_t encap_mask; +}; + +/** Encap action parameters (per table rule). */ +struct rte_table_action_encap_params { + /** Encapsulation type. */ + enum rte_table_action_encap_type type; + + RTE_STD_C11 + union { + /** Only valid when *type* is set to Ether. */ + struct rte_table_action_encap_ether_params ether; + + /** Only valid when *type* is set to VLAN. */ + struct rte_table_action_encap_vlan_params vlan; + + /** Only valid when *type* is set to QinQ. */ + struct rte_table_action_encap_qinq_params qinq; + + /** Only valid when *type* is set to MPLS. */ + struct rte_table_action_encap_mpls_params mpls; + + /** Only valid when *type* is set to PPPoE. */ + struct rte_table_action_encap_pppoe_params pppoe; + }; +}; + +/** + * RTE_TABLE_ACTION_NAT + */ +/** NAT action configuration (per table action profile). */ +struct rte_table_action_nat_config { + /** When non-zero, the IP source address and L4 protocol source port are + * translated. When zero, the IP destination address and L4 protocol + * destination port are translated. + */ + int source_nat; + + /** Layer 4 protocol, for example TCP (0x06) or UDP (0x11). The checksum + * field is computed differently and placed at different header offset + * by each layer 4 protocol. + */ + uint8_t proto; +}; + +/** NAT action parameters (per table rule). */ +struct rte_table_action_nat_params { + /** IP version for *addr*: non-zero for IPv4, zero for IPv6. */ + int ip_version; + + /** IP address. */ + union { + /** IPv4 address; only valid when *ip_version* is non-zero. */ + uint32_t ipv4; + + /** IPv6 address; only valid when *ip_version* is set to 0. */ + uint8_t ipv6[16]; + } addr; + + /** Port. */ + uint16_t port; +}; + +/** + * RTE_TABLE_ACTION_TTL + */ +/** TTL action configuration (per table action profile). */ +struct rte_table_action_ttl_config { + /** When non-zero, the input packets whose updated IPv4 Time to Live + * (TTL) field or IPv6 Hop Limit (HL) field is zero are dropped. + * When zero, the input packets whose updated IPv4 TTL field or IPv6 HL + * field is zero are forwarded as usual (typically for debugging + * purpose). + */ + int drop; + + /** When non-zero, the *n_packets* stats counter for TTL action is + * enabled, otherwise disabled. + * + * @see struct rte_table_action_ttl_counters + */ + int n_packets_enabled; +}; + +/** TTL action parameters (per table rule). */ +struct rte_table_action_ttl_params { + /** When non-zero, decrement the IPv4 TTL field and update the checksum + * field, or decrement the IPv6 HL field. When zero, the IPv4 TTL field + * or the IPv6 HL field is not changed. + */ + int decrement; +}; + +/** TTL action statistics packets (per table rule). */ +struct rte_table_action_ttl_counters { + /** Number of IPv4 packets whose updated TTL field is zero or IPv6 + * packets whose updated HL field is zero. + */ + uint64_t n_packets; +}; + +/** + * RTE_TABLE_ACTION_STATS + */ +/** Stats action configuration (per table action profile). 
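/*
 * An illustrative source NAT setup for TCP (not from the upstream sources):
 * the per-profile configuration selects which address/port pair is
 * rewritten and for which L4 protocol, while the per-rule parameters carry
 * the translated address and port. 10.0.0.1 and port 5000 are placeholders.
 */
#include <rte_table_action.h>

static const struct rte_table_action_nat_config demo_nat_cfg = {
	.source_nat = 1,
	.proto = 0x06, /* TCP; nat_cfg_check() accepts 0x06 or 0x11 only */
};

static const struct rte_table_action_nat_params demo_nat_rule = {
	.ip_version = 1, /* IPv4 */
	.addr.ipv4 = 0x0A000001, /* 10.0.0.1 in host order;
				  * nat_apply() applies rte_htonl() */
	.port = 5000, /* host order; nat_apply() applies rte_htons() */
};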
*/ +struct rte_table_action_stats_config { + /** When non-zero, the *n_packets* stats counter is enabled, otherwise + * disabled. + * + * @see struct rte_table_action_stats_counters + */ + int n_packets_enabled; + + /** When non-zero, the *n_bytes* stats counter is enabled, otherwise + * disabled. + * + * @see struct rte_table_action_stats_counters + */ + int n_bytes_enabled; +}; + +/** Stats action parameters (per table rule). */ +struct rte_table_action_stats_params { + /** Initial value for the *n_packets* stats counter. Typically set to 0. + * + * @see struct rte_table_action_stats_counters + */ + uint64_t n_packets; + + /** Initial value for the *n_bytes* stats counter. Typically set to 0. + * + * @see struct rte_table_action_stats_counters + */ + uint64_t n_bytes; +}; + +/** Stats action counters (per table rule). */ +struct rte_table_action_stats_counters { + /** Number of packets. Valid only when *n_packets_valid* is non-zero. */ + uint64_t n_packets; + + /** Number of bytes. Valid only when *n_bytes_valid* is non-zero. */ + uint64_t n_bytes; + + /** When non-zero, the *n_packets* field is valid, otherwise invalid. */ + int n_packets_valid; + + /** When non-zero, the *n_bytes* field is valid, otherwise invalid. */ + int n_bytes_valid; +}; + +/** + * RTE_TABLE_ACTION_TIME + */ +/** Timestamp action parameters (per table rule). */ +struct rte_table_action_time_params { + /** Initial timestamp value. Typically set to current time. */ + uint64_t time; +}; + +/** + * Table action profile. + */ +struct rte_table_action_profile; + +/** + * Table action profile create. + * + * @param[in] common + * Common action configuration. + * @return + * Table action profile handle on success, NULL otherwise. + */ +struct rte_table_action_profile * __rte_experimental +rte_table_action_profile_create(struct rte_table_action_common_config *common); + +/** + * Table action profile free. + * + * @param[in] profile + * Table profile action handle (needs to be valid). + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_profile_free(struct rte_table_action_profile *profile); + +/** + * Table action profile action register. + * + * @param[in] profile + * Table profile action handle (needs to be valid and not in frozen state). + * @param[in] type + * Specific table action to be registered for *profile*. + * @param[in] action_config + * Configuration for the *type* action. + * If struct rte_table_action_*type*_config is defined by the Table Action + * API, it needs to point to a valid instance of this structure, otherwise it + * needs to be set to NULL. + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_profile_action_register(struct rte_table_action_profile *profile, + enum rte_table_action_type type, + void *action_config); + +/** + * Table action profile freeze. + * + * Once this function is called successfully, the given profile enters the + * frozen state with the following immediate effects: no more actions can be + * registered for this profile, so the profile can be instantiated to create + * table action objects. + * + * @param[in] profile + * Table profile action handle (needs to be valid and not in frozen state). + * @return + * Zero on success, non-zero error code otherwise. + * + * @see rte_table_action_create() + */ +int __rte_experimental +rte_table_action_profile_freeze(struct rte_table_action_profile *profile); + +/** + * Table action. 
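/*
 * A minimal sketch (not from the upstream sources) of the profile lifecycle
 * documented above: create a profile, register one or more actions, freeze
 * it, then instantiate a table action object. The ip_offset value and the
 * choice of the stats action are illustrative assumptions.
 */
#include <stddef.h>
#include <rte_mbuf.h>
#include <rte_table_action.h>

static struct rte_table_action *
demo_action_create(uint32_t socket_id)
{
	struct rte_table_action_common_config common = {
		.ip_version = 1, /* IPv4 */
		/* Offset of the IP header counted from the first byte of the
		 * mbuf structure; application specific (here: start of the
		 * headroom, i.e. assuming the IP header is the first packet
		 * byte).
		 */
		.ip_offset = sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM,
	};
	struct rte_table_action_stats_config stats = {
		.n_packets_enabled = 1,
		.n_bytes_enabled = 1,
	};
	struct rte_table_action_profile *profile;

	profile = rte_table_action_profile_create(&common);
	if (profile == NULL)
		return NULL;

	if (rte_table_action_profile_action_register(profile,
			RTE_TABLE_ACTION_STATS, &stats) ||
			rte_table_action_profile_freeze(profile)) {
		rte_table_action_profile_free(profile);
		return NULL;
	}

	/* The FWD action is enabled implicitly by the freeze operation. */
	return rte_table_action_create(profile, socket_id);
}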
+ */ +struct rte_table_action; + +/** + * Table action create. + * + * Instantiates the given table action profile to create a table action object. + * + * @param[in] profile + * Table profile action handle (needs to be valid and in frozen state). + * @param[in] socket_id + * CPU socket ID where the internal data structures required by the new table + * action object should be allocated. + * @return + * Handle to table action object on success, NULL on error. + * + * @see rte_table_action_create() + */ +struct rte_table_action * __rte_experimental +rte_table_action_create(struct rte_table_action_profile *profile, + uint32_t socket_id); + +/** + * Table action free. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_free(struct rte_table_action *action); + +/** + * Table action table params get. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[inout] params + * Pipeline table parameters (needs to be pre-allocated). + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_table_params_get(struct rte_table_action *action, + struct rte_pipeline_table_params *params); + +/** + * Table action apply. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[in] data + * Data byte array (typically table rule data) to apply action *type* on. + * @param[in] type + * Specific table action previously registered for the table action profile of + * the *action* object. + * @param[in] action_params + * Parameters for the *type* action. + * If struct rte_table_action_*type*_params is defined by the Table Action + * API, it needs to point to a valid instance of this structure, otherwise it + * needs to be set to NULL. + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_apply(struct rte_table_action *action, + void *data, + enum rte_table_action_type type, + void *action_params); + +/** + * Table action DSCP table update. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[in] dscp_mask + * 64-bit mask defining the DSCP table entries to be updated. If bit N is set + * in this bit mask, then DSCP table entry N is to be updated, otherwise not. + * @param[in] table + * DSCP table. + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_dscp_table_update(struct rte_table_action *action, + uint64_t dscp_mask, + struct rte_table_action_dscp_table *table); + +/** + * Table action meter profile add. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[in] meter_profile_id + * Meter profile ID to be used for the *profile* once it is successfully added + * to the *action* object (needs to be unused by the set of meter profiles + * currently registered for the *action* object). + * @param[in] profile + * Meter profile to be added. + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_meter_profile_add(struct rte_table_action *action, + uint32_t meter_profile_id, + struct rte_table_action_meter_profile *profile); + +/** + * Table action meter profile delete. + * + * @param[in] action + * Handle to table action object (needs to be valid). 
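/*
 * An illustrative DSCP table update (not from the upstream sources): every
 * DSCP value is mapped to traffic class 0, queue 0, color green as a
 * starting point. It assumes the MTR or TM action is registered for 'act',
 * which rte_table_action_dscp_table_update() requires.
 */
#include <stdint.h>
#include <rte_meter.h>
#include <rte_table_action.h>

static int
demo_dscp_table_init(struct rte_table_action *act)
{
	struct rte_table_action_dscp_table t;
	uint32_t i;

	for (i = 0; i < 64; i++) {
		t.entry[i].tc_id = 0;
		t.entry[i].tc_queue_id = 0;
		t.entry[i].color = e_RTE_METER_GREEN;
	}

	/* All 64 entries selected by the DSCP mask. */
	return rte_table_action_dscp_table_update(act, UINT64_MAX, &t);
}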
+ * @param[in] meter_profile_id + * Meter profile ID of the meter profile to be deleted from the *action* + * object (needs to be valid for the *action* object). + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_meter_profile_delete(struct rte_table_action *action, + uint32_t meter_profile_id); + +/** + * Table action meter read. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[in] data + * Data byte array (typically table rule data) with meter action previously + * applied on it. + * @param[in] tc_mask + * Bit mask defining which traffic classes should have the meter stats + * counters read from *data* and stored into *stats*. If bit N is set in this + * bit mask, then traffic class N is part of this operation, otherwise it is + * not. If bit N is set in this bit mask, then traffic class N must be one of + * the traffic classes that are enabled for the meter action in the table + * action profile used by the *action* object. + * @param[inout] stats + * When non-NULL, it points to the area where the meter stats counters read + * from *data* are saved. Only the meter stats counters for the *tc_mask* + * traffic classes are read and stored to *stats*. + * @param[in] clear + * When non-zero, the meter stats counters are cleared (i.e. set to zero), + * otherwise the counters are not modified. When the read operation is enabled + * (*stats* is non-NULL), the clear operation is performed after the read + * operation is completed. + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_meter_read(struct rte_table_action *action, + void *data, + uint32_t tc_mask, + struct rte_table_action_mtr_counters *stats, + int clear); + +/** + * Table action TTL read. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[in] data + * Data byte array (typically table rule data) with TTL action previously + * applied on it. + * @param[inout] stats + * When non-NULL, it points to the area where the TTL stats counters read from + * *data* are saved. + * @param[in] clear + * When non-zero, the TTL stats counters are cleared (i.e. set to zero), + * otherwise the counters are not modified. When the read operation is enabled + * (*stats* is non-NULL), the clear operation is performed after the read + * operation is completed. + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_ttl_read(struct rte_table_action *action, + void *data, + struct rte_table_action_ttl_counters *stats, + int clear); + +/** + * Table action stats read. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[in] data + * Data byte array (typically table rule data) with stats action previously + * applied on it. + * @param[inout] stats + * When non-NULL, it points to the area where the stats counters read from + * *data* are saved. + * @param[in] clear + * When non-zero, the stats counters are cleared (i.e. set to zero), otherwise + * the counters are not modified. When the read operation is enabled (*stats* + * is non-NULL), the clear operation is performed after the read operation is + * completed. + * @return + * Zero on success, non-zero error code otherwise. 
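+ *
+ * Example sketch of polling and clearing the counters of a single rule. The
+ * *rule_data* argument is assumed to be the rule data area maintained by the
+ * pipeline table for a rule that has the stats action applied; printing
+ * assumes <stdio.h> and <inttypes.h>:
+ *
+ * @code
+ * static void
+ * dump_rule_stats(struct rte_table_action *action, void *rule_data)
+ * {
+ *     struct rte_table_action_stats_counters counters;
+ *
+ *     if (rte_table_action_stats_read(action, rule_data, &counters, 1))
+ *         return;
+ *
+ *     if (counters.n_packets_valid)
+ *         printf("packets = %" PRIu64 "\n", counters.n_packets);
+ *     if (counters.n_bytes_valid)
+ *         printf("bytes = %" PRIu64 "\n", counters.n_bytes);
+ * }
+ * @endcode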
+ */ +int __rte_experimental +rte_table_action_stats_read(struct rte_table_action *action, + void *data, + struct rte_table_action_stats_counters *stats, + int clear); + +/** + * Table action timestamp read. + * + * @param[in] action + * Handle to table action object (needs to be valid). + * @param[in] data + * Data byte array (typically table rule data) with timestamp action + * previously applied on it. + * @param[inout] timestamp + * Pre-allocated memory where the timestamp read from *data* is saved (has to + * be non-NULL). + * @return + * Zero on success, non-zero error code otherwise. + */ +int __rte_experimental +rte_table_action_time_read(struct rte_table_action *action, + void *data, + uint64_t *timestamp); + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_TABLE_ACTION_H__ */ diff --git a/src/spdk/dpdk/lib/librte_port/Makefile b/src/spdk/dpdk/lib/librte_port/Makefile new file mode 100644 index 00000000..8df4864e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/Makefile @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2016 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_port.a +ifeq ($(CONFIG_RTE_PORT_PCAP),y) +LDLIBS += -lpcap +endif +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev +LDLIBS += -lrte_ip_frag -lrte_sched +ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) +LDLIBS += -lrte_kni +endif + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) + +EXPORT_MAP := rte_port_version.map + +LIBABIVER := 3 + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_ethdev.c +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_ring.c +ifeq ($(CONFIG_RTE_LIBRTE_IP_FRAG),y) +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_frag.c +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_ras.c +endif +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_sched.c +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_fd.c +ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_kni.c +endif +SRCS-$(CONFIG_RTE_LIBRTE_PORT) += rte_port_source_sink.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port.h +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_ethdev.h +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_ring.h +ifeq ($(CONFIG_RTE_LIBRTE_IP_FRAG),y) +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_frag.h +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_ras.h +endif +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_sched.h +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_fd.h +ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_kni.h +endif +SYMLINK-$(CONFIG_RTE_LIBRTE_PORT)-include += rte_port_source_sink.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_port/meson.build b/src/spdk/dpdk/lib/librte_port/meson.build new file mode 100644 index 00000000..f3d8b443 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/meson.build @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 3 +sources = files( + 'rte_port_ethdev.c', + 'rte_port_fd.c', + 'rte_port_frag.c', + 'rte_port_ras.c', + 'rte_port_ring.c', + 'rte_port_sched.c', + 'rte_port_source_sink.c') +headers = files( + 'rte_port_ethdev.h', + 'rte_port_fd.h', + 'rte_port_frag.h', + 'rte_port_ras.h', + 'rte_port.h', + 'rte_port_ring.h', + 'rte_port_sched.h', + 'rte_port_source_sink.h') +deps += ['ethdev', 'sched', 'ip_frag'] + +if dpdk_conf.has('RTE_LIBRTE_KNI') + sources += files('rte_port_kni.c') + 
headers += files('rte_port_kni.h') + deps += 'kni' +endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port.h b/src/spdk/dpdk/lib/librte_port/rte_port.h new file mode 100644 index 00000000..7f156ef4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_H__ +#define __INCLUDE_RTE_PORT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port + * + * This tool is part of the DPDK Packet Framework tool suite and provides + * a standard interface to implement different types of packet ports. + * + ***/ + +#include +#include + +/**@{ + * Macros to allow accessing metadata stored in the mbuf headroom + * just beyond the end of the mbuf data structure returned by a port + */ +#define RTE_MBUF_METADATA_UINT8_PTR(mbuf, offset) \ + (&((uint8_t *)(mbuf))[offset]) +#define RTE_MBUF_METADATA_UINT16_PTR(mbuf, offset) \ + ((uint16_t *) RTE_MBUF_METADATA_UINT8_PTR(mbuf, offset)) +#define RTE_MBUF_METADATA_UINT32_PTR(mbuf, offset) \ + ((uint32_t *) RTE_MBUF_METADATA_UINT8_PTR(mbuf, offset)) +#define RTE_MBUF_METADATA_UINT64_PTR(mbuf, offset) \ + ((uint64_t *) RTE_MBUF_METADATA_UINT8_PTR(mbuf, offset)) + +#define RTE_MBUF_METADATA_UINT8(mbuf, offset) \ + (*RTE_MBUF_METADATA_UINT8_PTR(mbuf, offset)) +#define RTE_MBUF_METADATA_UINT16(mbuf, offset) \ + (*RTE_MBUF_METADATA_UINT16_PTR(mbuf, offset)) +#define RTE_MBUF_METADATA_UINT32(mbuf, offset) \ + (*RTE_MBUF_METADATA_UINT32_PTR(mbuf, offset)) +#define RTE_MBUF_METADATA_UINT64(mbuf, offset) \ + (*RTE_MBUF_METADATA_UINT64_PTR(mbuf, offset)) +/**@}*/ + +/* + * Port IN + * + */ +/** Maximum number of packets read from any input port in a single burst. +Cannot be changed. */ +#define RTE_PORT_IN_BURST_SIZE_MAX 64 + +/** Input port statistics */ +struct rte_port_in_stats { + uint64_t n_pkts_in; + uint64_t n_pkts_drop; +}; + +/** + * Input port create + * + * @param params + * Parameters for input port creation + * @param socket_id + * CPU socket ID (e.g. for memory allocation purpose) + * @return + * Handle to input port instance + */ +typedef void* (*rte_port_in_op_create)(void *params, int socket_id); + +/** + * Input port free + * + * @param port + * Handle to input port instance + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_port_in_op_free)(void *port); + +/** + * Input port packet burst RX + * + * @param port + * Handle to input port instance + * @param pkts + * Burst of input packets + * @param n_pkts + * Number of packets in the input burst + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_port_in_op_rx)( + void *port, + struct rte_mbuf **pkts, + uint32_t n_pkts); + +/** + * Input port stats get + * + * @param port + * Handle to output port instance + * @param stats + * Handle to port_in stats struct to copy data + * @param clear + * Flag indicating that stats should be cleared after read + * + * @return + * Error code or 0 on success. 
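+ *
+ * Example sketch of driving an input port purely through this ops interface.
+ * The *ops* and *params* values are placeholders for any concrete port type
+ * (for instance the ethdev reader declared in rte_port_ethdev.h); note that
+ * the readers in this library return the number of packets received from
+ * f_rx, which is how the sketch treats the result:
+ *
+ * @code
+ * static int
+ * poll_input_port(struct rte_port_in_ops *ops, void *params, int socket_id)
+ * {
+ *     struct rte_mbuf *pkts[RTE_PORT_IN_BURST_SIZE_MAX];
+ *     struct rte_port_in_stats stats;
+ *     void *port;
+ *     int n;
+ *
+ *     port = ops->f_create(params, socket_id);
+ *     if (port == NULL)
+ *         return -1;
+ *
+ *     n = ops->f_rx(port, pkts, RTE_PORT_IN_BURST_SIZE_MAX);
+ *     while (n > 0)
+ *         rte_pktmbuf_free(pkts[--n]);
+ *
+ *     ops->f_stats(port, &stats, 1);
+ *     return ops->f_free(port);
+ * }
+ * @endcode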
+ */ +typedef int (*rte_port_in_op_stats_read)( + void *port, + struct rte_port_in_stats *stats, + int clear); + +/** Input port interface defining the input port operation */ +struct rte_port_in_ops { + rte_port_in_op_create f_create; /**< Create */ + rte_port_in_op_free f_free; /**< Free */ + rte_port_in_op_rx f_rx; /**< Packet RX (packet burst) */ + rte_port_in_op_stats_read f_stats; /**< Stats */ +}; + +/* + * Port OUT + * + */ +/** Output port statistics */ +struct rte_port_out_stats { + uint64_t n_pkts_in; + uint64_t n_pkts_drop; +}; + +/** + * Output port create + * + * @param params + * Parameters for output port creation + * @param socket_id + * CPU socket ID (e.g. for memory allocation purpose) + * @return + * Handle to output port instance + */ +typedef void* (*rte_port_out_op_create)(void *params, int socket_id); + +/** + * Output port free + * + * @param port + * Handle to output port instance + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_port_out_op_free)(void *port); + +/** + * Output port single packet TX + * + * @param port + * Handle to output port instance + * @param pkt + * Input packet + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_port_out_op_tx)( + void *port, + struct rte_mbuf *pkt); + +/** + * Output port packet burst TX + * + * @param port + * Handle to output port instance + * @param pkts + * Burst of input packets specified as array of up to 64 pointers to struct + * rte_mbuf + * @param pkts_mask + * 64-bit bitmask specifying which packets in the input burst are valid. When + * pkts_mask bit n is set, then element n of pkts array is pointing to a + * valid packet. Otherwise, element n of pkts array will not be accessed. + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_port_out_op_tx_bulk)( + void *port, + struct rte_mbuf **pkt, + uint64_t pkts_mask); + +/** + * Output port flush + * + * @param port + * Handle to output port instance + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_port_out_op_flush)(void *port); + +/** + * Output port stats read + * + * @param port + * Handle to output port instance + * @param stats + * Handle to port_out stats struct to copy data + * @param clear + * Flag indicating that stats should be cleared after read + * + * @return + * Error code or 0 on success. 
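+ *
+ * Example sketch of pushing a contiguous burst of *n* packets (n <= 64)
+ * through any output port implementation and then reading its counters.
+ * The *ops*, *port* and *pkts* values are assumed to have been set up by
+ * the caller; the mask computation marks elements 0 .. n-1 as valid, in
+ * line with the pkts_mask convention described above:
+ *
+ * @code
+ * static void
+ * send_full_burst(struct rte_port_out_ops *ops, void *port,
+ *     struct rte_mbuf **pkts, uint32_t n)
+ * {
+ *     struct rte_port_out_stats stats;
+ *     uint64_t pkts_mask = (n >= 64) ? UINT64_MAX : ((1LLU << n) - 1);
+ *
+ *     ops->f_tx_bulk(port, pkts, pkts_mask);
+ *     ops->f_flush(port);
+ *     ops->f_stats(port, &stats, 0);
+ * }
+ * @endcode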
+ */ +typedef int (*rte_port_out_op_stats_read)( + void *port, + struct rte_port_out_stats *stats, + int clear); + +/** Output port interface defining the output port operation */ +struct rte_port_out_ops { + rte_port_out_op_create f_create; /**< Create */ + rte_port_out_op_free f_free; /**< Free */ + rte_port_out_op_tx f_tx; /**< Packet TX (single packet) */ + rte_port_out_op_tx_bulk f_tx_bulk; /**< Packet TX (packet burst) */ + rte_port_out_op_flush f_flush; /**< Flush */ + rte_port_out_op_stats_read f_stats; /**< Stats */ +}; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_ethdev.c b/src/spdk/dpdk/lib/librte_port/rte_port_ethdev.c new file mode 100644 index 00000000..0da78902 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_ethdev.c @@ -0,0 +1,524 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include + +#include +#include +#include + +#include "rte_port_ethdev.h" + +/* + * Port ETHDEV Reader + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_ETHDEV_READER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_ETHDEV_READER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_ETHDEV_READER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_ETHDEV_READER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_ethdev_reader { + struct rte_port_in_stats stats; + + uint16_t queue_id; + uint16_t port_id; +}; + +static void * +rte_port_ethdev_reader_create(void *params, int socket_id) +{ + struct rte_port_ethdev_reader_params *conf = + params; + struct rte_port_ethdev_reader *port; + + /* Check input parameters */ + if (conf == NULL) { + RTE_LOG(ERR, PORT, "%s: params is NULL\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->port_id = conf->port_id; + port->queue_id = conf->queue_id; + + return port; +} + +static int +rte_port_ethdev_reader_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + struct rte_port_ethdev_reader *p = + port; + uint16_t rx_pkt_cnt; + + rx_pkt_cnt = rte_eth_rx_burst(p->port_id, p->queue_id, pkts, n_pkts); + RTE_PORT_ETHDEV_READER_STATS_PKTS_IN_ADD(p, rx_pkt_cnt); + return rx_pkt_cnt; +} + +static int +rte_port_ethdev_reader_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + return -EINVAL; + } + + rte_free(port); + + return 0; +} + +static int rte_port_ethdev_reader_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_ethdev_reader *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port ETHDEV Writer + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_ETHDEV_WRITER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_ETHDEV_WRITER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_ETHDEV_WRITER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_ETHDEV_WRITER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_ethdev_writer { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint16_t tx_buf_count; + 
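+	/*
+	 * Pre-computed mask with only bit (tx_burst_sz - 1) set; tx_bulk uses
+	 * it to recognize a contiguous burst of at least tx_burst_sz packets.
+	 */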
uint64_t bsz_mask; + uint16_t queue_id; + uint16_t port_id; +}; + +static void * +rte_port_ethdev_writer_create(void *params, int socket_id) +{ + struct rte_port_ethdev_writer_params *conf = + params; + struct rte_port_ethdev_writer *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->port_id = conf->port_id; + port->queue_id = conf->queue_id; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + return port; +} + +static inline void +send_burst(struct rte_port_ethdev_writer *p) +{ + uint32_t nb_tx; + + nb_tx = rte_eth_tx_burst(p->port_id, p->queue_id, + p->tx_buf, p->tx_buf_count); + + RTE_PORT_ETHDEV_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static int +rte_port_ethdev_writer_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_ethdev_writer *p = + port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_ETHDEV_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + return 0; +} + +static int +rte_port_ethdev_writer_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_ethdev_writer *p = + port; + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t n_pkts_ok; + + if (tx_buf_count) + send_burst(p); + + RTE_PORT_ETHDEV_WRITER_STATS_PKTS_IN_ADD(p, n_pkts); + n_pkts_ok = rte_eth_tx_burst(p->port_id, p->queue_id, pkts, + n_pkts); + + RTE_PORT_ETHDEV_WRITER_STATS_PKTS_DROP_ADD(p, n_pkts - n_pkts_ok); + for ( ; n_pkts_ok < n_pkts; n_pkts_ok++) { + struct rte_mbuf *pkt = pkts[n_pkts_ok]; + + rte_pktmbuf_free(pkt); + } + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_ETHDEV_WRITER_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst(p); + } + + return 0; +} + +static int +rte_port_ethdev_writer_flush(void *port) +{ + struct rte_port_ethdev_writer *p = + port; + + if (p->tx_buf_count > 0) + send_burst(p); + + return 0; +} + +static int +rte_port_ethdev_writer_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_ethdev_writer_flush(port); + rte_free(port); + + return 0; +} + +static int rte_port_ethdev_writer_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_ethdev_writer *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port ETHDEV Writer Nodrop + */ +#ifdef RTE_PORT_STATS_COLLECT + 
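+/*
+ * Like the reader/writer statistics macros above, these expand to real
+ * counter updates only when RTE_PORT_STATS_COLLECT is defined at build time;
+ * otherwise they compile to nothing, keeping statistics off the fast path.
+ */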
+#define RTE_PORT_ETHDEV_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_ETHDEV_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_ETHDEV_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_ETHDEV_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_ethdev_writer_nodrop { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint16_t tx_buf_count; + uint64_t bsz_mask; + uint64_t n_retries; + uint16_t queue_id; + uint16_t port_id; +}; + +static void * +rte_port_ethdev_writer_nodrop_create(void *params, int socket_id) +{ + struct rte_port_ethdev_writer_nodrop_params *conf = + params; + struct rte_port_ethdev_writer_nodrop *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->port_id = conf->port_id; + port->queue_id = conf->queue_id; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + /* + * When n_retries is 0 it means that we should wait for every packet to + * send no matter how many retries should it take. To limit number of + * branches in fast path, we use UINT64_MAX instead of branching. + */ + port->n_retries = (conf->n_retries == 0) ? 
UINT64_MAX : conf->n_retries; + + return port; +} + +static inline void +send_burst_nodrop(struct rte_port_ethdev_writer_nodrop *p) +{ + uint32_t nb_tx = 0, i; + + nb_tx = rte_eth_tx_burst(p->port_id, p->queue_id, p->tx_buf, + p->tx_buf_count); + + /* We sent all the packets in a first try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + + for (i = 0; i < p->n_retries; i++) { + nb_tx += rte_eth_tx_burst(p->port_id, p->queue_id, + p->tx_buf + nb_tx, p->tx_buf_count - nb_tx); + + /* We sent all the packets in more than one try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + } + + /* We didn't send the packets in maximum allowed attempts */ + RTE_PORT_ETHDEV_WRITER_NODROP_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static int +rte_port_ethdev_writer_nodrop_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_ethdev_writer_nodrop *p = + port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_ETHDEV_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_ethdev_writer_nodrop_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_ethdev_writer_nodrop *p = + port; + + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t n_pkts_ok; + + if (tx_buf_count) + send_burst_nodrop(p); + + RTE_PORT_ETHDEV_WRITER_NODROP_STATS_PKTS_IN_ADD(p, n_pkts); + n_pkts_ok = rte_eth_tx_burst(p->port_id, p->queue_id, pkts, + n_pkts); + + if (n_pkts_ok >= n_pkts) + return 0; + + /* + * If we did not manage to send all packets in single burst, + * move remaining packets to the buffer and call send burst. 
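+		 * send_burst_nodrop() retries the transmit up to n_retries times;
+		 * whatever the NIC still refuses after the last retry is counted
+		 * as dropped and freed, so this function never returns an error
+		 * but can still lose packets under sustained back-pressure
+		 * (unless n_retries was configured as 0, which retries
+		 * practically forever).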
+ */ + for (; n_pkts_ok < n_pkts; n_pkts_ok++) { + struct rte_mbuf *pkt = pkts[n_pkts_ok]; + p->tx_buf[p->tx_buf_count++] = pkt; + } + send_burst_nodrop(p); + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_ETHDEV_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + } + + return 0; +} + +static int +rte_port_ethdev_writer_nodrop_flush(void *port) +{ + struct rte_port_ethdev_writer_nodrop *p = + port; + + if (p->tx_buf_count > 0) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_ethdev_writer_nodrop_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_ethdev_writer_nodrop_flush(port); + rte_free(port); + + return 0; +} + +static int rte_port_ethdev_writer_nodrop_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_ethdev_writer_nodrop *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_ethdev_reader_ops = { + .f_create = rte_port_ethdev_reader_create, + .f_free = rte_port_ethdev_reader_free, + .f_rx = rte_port_ethdev_reader_rx, + .f_stats = rte_port_ethdev_reader_stats_read, +}; + +struct rte_port_out_ops rte_port_ethdev_writer_ops = { + .f_create = rte_port_ethdev_writer_create, + .f_free = rte_port_ethdev_writer_free, + .f_tx = rte_port_ethdev_writer_tx, + .f_tx_bulk = rte_port_ethdev_writer_tx_bulk, + .f_flush = rte_port_ethdev_writer_flush, + .f_stats = rte_port_ethdev_writer_stats_read, +}; + +struct rte_port_out_ops rte_port_ethdev_writer_nodrop_ops = { + .f_create = rte_port_ethdev_writer_nodrop_create, + .f_free = rte_port_ethdev_writer_nodrop_free, + .f_tx = rte_port_ethdev_writer_nodrop_tx, + .f_tx_bulk = rte_port_ethdev_writer_nodrop_tx_bulk, + .f_flush = rte_port_ethdev_writer_nodrop_flush, + .f_stats = rte_port_ethdev_writer_nodrop_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_ethdev.h b/src/spdk/dpdk/lib/librte_port/rte_port_ethdev.h new file mode 100644 index 00000000..7f28d512 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_ethdev.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_ETHDEV_H__ +#define __INCLUDE_RTE_PORT_ETHDEV_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port Ethernet Device + * + * ethdev_reader: input port built on top of pre-initialized NIC RX queue + * ethdev_writer: output port built on top of pre-initialized NIC TX queue + * + ***/ + +#include + +#include "rte_port.h" + +/** ethdev_reader port parameters */ +struct rte_port_ethdev_reader_params { + /** NIC RX port ID */ + uint16_t port_id; + + /** NIC RX queue ID */ + uint16_t queue_id; +}; + +/** ethdev_reader port operations */ +extern struct rte_port_in_ops rte_port_ethdev_reader_ops; + +/** ethdev_writer port parameters */ +struct rte_port_ethdev_writer_params { + /** NIC RX port ID */ + uint16_t port_id; + + /** NIC RX queue ID */ + uint16_t queue_id; + + /** Recommended burst size to NIC TX queue. The actual burst size can be + bigger or smaller than this value. 
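+	The value must be non-zero, a power of 2 and no larger than
+	RTE_PORT_IN_BURST_SIZE_MAX, otherwise port creation fails. A minimal
+	creation sketch (the port/queue IDs are placeholders and SOCKET_ID_ANY
+	is used for brevity):
+
+	@code
+	struct rte_port_ethdev_writer_params wp = {
+		.port_id = 0,
+		.queue_id = 0,
+		.tx_burst_sz = 32,
+	};
+	void *out_port = rte_port_ethdev_writer_ops.f_create(&wp, SOCKET_ID_ANY);
+	@endcode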
*/ + uint32_t tx_burst_sz; +}; + +/** ethdev_writer port operations */ +extern struct rte_port_out_ops rte_port_ethdev_writer_ops; + +/** ethdev_writer_nodrop port parameters */ +struct rte_port_ethdev_writer_nodrop_params { + /** NIC RX port ID */ + uint16_t port_id; + + /** NIC RX queue ID */ + uint16_t queue_id; + + /** Recommended burst size to NIC TX queue. The actual burst size can be + bigger or smaller than this value. */ + uint32_t tx_burst_sz; + + /** Maximum number of retries, 0 for no limit */ + uint32_t n_retries; +}; + +/** ethdev_writer_nodrop port operations */ +extern struct rte_port_out_ops rte_port_ethdev_writer_nodrop_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_fd.c b/src/spdk/dpdk/lib/librte_port/rte_port_fd.c new file mode 100644 index 00000000..932ecd32 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_fd.c @@ -0,0 +1,518 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Intel Corporation + */ +#include +#include +#include + +#include +#include + +#include "rte_port_fd.h" + +/* + * Port FD Reader + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_FD_READER_STATS_PKTS_IN_ADD(port, val) \ + do { port->stats.n_pkts_in += val; } while (0) +#define RTE_PORT_FD_READER_STATS_PKTS_DROP_ADD(port, val) \ + do { port->stats.n_pkts_drop += val; } while (0) + +#else + +#define RTE_PORT_FD_READER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_FD_READER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_fd_reader { + struct rte_port_in_stats stats; + int fd; + uint32_t mtu; + struct rte_mempool *mempool; +}; + +static void * +rte_port_fd_reader_create(void *params, int socket_id) +{ + struct rte_port_fd_reader_params *conf = + params; + struct rte_port_fd_reader *port; + + /* Check input parameters */ + if (conf == NULL) { + RTE_LOG(ERR, PORT, "%s: params is NULL\n", __func__); + return NULL; + } + if (conf->fd < 0) { + RTE_LOG(ERR, PORT, "%s: Invalid file descriptor\n", __func__); + return NULL; + } + if (conf->mtu == 0) { + RTE_LOG(ERR, PORT, "%s: Invalid MTU\n", __func__); + return NULL; + } + if (conf->mempool == NULL) { + RTE_LOG(ERR, PORT, "%s: Invalid mempool\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->fd = conf->fd; + port->mtu = conf->mtu; + port->mempool = conf->mempool; + + return port; +} + +static int +rte_port_fd_reader_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + struct rte_port_fd_reader *p = port; + uint32_t i, j; + + if (rte_pktmbuf_alloc_bulk(p->mempool, pkts, n_pkts) != 0) + return 0; + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = pkts[i]; + void *pkt_data = rte_pktmbuf_mtod(pkt, void *); + ssize_t n_bytes; + + n_bytes = read(p->fd, pkt_data, (size_t) p->mtu); + if (n_bytes <= 0) + break; + + pkt->data_len = n_bytes; + pkt->pkt_len = n_bytes; + } + + for (j = i; j < n_pkts; j++) + rte_pktmbuf_free(pkts[j]); + + RTE_PORT_FD_READER_STATS_PKTS_IN_ADD(p, i); + + return i; +} + +static int +rte_port_fd_reader_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + return -EINVAL; + } + + rte_free(port); + + return 0; +} + +static int rte_port_fd_reader_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_fd_reader *p = + port; + 
+ if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port FD Writer + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_FD_WRITER_STATS_PKTS_IN_ADD(port, val) \ + do { port->stats.n_pkts_in += val; } while (0) +#define RTE_PORT_FD_WRITER_STATS_PKTS_DROP_ADD(port, val) \ + do { port->stats.n_pkts_drop += val; } while (0) + +#else + +#define RTE_PORT_FD_WRITER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_FD_WRITER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_fd_writer { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint16_t tx_buf_count; + uint32_t fd; +}; + +static void * +rte_port_fd_writer_create(void *params, int socket_id) +{ + struct rte_port_fd_writer_params *conf = + params; + struct rte_port_fd_writer *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->fd = conf->fd; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + + return port; +} + +static inline void +send_burst(struct rte_port_fd_writer *p) +{ + uint32_t i; + + for (i = 0; i < p->tx_buf_count; i++) { + struct rte_mbuf *pkt = p->tx_buf[i]; + void *pkt_data = rte_pktmbuf_mtod(pkt, void*); + size_t n_bytes = rte_pktmbuf_data_len(pkt); + ssize_t ret; + + ret = write(p->fd, pkt_data, n_bytes); + if (ret < 0) + break; + } + + RTE_PORT_FD_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - i); + + for (i = 0; i < p->tx_buf_count; i++) + rte_pktmbuf_free(p->tx_buf[i]); + + p->tx_buf_count = 0; +} + +static int +rte_port_fd_writer_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_fd_writer *p = + port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_FD_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + return 0; +} + +static int +rte_port_fd_writer_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_fd_writer *p = + port; + uint32_t tx_buf_count = p->tx_buf_count; + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < n_pkts; i++) + p->tx_buf[tx_buf_count++] = pkts[i]; + RTE_PORT_FD_WRITER_STATS_PKTS_IN_ADD(p, n_pkts); + } else + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_FD_WRITER_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + return 0; +} + +static int +rte_port_fd_writer_flush(void *port) +{ + struct rte_port_fd_writer *p = + port; + + if (p->tx_buf_count > 0) + send_burst(p); + + return 0; +} + +static int +rte_port_fd_writer_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_fd_writer_flush(port); + rte_free(port); + + return 0; +} + +static int 
rte_port_fd_writer_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_fd_writer *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port FD Writer Nodrop + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_FD_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) \ + do { port->stats.n_pkts_in += val; } while (0) +#define RTE_PORT_FD_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) \ + do { port->stats.n_pkts_drop += val; } while (0) + +#else + +#define RTE_PORT_FD_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_FD_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_fd_writer_nodrop { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint16_t tx_buf_count; + uint64_t n_retries; + uint32_t fd; +}; + +static void * +rte_port_fd_writer_nodrop_create(void *params, int socket_id) +{ + struct rte_port_fd_writer_nodrop_params *conf = + params; + struct rte_port_fd_writer_nodrop *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->fd < 0) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->fd = conf->fd; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + + /* + * When n_retries is 0 it means that we should wait for every packet to + * send no matter how many retries should it take. To limit number of + * branches in fast path, we use UINT64_MAX instead of branching. + */ + port->n_retries = (conf->n_retries == 0) ? 
UINT64_MAX : conf->n_retries; + + return port; +} + +static inline void +send_burst_nodrop(struct rte_port_fd_writer_nodrop *p) +{ + uint64_t n_retries; + uint32_t i; + + n_retries = 0; + for (i = 0; (i < p->tx_buf_count) && (n_retries < p->n_retries); i++) { + struct rte_mbuf *pkt = p->tx_buf[i]; + void *pkt_data = rte_pktmbuf_mtod(pkt, void*); + size_t n_bytes = rte_pktmbuf_data_len(pkt); + + for ( ; n_retries < p->n_retries; n_retries++) { + ssize_t ret; + + ret = write(p->fd, pkt_data, n_bytes); + if (ret) + break; + } + } + + RTE_PORT_FD_WRITER_NODROP_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - i); + + for (i = 0; i < p->tx_buf_count; i++) + rte_pktmbuf_free(p->tx_buf[i]); + + p->tx_buf_count = 0; +} + +static int +rte_port_fd_writer_nodrop_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_fd_writer_nodrop *p = + port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_FD_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_fd_writer_nodrop_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_fd_writer_nodrop *p = + port; + uint32_t tx_buf_count = p->tx_buf_count; + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < n_pkts; i++) + p->tx_buf[tx_buf_count++] = pkts[i]; + RTE_PORT_FD_WRITER_NODROP_STATS_PKTS_IN_ADD(p, n_pkts); + } else + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_FD_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_fd_writer_nodrop_flush(void *port) +{ + struct rte_port_fd_writer_nodrop *p = + port; + + if (p->tx_buf_count > 0) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_fd_writer_nodrop_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_fd_writer_nodrop_flush(port); + rte_free(port); + +return 0; +} + +static int rte_port_fd_writer_nodrop_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_fd_writer_nodrop *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_fd_reader_ops = { + .f_create = rte_port_fd_reader_create, + .f_free = rte_port_fd_reader_free, + .f_rx = rte_port_fd_reader_rx, + .f_stats = rte_port_fd_reader_stats_read, +}; + +struct rte_port_out_ops rte_port_fd_writer_ops = { + .f_create = rte_port_fd_writer_create, + .f_free = rte_port_fd_writer_free, + .f_tx = rte_port_fd_writer_tx, + .f_tx_bulk = rte_port_fd_writer_tx_bulk, + .f_flush = rte_port_fd_writer_flush, + .f_stats = rte_port_fd_writer_stats_read, +}; + +struct rte_port_out_ops rte_port_fd_writer_nodrop_ops = { + .f_create = rte_port_fd_writer_nodrop_create, + .f_free = rte_port_fd_writer_nodrop_free, + .f_tx = rte_port_fd_writer_nodrop_tx, + .f_tx_bulk = rte_port_fd_writer_nodrop_tx_bulk, + .f_flush = rte_port_fd_writer_nodrop_flush, + .f_stats = rte_port_fd_writer_nodrop_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_fd.h 
b/src/spdk/dpdk/lib/librte_port/rte_port_fd.h new file mode 100644 index 00000000..e7620ef5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_fd.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_FD_H__ +#define __INCLUDE_RTE_PORT_FD_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port FD Device + * + * fd_reader: input port built on top of valid non-blocking file descriptor + * fd_writer: output port built on top of valid non-blocking file descriptor + * + ***/ + +#include + +#include +#include "rte_port.h" + +/** fd_reader port parameters */ +struct rte_port_fd_reader_params { + /** File descriptor */ + int fd; + + /** Maximum Transfer Unit (MTU) */ + uint32_t mtu; + + /** Pre-initialized buffer pool */ + struct rte_mempool *mempool; +}; + +/** fd_reader port operations */ +extern struct rte_port_in_ops rte_port_fd_reader_ops; + +/** fd_writer port parameters */ +struct rte_port_fd_writer_params { + /** File descriptor */ + int fd; + + /**< Recommended write burst size. The actual burst size can be + * bigger or smaller than this value. + */ + uint32_t tx_burst_sz; +}; + +/** fd_writer port operations */ +extern struct rte_port_out_ops rte_port_fd_writer_ops; + +/** fd_writer_nodrop port parameters */ +struct rte_port_fd_writer_nodrop_params { + /** File descriptor */ + int fd; + + /**< Recommended write burst size. The actual burst size can be + * bigger or smaller than this value. + */ + uint32_t tx_burst_sz; + + /** Maximum number of retries, 0 for no limit */ + uint32_t n_retries; +}; + +/** fd_writer_nodrop port operations */ +extern struct rte_port_out_ops rte_port_fd_writer_nodrop_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_frag.c b/src/spdk/dpdk/lib/librte_port/rte_port_frag.c new file mode 100644 index 00000000..8a803fda --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_frag.c @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include + +#include +#include +#include + +#include "rte_port_frag.h" + +/* Max number of fragments per packet allowed */ +#define RTE_PORT_FRAG_MAX_FRAGS_PER_PACKET 0x80 + +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_RING_READER_FRAG_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_RING_READER_FRAG_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_RING_READER_FRAG_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_RING_READER_FRAG_STATS_PKTS_DROP_ADD(port, val) + +#endif + +typedef int32_t + (*frag_op)(struct rte_mbuf *pkt_in, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out, + uint16_t mtu_size, + struct rte_mempool *pool_direct, + struct rte_mempool *pool_indirect); + +struct rte_port_ring_reader_frag { + struct rte_port_in_stats stats; + + /* Input parameters */ + struct rte_ring *ring; + uint32_t mtu; + uint32_t metadata_size; + struct rte_mempool *pool_direct; + struct rte_mempool *pool_indirect; + + /* Internal buffers */ + struct rte_mbuf *pkts[RTE_PORT_IN_BURST_SIZE_MAX]; + struct rte_mbuf *frags[RTE_PORT_FRAG_MAX_FRAGS_PER_PACKET]; + uint32_t n_pkts; + uint32_t pos_pkts; + uint32_t n_frags; + uint32_t pos_frags; + + frag_op f_frag; +} __rte_cache_aligned; + +static void * +rte_port_ring_reader_frag_create(void *params, int socket_id, int is_ipv4) +{ + struct rte_port_ring_reader_frag_params *conf = + params; + struct 
rte_port_ring_reader_frag *port; + + /* Check input parameters */ + if (conf == NULL) { + RTE_LOG(ERR, PORT, "%s: Parameter conf is NULL\n", __func__); + return NULL; + } + if (conf->ring == NULL) { + RTE_LOG(ERR, PORT, "%s: Parameter ring is NULL\n", __func__); + return NULL; + } + if (conf->mtu == 0) { + RTE_LOG(ERR, PORT, "%s: Parameter mtu is invalid\n", __func__); + return NULL; + } + if (conf->pool_direct == NULL) { + RTE_LOG(ERR, PORT, "%s: Parameter pool_direct is NULL\n", + __func__); + return NULL; + } + if (conf->pool_indirect == NULL) { + RTE_LOG(ERR, PORT, "%s: Parameter pool_indirect is NULL\n", + __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), RTE_CACHE_LINE_SIZE, + socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + return NULL; + } + + /* Initialization */ + port->ring = conf->ring; + port->mtu = conf->mtu; + port->metadata_size = conf->metadata_size; + port->pool_direct = conf->pool_direct; + port->pool_indirect = conf->pool_indirect; + + port->n_pkts = 0; + port->pos_pkts = 0; + port->n_frags = 0; + port->pos_frags = 0; + + port->f_frag = (is_ipv4) ? + rte_ipv4_fragment_packet : rte_ipv6_fragment_packet; + + return port; +} + +static void * +rte_port_ring_reader_ipv4_frag_create(void *params, int socket_id) +{ + return rte_port_ring_reader_frag_create(params, socket_id, 1); +} + +static void * +rte_port_ring_reader_ipv6_frag_create(void *params, int socket_id) +{ + return rte_port_ring_reader_frag_create(params, socket_id, 0); +} + +static int +rte_port_ring_reader_frag_rx(void *port, + struct rte_mbuf **pkts, + uint32_t n_pkts) +{ + struct rte_port_ring_reader_frag *p = + port; + uint32_t n_pkts_out; + + n_pkts_out = 0; + + /* Get packets from the "frag" buffer */ + if (p->n_frags >= n_pkts) { + memcpy(pkts, &p->frags[p->pos_frags], n_pkts * sizeof(void *)); + p->pos_frags += n_pkts; + p->n_frags -= n_pkts; + + return n_pkts; + } + + memcpy(pkts, &p->frags[p->pos_frags], p->n_frags * sizeof(void *)); + n_pkts_out = p->n_frags; + p->n_frags = 0; + + /* Look to "pkts" buffer to get more packets */ + for ( ; ; ) { + struct rte_mbuf *pkt; + uint32_t n_pkts_to_provide, i; + int status; + + /* If "pkts" buffer is empty, read packet burst from ring */ + if (p->n_pkts == 0) { + p->n_pkts = rte_ring_sc_dequeue_burst(p->ring, + (void **) p->pkts, RTE_PORT_IN_BURST_SIZE_MAX, + NULL); + RTE_PORT_RING_READER_FRAG_STATS_PKTS_IN_ADD(p, p->n_pkts); + if (p->n_pkts == 0) + return n_pkts_out; + p->pos_pkts = 0; + } + + /* Read next packet from "pkts" buffer */ + pkt = p->pkts[p->pos_pkts++]; + p->n_pkts--; + + /* If not jumbo, pass current packet to output */ + if (pkt->pkt_len <= p->mtu) { + pkts[n_pkts_out++] = pkt; + + n_pkts_to_provide = n_pkts - n_pkts_out; + if (n_pkts_to_provide == 0) + return n_pkts; + + continue; + } + + /* Fragment current packet into the "frags" buffer */ + status = p->f_frag( + pkt, + p->frags, + RTE_PORT_FRAG_MAX_FRAGS_PER_PACKET, + p->mtu, + p->pool_direct, + p->pool_indirect + ); + + if (status < 0) { + rte_pktmbuf_free(pkt); + RTE_PORT_RING_READER_FRAG_STATS_PKTS_DROP_ADD(p, 1); + continue; + } + + p->n_frags = (uint32_t) status; + p->pos_frags = 0; + + /* Copy meta-data from input jumbo packet to its fragments */ + for (i = 0; i < p->n_frags; i++) { + uint8_t *src = + RTE_MBUF_METADATA_UINT8_PTR(pkt, sizeof(struct rte_mbuf)); + uint8_t *dst = + RTE_MBUF_METADATA_UINT8_PTR(p->frags[i], sizeof(struct rte_mbuf)); + + memcpy(dst, src, p->metadata_size); + } + + 
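+		/*
+		 * The fragmentation routine builds fresh direct buffers for the
+		 * fragment headers and references the payload through buffers
+		 * taken from the indirect pool, so application meta-data stored
+		 * in the mbuf headroom is not carried over by the library; that
+		 * is why it was copied to every fragment above. The fragments
+		 * hold their own references to the payload, so the input packet
+		 * can be released now.
+		 */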
/* Free input jumbo packet */ + rte_pktmbuf_free(pkt); + + /* Get packets from "frag" buffer */ + n_pkts_to_provide = n_pkts - n_pkts_out; + if (p->n_frags >= n_pkts_to_provide) { + memcpy(&pkts[n_pkts_out], p->frags, + n_pkts_to_provide * sizeof(void *)); + p->n_frags -= n_pkts_to_provide; + p->pos_frags += n_pkts_to_provide; + + return n_pkts; + } + + memcpy(&pkts[n_pkts_out], p->frags, + p->n_frags * sizeof(void *)); + n_pkts_out += p->n_frags; + p->n_frags = 0; + } +} + +static int +rte_port_ring_reader_frag_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Parameter port is NULL\n", __func__); + return -1; + } + + rte_free(port); + + return 0; +} + +static int +rte_port_frag_reader_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_ring_reader_frag *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_ring_reader_ipv4_frag_ops = { + .f_create = rte_port_ring_reader_ipv4_frag_create, + .f_free = rte_port_ring_reader_frag_free, + .f_rx = rte_port_ring_reader_frag_rx, + .f_stats = rte_port_frag_reader_stats_read, +}; + +struct rte_port_in_ops rte_port_ring_reader_ipv6_frag_ops = { + .f_create = rte_port_ring_reader_ipv6_frag_create, + .f_free = rte_port_ring_reader_frag_free, + .f_rx = rte_port_ring_reader_frag_rx, + .f_stats = rte_port_frag_reader_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_frag.h b/src/spdk/dpdk/lib/librte_port/rte_port_frag.h new file mode 100644 index 00000000..74321e4c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_frag.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_IP_FRAG_H__ +#define __INCLUDE_RTE_PORT_IP_FRAG_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port for IPv4 Fragmentation + * + * This port is built on top of pre-initialized single consumer rte_ring. In + * order to minimize the amount of packets stored in the ring at any given + * time, the IP fragmentation functionality is executed on ring read operation, + * hence this port is implemented as an input port. A regular ring_writer port + * can be created to write to the same ring. + * + * The packets written to the ring are either complete IP datagrams or jumbo + * frames (i.e. IP packets with length bigger than provided MTU value). The + * packets read from the ring are all non-jumbo frames. The complete IP + * datagrams written to the ring are not changed. The jumbo frames are + * fragmented into several IP packets with length less or equal to MTU. + * + ***/ + +#include + +#include + +#include "rte_port.h" + +/** ring_reader_ipv4_frag port parameters */ +struct rte_port_ring_reader_frag_params { + /** Underlying single consumer ring that has to be pre-initialized. */ + struct rte_ring *ring; + + /** Maximum Transfer Unit (MTU). Maximum IP packet size (in bytes). */ + uint32_t mtu; + + /** Size of application dependent meta-data stored per each input packet + that has to be copied to each of the fragments originating from the + same input IP datagram. */ + uint32_t metadata_size; + + /** Pre-initialized buffer pool used for allocating direct buffers for + the output fragments. */ + struct rte_mempool *pool_direct; + + /** Pre-initialized buffer pool used for allocating indirect buffers for + the output fragments. 
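+	For reference, a minimal creation sketch using these parameters; the
+	ring and the two mempools are assumed to have been created elsewhere,
+	and the MTU and meta-data size values are placeholders:
+
+	@code
+	struct rte_port_ring_reader_frag_params fp = {
+		.ring = rx_ring,
+		.mtu = 1500,
+		.metadata_size = 64,
+		.pool_direct = pool_direct,
+		.pool_indirect = pool_indirect,
+	};
+	void *frag_port =
+		rte_port_ring_reader_ipv4_frag_ops.f_create(&fp, SOCKET_ID_ANY);
+	@endcode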
*/ + struct rte_mempool *pool_indirect; +}; + +#define rte_port_ring_reader_ipv4_frag_params rte_port_ring_reader_frag_params + +#define rte_port_ring_reader_ipv6_frag_params rte_port_ring_reader_frag_params + +/** ring_reader_ipv4_frag port operations */ +extern struct rte_port_in_ops rte_port_ring_reader_ipv4_frag_ops; + +/** ring_reader_ipv6_frag port operations */ +extern struct rte_port_in_ops rte_port_ring_reader_ipv6_frag_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_kni.c b/src/spdk/dpdk/lib/librte_port/rte_port_kni.c new file mode 100644 index 00000000..2515fb2a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_kni.c @@ -0,0 +1,545 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Ethan Zhuang . + * Copyright(c) 2016 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include + +#include +#include +#include + +#include "rte_port_kni.h" + +/* + * Port KNI Reader + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_KNI_READER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_KNI_READER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_KNI_READER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_KNI_READER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_kni_reader { + struct rte_port_in_stats stats; + + struct rte_kni *kni; +}; + +static void * +rte_port_kni_reader_create(void *params, int socket_id) +{ + struct rte_port_kni_reader_params *conf = + params; + struct rte_port_kni_reader *port; + + /* Check input parameters */ + if (conf == NULL) { + RTE_LOG(ERR, PORT, "%s: params is NULL\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->kni = conf->kni; + + return port; +} + +static int +rte_port_kni_reader_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + struct rte_port_kni_reader *p = + port; + uint16_t rx_pkt_cnt; + + rx_pkt_cnt = rte_kni_rx_burst(p->kni, pkts, n_pkts); + RTE_PORT_KNI_READER_STATS_PKTS_IN_ADD(p, rx_pkt_cnt); + return rx_pkt_cnt; +} + +static int +rte_port_kni_reader_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + return -EINVAL; + } + + rte_free(port); + + return 0; +} + +static int rte_port_kni_reader_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_kni_reader *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port KNI Writer + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_KNI_WRITER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_KNI_WRITER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_kni_writer { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; + struct rte_kni *kni; +}; + +static void * +rte_port_kni_writer_create(void *params, int socket_id) +{ + struct rte_port_kni_writer_params *conf = + params; + struct rte_port_kni_writer *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->kni = conf->kni; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + return port; +} + +static inline void +send_burst(struct rte_port_kni_writer *p) +{ + uint32_t nb_tx; + + nb_tx = rte_kni_tx_burst(p->kni, p->tx_buf, p->tx_buf_count); + + 
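+	/*
+	 * Whatever the KNI FIFO did not accept is counted as dropped and freed
+	 * below; this writer does not retry (see the nodrop variant further
+	 * down for the retrying behaviour).
+	 */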
RTE_PORT_KNI_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for (; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static int +rte_port_kni_writer_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_kni_writer *p = + port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + return 0; +} + +static int +rte_port_kni_writer_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_kni_writer *p = + port; + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t n_pkts_ok; + + if (tx_buf_count) + send_burst(p); + + RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(p, n_pkts); + n_pkts_ok = rte_kni_tx_burst(p->kni, pkts, n_pkts); + + RTE_PORT_KNI_WRITER_STATS_PKTS_DROP_ADD(p, n_pkts - n_pkts_ok); + for (; n_pkts_ok < n_pkts; n_pkts_ok++) { + struct rte_mbuf *pkt = pkts[n_pkts_ok]; + + rte_pktmbuf_free(pkt); + } + } else { + for (; pkts_mask;) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst(p); + } + + return 0; +} + +static int +rte_port_kni_writer_flush(void *port) +{ + struct rte_port_kni_writer *p = + port; + + if (p->tx_buf_count > 0) + send_burst(p); + + return 0; +} + +static int +rte_port_kni_writer_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_kni_writer_flush(port); + rte_free(port); + + return 0; +} + +static int rte_port_kni_writer_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_kni_writer *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port KNI Writer Nodrop + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_kni_writer_nodrop { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; + uint64_t n_retries; + struct rte_kni *kni; +}; + +static void * +rte_port_kni_writer_nodrop_create(void *params, int socket_id) +{ + struct rte_port_kni_writer_nodrop_params *conf = + params; + struct rte_port_kni_writer_nodrop *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid input parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + 
RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->kni = conf->kni; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + /* + * When n_retries is 0 it means that we should wait for every packet to + * send no matter how many retries should it take. To limit number of + * branches in fast path, we use UINT64_MAX instead of branching. + */ + port->n_retries = (conf->n_retries == 0) ? UINT64_MAX : conf->n_retries; + + return port; +} + +static inline void +send_burst_nodrop(struct rte_port_kni_writer_nodrop *p) +{ + uint32_t nb_tx = 0, i; + + nb_tx = rte_kni_tx_burst(p->kni, p->tx_buf, p->tx_buf_count); + + /* We sent all the packets in a first try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + + for (i = 0; i < p->n_retries; i++) { + nb_tx += rte_kni_tx_burst(p->kni, + p->tx_buf + nb_tx, + p->tx_buf_count - nb_tx); + + /* We sent all the packets in more than one try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + } + + /* We didn't send the packets in maximum allowed attempts */ + RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static int +rte_port_kni_writer_nodrop_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_kni_writer_nodrop *p = + port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_KNI_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_kni_writer_nodrop_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_kni_writer_nodrop *p = + port; + + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t n_pkts_ok; + + if (tx_buf_count) + send_burst_nodrop(p); + + RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(p, n_pkts); + n_pkts_ok = rte_kni_tx_burst(p->kni, pkts, n_pkts); + + if (n_pkts_ok >= n_pkts) + return 0; + + /* + * If we didn't manage to send all packets in single burst, move + * remaining packets to the buffer and call send burst. 
+ */ + for (; n_pkts_ok < n_pkts; n_pkts_ok++) { + struct rte_mbuf *pkt = pkts[n_pkts_ok]; + p->tx_buf[p->tx_buf_count++] = pkt; + } + send_burst_nodrop(p); + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_KNI_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + } + + return 0; +} + +static int +rte_port_kni_writer_nodrop_flush(void *port) +{ + struct rte_port_kni_writer_nodrop *p = + port; + + if (p->tx_buf_count > 0) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_kni_writer_nodrop_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + rte_port_kni_writer_nodrop_flush(port); + rte_free(port); + + return 0; +} + +static int rte_port_kni_writer_nodrop_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_kni_writer_nodrop *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_kni_reader_ops = { + .f_create = rte_port_kni_reader_create, + .f_free = rte_port_kni_reader_free, + .f_rx = rte_port_kni_reader_rx, + .f_stats = rte_port_kni_reader_stats_read, +}; + +struct rte_port_out_ops rte_port_kni_writer_ops = { + .f_create = rte_port_kni_writer_create, + .f_free = rte_port_kni_writer_free, + .f_tx = rte_port_kni_writer_tx, + .f_tx_bulk = rte_port_kni_writer_tx_bulk, + .f_flush = rte_port_kni_writer_flush, + .f_stats = rte_port_kni_writer_stats_read, +}; + +struct rte_port_out_ops rte_port_kni_writer_nodrop_ops = { + .f_create = rte_port_kni_writer_nodrop_create, + .f_free = rte_port_kni_writer_nodrop_free, + .f_tx = rte_port_kni_writer_nodrop_tx, + .f_tx_bulk = rte_port_kni_writer_nodrop_tx_bulk, + .f_flush = rte_port_kni_writer_nodrop_flush, + .f_stats = rte_port_kni_writer_nodrop_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_kni.h b/src/spdk/dpdk/lib/librte_port/rte_port_kni.h new file mode 100644 index 00000000..4b60689c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_kni.h @@ -0,0 +1,95 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Ethan Zhuang . + * Copyright(c) 2016 Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
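A minimal usage sketch for the kni_writer ops table defined above. It assumes kni points to a KNI device initialized elsewhere (for example via rte_kni_alloc()) and pkt is a valid mbuf; the burst size and socket id are illustrative and error handling is trimmed:

#include <rte_mbuf.h>
#include <rte_kni.h>
#include "rte_port_kni.h"

/* Sketch: push one mbuf through the kni_writer port shown above.
 * "kni" and "pkt" are assumed to exist already (hypothetical setup). */
static int
kni_writer_demo(struct rte_kni *kni, struct rte_mbuf *pkt)
{
	struct rte_port_kni_writer_params params = {
		.kni = kni,
		.tx_burst_sz = 32,	/* power of 2, <= RTE_PORT_IN_BURST_SIZE_MAX */
	};
	void *port;

	port = rte_port_kni_writer_ops.f_create(&params, 0 /* socket id */);
	if (port == NULL)
		return -1;

	/* Packet is buffered until a full burst accumulates; ownership of
	 * pkt passes to the port. */
	rte_port_kni_writer_ops.f_tx(port, pkt);
	rte_port_kni_writer_ops.f_flush(port);	/* force out the partial burst */

	return rte_port_kni_writer_ops.f_free(port);
}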
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __INCLUDE_RTE_PORT_KNI_H__ +#define __INCLUDE_RTE_PORT_KNI_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port KNI Interface + * + * kni_reader: input port built on top of pre-initialized KNI interface + * kni_writer: output port built on top of pre-initialized KNI interface + * + ***/ + +#include + +#include + +#include "rte_port.h" + +/** kni_reader port parameters */ +struct rte_port_kni_reader_params { + /** KNI interface reference */ + struct rte_kni *kni; +}; + +/** kni_reader port operations */ +extern struct rte_port_in_ops rte_port_kni_reader_ops; + + +/** kni_writer port parameters */ +struct rte_port_kni_writer_params { + /** KNI interface reference */ + struct rte_kni *kni; + /** Burst size to KNI interface. */ + uint32_t tx_burst_sz; +}; + +/** kni_writer port operations */ +extern struct rte_port_out_ops rte_port_kni_writer_ops; + +/** kni_writer_nodrop port parameters */ +struct rte_port_kni_writer_nodrop_params { + /** KNI interface reference */ + struct rte_kni *kni; + /** Burst size to KNI interface. */ + uint32_t tx_burst_sz; + /** Maximum number of retries, 0 for no limit */ + uint32_t n_retries; +}; + +/** kni_writer_nodrop port operations */ +extern struct rte_port_out_ops rte_port_kni_writer_nodrop_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_ras.c b/src/spdk/dpdk/lib/librte_port/rte_port_ras.c new file mode 100644 index 00000000..c8b2e19b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_ras.c @@ -0,0 +1,330 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include + +#include +#include +#include +#include + +#include "rte_port_ras.h" + +#ifndef RTE_PORT_RAS_N_BUCKETS +#define RTE_PORT_RAS_N_BUCKETS 4094 +#endif + +#ifndef RTE_PORT_RAS_N_ENTRIES_PER_BUCKET +#define RTE_PORT_RAS_N_ENTRIES_PER_BUCKET 8 +#endif + +#ifndef RTE_PORT_RAS_N_ENTRIES +#define RTE_PORT_RAS_N_ENTRIES (RTE_PORT_RAS_N_BUCKETS * RTE_PORT_RAS_N_ENTRIES_PER_BUCKET) +#endif + +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_RING_WRITER_RAS_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_RING_WRITER_RAS_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_RING_WRITER_RAS_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_RING_WRITER_RAS_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_ring_writer_ras; + +typedef void (*ras_op)( + struct rte_port_ring_writer_ras *p, + struct rte_mbuf *pkt); + +static void +process_ipv4(struct rte_port_ring_writer_ras *p, struct rte_mbuf *pkt); +static void +process_ipv6(struct rte_port_ring_writer_ras *p, struct rte_mbuf *pkt); + +struct rte_port_ring_writer_ras { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[RTE_PORT_IN_BURST_SIZE_MAX]; + struct rte_ring *ring; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + struct rte_ip_frag_tbl *frag_tbl; + struct rte_ip_frag_death_row death_row; + 
+ ras_op f_ras; +}; + +static void * +rte_port_ring_writer_ras_create(void *params, int socket_id, int is_ipv4) +{ + struct rte_port_ring_writer_ras_params *conf = + params; + struct rte_port_ring_writer_ras *port; + uint64_t frag_cycles; + + /* Check input parameters */ + if (conf == NULL) { + RTE_LOG(ERR, PORT, "%s: Parameter conf is NULL\n", __func__); + return NULL; + } + if (conf->ring == NULL) { + RTE_LOG(ERR, PORT, "%s: Parameter ring is NULL\n", __func__); + return NULL; + } + if ((conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX)) { + RTE_LOG(ERR, PORT, "%s: Parameter tx_burst_sz is invalid\n", + __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate socket\n", __func__); + return NULL; + } + + /* Create fragmentation table */ + frag_cycles = (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S * MS_PER_S; + frag_cycles *= 100; + + port->frag_tbl = rte_ip_frag_table_create( + RTE_PORT_RAS_N_BUCKETS, + RTE_PORT_RAS_N_ENTRIES_PER_BUCKET, + RTE_PORT_RAS_N_ENTRIES, + frag_cycles, + socket_id); + + if (port->frag_tbl == NULL) { + RTE_LOG(ERR, PORT, "%s: rte_ip_frag_table_create failed\n", + __func__); + rte_free(port); + return NULL; + } + + /* Initialization */ + port->ring = conf->ring; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + + port->f_ras = (is_ipv4 == 1) ? process_ipv4 : process_ipv6; + + return port; +} + +static void * +rte_port_ring_writer_ipv4_ras_create(void *params, int socket_id) +{ + return rte_port_ring_writer_ras_create(params, socket_id, 1); +} + +static void * +rte_port_ring_writer_ipv6_ras_create(void *params, int socket_id) +{ + return rte_port_ring_writer_ras_create(params, socket_id, 0); +} + +static inline void +send_burst(struct rte_port_ring_writer_ras *p) +{ + uint32_t nb_tx; + + nb_tx = rte_ring_sp_enqueue_burst(p->ring, (void **)p->tx_buf, + p->tx_buf_count, NULL); + + RTE_PORT_RING_WRITER_RAS_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static void +process_ipv4(struct rte_port_ring_writer_ras *p, struct rte_mbuf *pkt) +{ + /* Assume there is no ethernet header */ + struct ipv4_hdr *pkt_hdr = rte_pktmbuf_mtod(pkt, struct ipv4_hdr *); + + /* Get "More fragments" flag and fragment offset */ + uint16_t frag_field = rte_be_to_cpu_16(pkt_hdr->fragment_offset); + uint16_t frag_offset = (uint16_t)(frag_field & IPV4_HDR_OFFSET_MASK); + uint16_t frag_flag = (uint16_t)(frag_field & IPV4_HDR_MF_FLAG); + + /* If it is a fragmented packet, then try to reassemble */ + if ((frag_flag == 0) && (frag_offset == 0)) + p->tx_buf[p->tx_buf_count++] = pkt; + else { + struct rte_mbuf *mo; + struct rte_ip_frag_tbl *tbl = p->frag_tbl; + struct rte_ip_frag_death_row *dr = &p->death_row; + + pkt->l3_len = sizeof(*pkt_hdr); + + /* Process this fragment */ + mo = rte_ipv4_frag_reassemble_packet(tbl, dr, pkt, rte_rdtsc(), + pkt_hdr); + if (mo != NULL) + p->tx_buf[p->tx_buf_count++] = mo; + + rte_ip_frag_free_death_row(&p->death_row, 3); + } +} + +static void +process_ipv6(struct rte_port_ring_writer_ras *p, struct rte_mbuf *pkt) +{ + /* Assume there is no ethernet header */ + struct ipv6_hdr *pkt_hdr = rte_pktmbuf_mtod(pkt, struct ipv6_hdr *); + + struct ipv6_extension_fragment *frag_hdr; + uint16_t frag_data = 0; + frag_hdr = 
rte_ipv6_frag_get_ipv6_fragment_header(pkt_hdr); + if (frag_hdr != NULL) + frag_data = rte_be_to_cpu_16(frag_hdr->frag_data); + + /* If it is a fragmented packet, then try to reassemble */ + if ((frag_data & RTE_IPV6_FRAG_USED_MASK) == 0) + p->tx_buf[p->tx_buf_count++] = pkt; + else { + struct rte_mbuf *mo; + struct rte_ip_frag_tbl *tbl = p->frag_tbl; + struct rte_ip_frag_death_row *dr = &p->death_row; + + pkt->l3_len = sizeof(*pkt_hdr) + sizeof(*frag_hdr); + + /* Process this fragment */ + mo = rte_ipv6_frag_reassemble_packet(tbl, dr, pkt, rte_rdtsc(), pkt_hdr, + frag_hdr); + if (mo != NULL) + p->tx_buf[p->tx_buf_count++] = mo; + + rte_ip_frag_free_death_row(&p->death_row, 3); + } +} + +static int +rte_port_ring_writer_ras_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_ring_writer_ras *p = + port; + + RTE_PORT_RING_WRITER_RAS_STATS_PKTS_IN_ADD(p, 1); + p->f_ras(p, pkt); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + return 0; +} + +static int +rte_port_ring_writer_ras_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_ring_writer_ras *p = + port; + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = pkts[i]; + + RTE_PORT_RING_WRITER_RAS_STATS_PKTS_IN_ADD(p, 1); + p->f_ras(p, pkt); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + } + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + RTE_PORT_RING_WRITER_RAS_STATS_PKTS_IN_ADD(p, 1); + p->f_ras(p, pkt); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + pkts_mask &= ~pkt_mask; + } + } + + return 0; +} + +static int +rte_port_ring_writer_ras_flush(void *port) +{ + struct rte_port_ring_writer_ras *p = + port; + + if (p->tx_buf_count > 0) + send_burst(p); + + return 0; +} + +static int +rte_port_ring_writer_ras_free(void *port) +{ + struct rte_port_ring_writer_ras *p = + port; + + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Parameter port is NULL\n", __func__); + return -1; + } + + rte_port_ring_writer_ras_flush(port); + rte_ip_frag_table_destroy(p->frag_tbl); + rte_free(port); + + return 0; +} + +static int +rte_port_ras_writer_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_ring_writer_ras *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Summary of port operations + */ +struct rte_port_out_ops rte_port_ring_writer_ipv4_ras_ops = { + .f_create = rte_port_ring_writer_ipv4_ras_create, + .f_free = rte_port_ring_writer_ras_free, + .f_tx = rte_port_ring_writer_ras_tx, + .f_tx_bulk = rte_port_ring_writer_ras_tx_bulk, + .f_flush = rte_port_ring_writer_ras_flush, + .f_stats = rte_port_ras_writer_stats_read, +}; + +struct rte_port_out_ops rte_port_ring_writer_ipv6_ras_ops = { + .f_create = rte_port_ring_writer_ipv6_ras_create, + .f_free = rte_port_ring_writer_ras_free, + .f_tx = rte_port_ring_writer_ras_tx, + .f_tx_bulk = rte_port_ring_writer_ras_tx_bulk, + .f_flush = rte_port_ring_writer_ras_flush, + .f_stats = rte_port_ras_writer_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_ras.h b/src/spdk/dpdk/lib/librte_port/rte_port_ras.h new file mode 100644 index 00000000..fa5accdc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_ras.h @@ -0,0 +1,61 
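A short sketch of how the IPv4 reassembly output port above might be instantiated. It assumes the EAL is already initialized; the ring name and sizes are illustrative only:

#include <rte_ring.h>
#include <rte_mbuf.h>
#include "rte_port_ras.h"

/* Sketch: build the IPv4 reassembly output port on top of a fresh
 * single-producer/single-consumer ring. Names are illustrative only. */
static void *
ipv4_ras_port_create(int socket_id)
{
	struct rte_ring *ring;
	struct rte_port_ring_writer_ras_params params;

	ring = rte_ring_create("ras_ring", 1024, socket_id,
			RING_F_SP_ENQ | RING_F_SC_DEQ);
	if (ring == NULL)
		return NULL;

	params.ring = ring;
	params.tx_burst_sz = 32;

	/* Complete datagrams pass straight through to the ring; fragments
	 * are held in the fragmentation table until reassembled or timed
	 * out, as described in the header above. */
	return rte_port_ring_writer_ipv4_ras_ops.f_create(&params, socket_id);
}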
@@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_RAS_H__ +#define __INCLUDE_RTE_PORT_RAS_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port for IPv4 Reassembly + * + * This port is built on top of pre-initialized single producer rte_ring. In + * order to minimize the amount of packets stored in the ring at any given + * time, the IP reassembly functionality is executed on ring write operation, + * hence this port is implemented as an output port. A regular ring_reader port + * can be created to read from the same ring. + * + * The packets written to the ring are either complete IP datagrams or IP + * fragments. The packets read from the ring are all complete IP datagrams, + * either jumbo frames (i.e. IP packets with length bigger than MTU) or not. + * The complete IP datagrams written to the ring are not changed. The IP + * fragments written to the ring are first reassembled and into complete IP + * datagrams or dropped on error or IP reassembly time-out. + * + ***/ + +#include + +#include + +#include "rte_port.h" + +/** ring_writer_ipv4_ras port parameters */ +struct rte_port_ring_writer_ras_params { + /** Underlying single consumer ring that has to be pre-initialized. */ + struct rte_ring *ring; + + /** Recommended burst size to ring. The actual burst size can be bigger + or smaller than this value. */ + uint32_t tx_burst_sz; +}; + +#define rte_port_ring_writer_ipv4_ras_params rte_port_ring_writer_ras_params + +#define rte_port_ring_writer_ipv6_ras_params rte_port_ring_writer_ras_params + +/** ring_writer_ipv4_ras port operations */ +extern struct rte_port_out_ops rte_port_ring_writer_ipv4_ras_ops; + +/** ring_writer_ipv6_ras port operations */ +extern struct rte_port_out_ops rte_port_ring_writer_ipv6_ras_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_ring.c b/src/spdk/dpdk/lib/librte_port/rte_port_ring.c new file mode 100644 index 00000000..47fcdd06 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_ring.c @@ -0,0 +1,787 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include + +#include +#include +#include + +#include "rte_port_ring.h" + +/* + * Port RING Reader + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_RING_READER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_RING_READER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_RING_READER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_RING_READER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_ring_reader { + struct rte_port_in_stats stats; + + struct rte_ring *ring; +}; + +static void * +rte_port_ring_reader_create_internal(void *params, int socket_id, + uint32_t is_multi) +{ + struct rte_port_ring_reader_params *conf = + params; + struct rte_port_ring_reader *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->ring == NULL) || + (conf->ring->cons.single && is_multi) || + (!(conf->ring->cons.single) && !is_multi)) { + RTE_LOG(ERR, PORT, "%s: Invalid Parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->ring = conf->ring; + + return port; +} + +static void * 
+rte_port_ring_reader_create(void *params, int socket_id) +{ + return rte_port_ring_reader_create_internal(params, socket_id, 0); +} + +static void * +rte_port_ring_multi_reader_create(void *params, int socket_id) +{ + return rte_port_ring_reader_create_internal(params, socket_id, 1); +} + +static int +rte_port_ring_reader_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + struct rte_port_ring_reader *p = port; + uint32_t nb_rx; + + nb_rx = rte_ring_sc_dequeue_burst(p->ring, (void **) pkts, + n_pkts, NULL); + RTE_PORT_RING_READER_STATS_PKTS_IN_ADD(p, nb_rx); + + return nb_rx; +} + +static int +rte_port_ring_multi_reader_rx(void *port, struct rte_mbuf **pkts, + uint32_t n_pkts) +{ + struct rte_port_ring_reader *p = port; + uint32_t nb_rx; + + nb_rx = rte_ring_mc_dequeue_burst(p->ring, (void **) pkts, + n_pkts, NULL); + RTE_PORT_RING_READER_STATS_PKTS_IN_ADD(p, nb_rx); + + return nb_rx; +} + +static int +rte_port_ring_reader_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + return -EINVAL; + } + + rte_free(port); + + return 0; +} + +static int +rte_port_ring_reader_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_ring_reader *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port RING Writer + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_RING_WRITER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_RING_WRITER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_RING_WRITER_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_RING_WRITER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_ring_writer { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + struct rte_ring *ring; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; + uint32_t is_multi; +}; + +static void * +rte_port_ring_writer_create_internal(void *params, int socket_id, + uint32_t is_multi) +{ + struct rte_port_ring_writer_params *conf = + params; + struct rte_port_ring_writer *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->ring == NULL) || + (conf->ring->prod.single && is_multi) || + (!(conf->ring->prod.single) && !is_multi) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX)) { + RTE_LOG(ERR, PORT, "%s: Invalid Parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->ring = conf->ring; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + port->is_multi = is_multi; + + return port; +} + +static void * +rte_port_ring_writer_create(void *params, int socket_id) +{ + return rte_port_ring_writer_create_internal(params, socket_id, 0); +} + +static void * +rte_port_ring_multi_writer_create(void *params, int socket_id) +{ + return rte_port_ring_writer_create_internal(params, socket_id, 1); +} + +static inline void +send_burst(struct rte_port_ring_writer *p) +{ + uint32_t nb_tx; + + nb_tx = rte_ring_sp_enqueue_burst(p->ring, (void **)p->tx_buf, + p->tx_buf_count, NULL); + + RTE_PORT_RING_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; 
nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static inline void +send_burst_mp(struct rte_port_ring_writer *p) +{ + uint32_t nb_tx; + + nb_tx = rte_ring_mp_enqueue_burst(p->ring, (void **)p->tx_buf, + p->tx_buf_count, NULL); + + RTE_PORT_RING_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static int +rte_port_ring_writer_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_ring_writer *p = port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_RING_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst(p); + + return 0; +} + +static int +rte_port_ring_multi_writer_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_ring_writer *p = port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_RING_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_mp(p); + + return 0; +} + +static __rte_always_inline int +rte_port_ring_writer_tx_bulk_internal(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint32_t is_multi) +{ + struct rte_port_ring_writer *p = + port; + + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t n_pkts_ok; + + if (tx_buf_count) { + if (is_multi) + send_burst_mp(p); + else + send_burst(p); + } + + RTE_PORT_RING_WRITER_STATS_PKTS_IN_ADD(p, n_pkts); + if (is_multi) + n_pkts_ok = rte_ring_mp_enqueue_burst(p->ring, + (void **)pkts, n_pkts, NULL); + else + n_pkts_ok = rte_ring_sp_enqueue_burst(p->ring, + (void **)pkts, n_pkts, NULL); + + RTE_PORT_RING_WRITER_STATS_PKTS_DROP_ADD(p, n_pkts - n_pkts_ok); + for ( ; n_pkts_ok < n_pkts; n_pkts_ok++) { + struct rte_mbuf *pkt = pkts[n_pkts_ok]; + + rte_pktmbuf_free(pkt); + } + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_RING_WRITER_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) { + if (is_multi) + send_burst_mp(p); + else + send_burst(p); + } + } + + return 0; +} + +static int +rte_port_ring_writer_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + return rte_port_ring_writer_tx_bulk_internal(port, pkts, pkts_mask, 0); +} + +static int +rte_port_ring_multi_writer_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + return rte_port_ring_writer_tx_bulk_internal(port, pkts, pkts_mask, 1); +} + +static int +rte_port_ring_writer_flush(void *port) +{ + struct rte_port_ring_writer *p = port; + + if (p->tx_buf_count > 0) + send_burst(p); + + return 0; +} + +static int +rte_port_ring_multi_writer_flush(void *port) +{ + struct rte_port_ring_writer *p = port; + + if (p->tx_buf_count > 0) + send_burst_mp(p); + + return 0; +} + +static int +rte_port_ring_writer_free(void *port) +{ + struct rte_port_ring_writer *p = port; + + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + if (p->is_multi) + rte_port_ring_multi_writer_flush(port); + else + rte_port_ring_writer_flush(port); + + rte_free(port); + + return 0; +} + +static int 
+rte_port_ring_writer_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_ring_writer *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port RING Writer Nodrop + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_ring_writer_nodrop { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + struct rte_ring *ring; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; + uint64_t n_retries; + uint32_t is_multi; +}; + +static void * +rte_port_ring_writer_nodrop_create_internal(void *params, int socket_id, + uint32_t is_multi) +{ + struct rte_port_ring_writer_nodrop_params *conf = + params; + struct rte_port_ring_writer_nodrop *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->ring == NULL) || + (conf->ring->prod.single && is_multi) || + (!(conf->ring->prod.single) && !is_multi) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX)) { + RTE_LOG(ERR, PORT, "%s: Invalid Parameters\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->ring = conf->ring; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + port->is_multi = is_multi; + + /* + * When n_retries is 0 it means that we should wait for every packet to + * send no matter how many retries should it take. To limit number of + * branches in fast path, we use UINT64_MAX instead of branching. + */ + port->n_retries = (conf->n_retries == 0) ? 
UINT64_MAX : conf->n_retries; + + return port; +} + +static void * +rte_port_ring_writer_nodrop_create(void *params, int socket_id) +{ + return rte_port_ring_writer_nodrop_create_internal(params, socket_id, 0); +} + +static void * +rte_port_ring_multi_writer_nodrop_create(void *params, int socket_id) +{ + return rte_port_ring_writer_nodrop_create_internal(params, socket_id, 1); +} + +static inline void +send_burst_nodrop(struct rte_port_ring_writer_nodrop *p) +{ + uint32_t nb_tx = 0, i; + + nb_tx = rte_ring_sp_enqueue_burst(p->ring, (void **)p->tx_buf, + p->tx_buf_count, NULL); + + /* We sent all the packets in a first try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + + for (i = 0; i < p->n_retries; i++) { + nb_tx += rte_ring_sp_enqueue_burst(p->ring, + (void **) (p->tx_buf + nb_tx), + p->tx_buf_count - nb_tx, NULL); + + /* We sent all the packets in more than one try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + } + + /* We didn't send the packets in maximum allowed attempts */ + RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static inline void +send_burst_mp_nodrop(struct rte_port_ring_writer_nodrop *p) +{ + uint32_t nb_tx = 0, i; + + nb_tx = rte_ring_mp_enqueue_burst(p->ring, (void **)p->tx_buf, + p->tx_buf_count, NULL); + + /* We sent all the packets in a first try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + + for (i = 0; i < p->n_retries; i++) { + nb_tx += rte_ring_mp_enqueue_burst(p->ring, + (void **) (p->tx_buf + nb_tx), + p->tx_buf_count - nb_tx, NULL); + + /* We sent all the packets in more than one try */ + if (nb_tx >= p->tx_buf_count) { + p->tx_buf_count = 0; + return; + } + } + + /* We didn't send the packets in maximum allowed attempts */ + RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + for ( ; nb_tx < p->tx_buf_count; nb_tx++) + rte_pktmbuf_free(p->tx_buf[nb_tx]); + + p->tx_buf_count = 0; +} + +static int +rte_port_ring_writer_nodrop_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_ring_writer_nodrop *p = + port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_ring_multi_writer_nodrop_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_ring_writer_nodrop *p = + port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) + send_burst_mp_nodrop(p); + + return 0; +} + +static __rte_always_inline int +rte_port_ring_writer_nodrop_tx_bulk_internal(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint32_t is_multi) +{ + struct rte_port_ring_writer_nodrop *p = + port; + + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t n_pkts_ok; + + if (tx_buf_count) { + if (is_multi) + send_burst_mp_nodrop(p); + else + send_burst_nodrop(p); + } + + RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_IN_ADD(p, n_pkts); + if (is_multi) + n_pkts_ok = + rte_ring_mp_enqueue_burst(p->ring, + (void **)pkts, n_pkts, NULL); + else + n_pkts_ok = + rte_ring_sp_enqueue_burst(p->ring, + (void **)pkts, 
n_pkts, NULL); + + if (n_pkts_ok >= n_pkts) + return 0; + + /* + * If we didn't manage to send all packets in single burst, move + * remaining packets to the buffer and call send burst. + */ + for (; n_pkts_ok < n_pkts; n_pkts_ok++) { + struct rte_mbuf *pkt = pkts[n_pkts_ok]; + + p->tx_buf[p->tx_buf_count++] = pkt; + } + if (is_multi) + send_burst_mp_nodrop(p); + else + send_burst_nodrop(p); + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_RING_WRITER_NODROP_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + + p->tx_buf_count = tx_buf_count; + if (tx_buf_count >= p->tx_burst_sz) { + if (is_multi) + send_burst_mp_nodrop(p); + else + send_burst_nodrop(p); + } + } + + return 0; +} + +static int +rte_port_ring_writer_nodrop_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + return + rte_port_ring_writer_nodrop_tx_bulk_internal(port, pkts, pkts_mask, 0); +} + +static int +rte_port_ring_multi_writer_nodrop_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + return + rte_port_ring_writer_nodrop_tx_bulk_internal(port, pkts, pkts_mask, 1); +} + +static int +rte_port_ring_writer_nodrop_flush(void *port) +{ + struct rte_port_ring_writer_nodrop *p = + port; + + if (p->tx_buf_count > 0) + send_burst_nodrop(p); + + return 0; +} + +static int +rte_port_ring_multi_writer_nodrop_flush(void *port) +{ + struct rte_port_ring_writer_nodrop *p = + port; + + if (p->tx_buf_count > 0) + send_burst_mp_nodrop(p); + + return 0; +} + +static int +rte_port_ring_writer_nodrop_free(void *port) +{ + struct rte_port_ring_writer_nodrop *p = + port; + + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Port is NULL\n", __func__); + return -EINVAL; + } + + if (p->is_multi) + rte_port_ring_multi_writer_nodrop_flush(port); + else + rte_port_ring_writer_nodrop_flush(port); + + rte_free(port); + + return 0; +} + +static int +rte_port_ring_writer_nodrop_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_ring_writer_nodrop *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_ring_reader_ops = { + .f_create = rte_port_ring_reader_create, + .f_free = rte_port_ring_reader_free, + .f_rx = rte_port_ring_reader_rx, + .f_stats = rte_port_ring_reader_stats_read, +}; + +struct rte_port_out_ops rte_port_ring_writer_ops = { + .f_create = rte_port_ring_writer_create, + .f_free = rte_port_ring_writer_free, + .f_tx = rte_port_ring_writer_tx, + .f_tx_bulk = rte_port_ring_writer_tx_bulk, + .f_flush = rte_port_ring_writer_flush, + .f_stats = rte_port_ring_writer_stats_read, +}; + +struct rte_port_out_ops rte_port_ring_writer_nodrop_ops = { + .f_create = rte_port_ring_writer_nodrop_create, + .f_free = rte_port_ring_writer_nodrop_free, + .f_tx = rte_port_ring_writer_nodrop_tx, + .f_tx_bulk = rte_port_ring_writer_nodrop_tx_bulk, + .f_flush = rte_port_ring_writer_nodrop_flush, + .f_stats = rte_port_ring_writer_nodrop_stats_read, +}; + +struct rte_port_in_ops rte_port_ring_multi_reader_ops = { + .f_create = rte_port_ring_multi_reader_create, + .f_free = rte_port_ring_reader_free, + .f_rx = rte_port_ring_multi_reader_rx, + .f_stats = rte_port_ring_reader_stats_read, +}; + +struct rte_port_out_ops rte_port_ring_multi_writer_ops = { 
+ .f_create = rte_port_ring_multi_writer_create, + .f_free = rte_port_ring_writer_free, + .f_tx = rte_port_ring_multi_writer_tx, + .f_tx_bulk = rte_port_ring_multi_writer_tx_bulk, + .f_flush = rte_port_ring_multi_writer_flush, + .f_stats = rte_port_ring_writer_stats_read, +}; + +struct rte_port_out_ops rte_port_ring_multi_writer_nodrop_ops = { + .f_create = rte_port_ring_multi_writer_nodrop_create, + .f_free = rte_port_ring_writer_nodrop_free, + .f_tx = rte_port_ring_multi_writer_nodrop_tx, + .f_tx_bulk = rte_port_ring_multi_writer_nodrop_tx_bulk, + .f_flush = rte_port_ring_multi_writer_nodrop_flush, + .f_stats = rte_port_ring_writer_nodrop_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_ring.h b/src/spdk/dpdk/lib/librte_port/rte_port_ring.h new file mode 100644 index 00000000..c4f34e22 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_ring.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_RING_H__ +#define __INCLUDE_RTE_PORT_RING_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port Ring + * + * ring_reader: + * input port built on top of pre-initialized single consumer ring + * ring_writer: + * output port built on top of pre-initialized single producer ring + * ring_multi_reader: + * input port built on top of pre-initialized multi consumers ring + * ring_multi_writer: + * output port built on top of pre-initialized multi producers ring + * + ***/ + +#include + +#include + +#include "rte_port.h" + +/** ring_reader port parameters */ +struct rte_port_ring_reader_params { + /** Underlying consumer ring that has to be pre-initialized */ + struct rte_ring *ring; +}; + +/** ring_reader port operations */ +extern struct rte_port_in_ops rte_port_ring_reader_ops; + +/** ring_writer port parameters */ +struct rte_port_ring_writer_params { + /** Underlying producer ring that has to be pre-initialized */ + struct rte_ring *ring; + + /** Recommended burst size to ring. The actual burst size can be + bigger or smaller than this value. */ + uint32_t tx_burst_sz; +}; + +/** ring_writer port operations */ +extern struct rte_port_out_ops rte_port_ring_writer_ops; + +/** ring_writer_nodrop port parameters */ +struct rte_port_ring_writer_nodrop_params { + /** Underlying producer ring that has to be pre-initialized */ + struct rte_ring *ring; + + /** Recommended burst size to ring. The actual burst size can be + bigger or smaller than this value. 
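A hedged sketch of wiring a ring_writer port to a ring_reader port over one ring, as the summary above describes. The ring is assumed to have been created elsewhere with RING_F_SP_ENQ | RING_F_SC_DEQ, so the single-producer/single-consumer variants apply; sizes and names are illustrative:

#include <rte_ring.h>
#include <rte_mbuf.h>
#include "rte_port_ring.h"

/* Sketch: connect a ring_writer port and a ring_reader port through one
 * pre-initialized SP/SC ring (hypothetical setup). */
static void
ring_ports_demo(struct rte_ring *ring, int socket_id)
{
	struct rte_port_ring_writer_params wparams = {
		.ring = ring,
		.tx_burst_sz = 32,
	};
	struct rte_port_ring_reader_params rparams = {
		.ring = ring,
	};
	struct rte_mbuf *burst[32];
	void *wport, *rport;
	int n;

	wport = rte_port_ring_writer_ops.f_create(&wparams, socket_id);
	rport = rte_port_ring_reader_ops.f_create(&rparams, socket_id);
	if (wport == NULL || rport == NULL)
		return;

	/* ... mbufs would be written here via f_tx()/f_tx_bulk() on wport ... */
	rte_port_ring_writer_ops.f_flush(wport);

	n = rte_port_ring_reader_ops.f_rx(rport, burst, 32);
	while (n > 0)				/* release whatever was dequeued */
		rte_pktmbuf_free(burst[--n]);

	rte_port_ring_reader_ops.f_free(rport);
	rte_port_ring_writer_ops.f_free(wport);
}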
*/ + uint32_t tx_burst_sz; + + /** Maximum number of retries, 0 for no limit */ + uint32_t n_retries; +}; + +/** ring_writer_nodrop port operations */ +extern struct rte_port_out_ops rte_port_ring_writer_nodrop_ops; + +/** ring_multi_reader port parameters */ +#define rte_port_ring_multi_reader_params rte_port_ring_reader_params + +/** ring_multi_reader port operations */ +extern struct rte_port_in_ops rte_port_ring_multi_reader_ops; + +/** ring_multi_writer port parameters */ +#define rte_port_ring_multi_writer_params rte_port_ring_writer_params + +/** ring_multi_writer port operations */ +extern struct rte_port_out_ops rte_port_ring_multi_writer_ops; + +/** ring_multi_writer_nodrop port parameters */ +#define rte_port_ring_multi_writer_nodrop_params \ + rte_port_ring_writer_nodrop_params + +/** ring_multi_writer_nodrop port operations */ +extern struct rte_port_out_ops rte_port_ring_multi_writer_nodrop_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_sched.c b/src/spdk/dpdk/lib/librte_port/rte_port_sched.c new file mode 100644 index 00000000..1209fc12 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_sched.c @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include + +#include +#include + +#include "rte_port_sched.h" + +/* + * Reader + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_SCHED_READER_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_SCHED_READER_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_SCHED_READER_PKTS_IN_ADD(port, val) +#define RTE_PORT_SCHED_READER_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_sched_reader { + struct rte_port_in_stats stats; + + struct rte_sched_port *sched; +}; + +static void * +rte_port_sched_reader_create(void *params, int socket_id) +{ + struct rte_port_sched_reader_params *conf = + params; + struct rte_port_sched_reader *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->sched == NULL)) { + RTE_LOG(ERR, PORT, "%s: Invalid params\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->sched = conf->sched; + + return port; +} + +static int +rte_port_sched_reader_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + struct rte_port_sched_reader *p = port; + uint32_t nb_rx; + + nb_rx = rte_sched_port_dequeue(p->sched, pkts, n_pkts); + RTE_PORT_SCHED_READER_PKTS_IN_ADD(p, nb_rx); + + return nb_rx; +} + +static int +rte_port_sched_reader_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + return -EINVAL; + } + + rte_free(port); + + return 0; +} + +static int +rte_port_sched_reader_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_sched_reader *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Writer + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_SCHED_WRITER_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_SCHED_WRITER_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_SCHED_WRITER_STATS_PKTS_IN_ADD(port, val) +#define 
RTE_PORT_SCHED_WRITER_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_sched_writer { + struct rte_port_out_stats stats; + + struct rte_mbuf *tx_buf[2 * RTE_PORT_IN_BURST_SIZE_MAX]; + struct rte_sched_port *sched; + uint32_t tx_burst_sz; + uint32_t tx_buf_count; + uint64_t bsz_mask; +}; + +static void * +rte_port_sched_writer_create(void *params, int socket_id) +{ + struct rte_port_sched_writer_params *conf = + params; + struct rte_port_sched_writer *port; + + /* Check input parameters */ + if ((conf == NULL) || + (conf->sched == NULL) || + (conf->tx_burst_sz == 0) || + (conf->tx_burst_sz > RTE_PORT_IN_BURST_SIZE_MAX) || + (!rte_is_power_of_2(conf->tx_burst_sz))) { + RTE_LOG(ERR, PORT, "%s: Invalid params\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->sched = conf->sched; + port->tx_burst_sz = conf->tx_burst_sz; + port->tx_buf_count = 0; + port->bsz_mask = 1LLU << (conf->tx_burst_sz - 1); + + return port; +} + +static int +rte_port_sched_writer_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_sched_writer *p = (struct rte_port_sched_writer *) port; + + p->tx_buf[p->tx_buf_count++] = pkt; + RTE_PORT_SCHED_WRITER_STATS_PKTS_IN_ADD(p, 1); + if (p->tx_buf_count >= p->tx_burst_sz) { + __rte_unused uint32_t nb_tx; + + nb_tx = rte_sched_port_enqueue(p->sched, p->tx_buf, p->tx_buf_count); + RTE_PORT_SCHED_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + p->tx_buf_count = 0; + } + + return 0; +} + +static int +rte_port_sched_writer_tx_bulk(void *port, + struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_sched_writer *p = (struct rte_port_sched_writer *) port; + uint64_t bsz_mask = p->bsz_mask; + uint32_t tx_buf_count = p->tx_buf_count; + uint64_t expr = (pkts_mask & (pkts_mask + 1)) | + ((pkts_mask & bsz_mask) ^ bsz_mask); + + if (expr == 0) { + __rte_unused uint32_t nb_tx; + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + + if (tx_buf_count) { + nb_tx = rte_sched_port_enqueue(p->sched, p->tx_buf, + tx_buf_count); + RTE_PORT_SCHED_WRITER_STATS_PKTS_DROP_ADD(p, tx_buf_count - nb_tx); + p->tx_buf_count = 0; + } + + nb_tx = rte_sched_port_enqueue(p->sched, pkts, n_pkts); + RTE_PORT_SCHED_WRITER_STATS_PKTS_DROP_ADD(p, n_pkts - nb_tx); + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + p->tx_buf[tx_buf_count++] = pkt; + RTE_PORT_SCHED_WRITER_STATS_PKTS_IN_ADD(p, 1); + pkts_mask &= ~pkt_mask; + } + p->tx_buf_count = tx_buf_count; + + if (tx_buf_count >= p->tx_burst_sz) { + __rte_unused uint32_t nb_tx; + + nb_tx = rte_sched_port_enqueue(p->sched, p->tx_buf, + tx_buf_count); + RTE_PORT_SCHED_WRITER_STATS_PKTS_DROP_ADD(p, tx_buf_count - nb_tx); + p->tx_buf_count = 0; + } + } + + return 0; +} + +static int +rte_port_sched_writer_flush(void *port) +{ + struct rte_port_sched_writer *p = (struct rte_port_sched_writer *) port; + + if (p->tx_buf_count) { + __rte_unused uint32_t nb_tx; + + nb_tx = rte_sched_port_enqueue(p->sched, p->tx_buf, p->tx_buf_count); + RTE_PORT_SCHED_WRITER_STATS_PKTS_DROP_ADD(p, p->tx_buf_count - nb_tx); + p->tx_buf_count = 0; + } + + return 0; +} + +static int +rte_port_sched_writer_free(void *port) +{ + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: port is NULL\n", __func__); + 
return -EINVAL; + } + + rte_port_sched_writer_flush(port); + rte_free(port); + + return 0; +} + +static int +rte_port_sched_writer_stats_read(void *port, + struct rte_port_out_stats *stats, int clear) +{ + struct rte_port_sched_writer *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_sched_reader_ops = { + .f_create = rte_port_sched_reader_create, + .f_free = rte_port_sched_reader_free, + .f_rx = rte_port_sched_reader_rx, + .f_stats = rte_port_sched_reader_stats_read, +}; + +struct rte_port_out_ops rte_port_sched_writer_ops = { + .f_create = rte_port_sched_writer_create, + .f_free = rte_port_sched_writer_free, + .f_tx = rte_port_sched_writer_tx, + .f_tx_bulk = rte_port_sched_writer_tx_bulk, + .f_flush = rte_port_sched_writer_flush, + .f_stats = rte_port_sched_writer_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_sched.h b/src/spdk/dpdk/lib/librte_port/rte_port_sched.h new file mode 100644 index 00000000..95345146 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_sched.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_SCHED_H__ +#define __INCLUDE_RTE_PORT_SCHED_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port Hierarchical Scheduler + * + * sched_reader: input port built on top of pre-initialized rte_sched_port + * sched_writer: output port built on top of pre-initialized rte_sched_port + * + ***/ + +#include + +#include + +#include "rte_port.h" + +/** sched_reader port parameters */ +struct rte_port_sched_reader_params { + /** Underlying pre-initialized rte_sched_port */ + struct rte_sched_port *sched; +}; + +/** sched_reader port operations */ +extern struct rte_port_in_ops rte_port_sched_reader_ops; + +/** sched_writer port parameters */ +struct rte_port_sched_writer_params { + /** Underlying pre-initialized rte_sched_port */ + struct rte_sched_port *sched; + + /** Recommended burst size. The actual burst size can be bigger or + smaller than this value. 
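A usage sketch for the sched_reader/sched_writer pair summarized above. It assumes sched is an rte_sched_port configured elsewhere (for example via rte_sched_port_config()) and that the mbuf already carries valid scheduler metadata; illustrative only:

#include <rte_sched.h>
#include <rte_mbuf.h>
#include "rte_port_sched.h"

/* Sketch: wrap an existing, fully configured rte_sched_port in the
 * sched_writer/sched_reader port pair (hypothetical setup). */
static void
sched_ports_demo(struct rte_sched_port *sched, struct rte_mbuf *pkt,
	int socket_id)
{
	struct rte_port_sched_writer_params wparams = {
		.sched = sched,
		.tx_burst_sz = 32,	/* power of 2 required */
	};
	struct rte_port_sched_reader_params rparams = {
		.sched = sched,
	};
	struct rte_mbuf *out[32];
	void *wport, *rport;
	int n;

	wport = rte_port_sched_writer_ops.f_create(&wparams, socket_id);
	rport = rte_port_sched_reader_ops.f_create(&rparams, socket_id);
	if (wport == NULL || rport == NULL)
		return;

	rte_port_sched_writer_ops.f_tx(wport, pkt);	/* enqueue side */
	rte_port_sched_writer_ops.f_flush(wport);

	n = rte_port_sched_reader_ops.f_rx(rport, out, 32);	/* dequeue side */
	while (n > 0)
		rte_pktmbuf_free(out[--n]);

	rte_port_sched_reader_ops.f_free(rport);
	rte_port_sched_writer_ops.f_free(wport);
}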
*/ + uint32_t tx_burst_sz; +}; + +/** sched_writer port operations */ +extern struct rte_port_out_ops rte_port_sched_writer_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_source_sink.c b/src/spdk/dpdk/lib/librte_port/rte_port_source_sink.c new file mode 100644 index 00000000..54045f95 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_source_sink.c @@ -0,0 +1,619 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2016 Intel Corporation + */ +#include +#include + +#include +#include +#include +#include + +#ifdef RTE_PORT_PCAP +#include +#include +#endif + +#include "rte_port_source_sink.h" + +/* + * Port SOURCE + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_SOURCE_STATS_PKTS_IN_ADD(port, val) \ + port->stats.n_pkts_in += val +#define RTE_PORT_SOURCE_STATS_PKTS_DROP_ADD(port, val) \ + port->stats.n_pkts_drop += val + +#else + +#define RTE_PORT_SOURCE_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_SOURCE_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_source { + struct rte_port_in_stats stats; + + struct rte_mempool *mempool; + + /* PCAP buffers and indices */ + uint8_t **pkts; + uint8_t *pkt_buff; + uint32_t *pkt_len; + uint32_t n_pkts; + uint32_t pkt_index; +}; + +#ifdef RTE_PORT_PCAP + +static int +pcap_source_load(struct rte_port_source *port, + const char *file_name, + uint32_t n_bytes_per_pkt, + int socket_id) +{ + uint32_t n_pkts = 0; + uint32_t i; + uint32_t *pkt_len_aligns = NULL; + size_t total_buff_len = 0; + pcap_t *pcap_handle; + char pcap_errbuf[PCAP_ERRBUF_SIZE]; + uint32_t max_len; + struct pcap_pkthdr pcap_hdr; + const uint8_t *pkt; + uint8_t *buff = NULL; + uint32_t pktmbuf_maxlen = (uint32_t) + (rte_pktmbuf_data_room_size(port->mempool) - + RTE_PKTMBUF_HEADROOM); + + if (n_bytes_per_pkt == 0) + max_len = pktmbuf_maxlen; + else + max_len = RTE_MIN(n_bytes_per_pkt, pktmbuf_maxlen); + + /* first time open, get packet number */ + pcap_handle = pcap_open_offline(file_name, pcap_errbuf); + if (pcap_handle == NULL) { + RTE_LOG(ERR, PORT, "Failed to open pcap file " + "'%s' for reading\n", file_name); + goto error_exit; + } + + while ((pkt = pcap_next(pcap_handle, &pcap_hdr)) != NULL) + n_pkts++; + + pcap_close(pcap_handle); + + port->pkt_len = rte_zmalloc_socket("PCAP", + (sizeof(*port->pkt_len) * n_pkts), 0, socket_id); + if (port->pkt_len == NULL) { + RTE_LOG(ERR, PORT, "No enough memory\n"); + goto error_exit; + } + + pkt_len_aligns = rte_malloc("PCAP", + (sizeof(*pkt_len_aligns) * n_pkts), 0); + if (pkt_len_aligns == NULL) { + RTE_LOG(ERR, PORT, "No enough memory\n"); + goto error_exit; + } + + port->pkts = rte_zmalloc_socket("PCAP", + (sizeof(*port->pkts) * n_pkts), 0, socket_id); + if (port->pkts == NULL) { + RTE_LOG(ERR, PORT, "No enough memory\n"); + goto error_exit; + } + + /* open 2nd time, get pkt_len */ + pcap_handle = pcap_open_offline(file_name, pcap_errbuf); + if (pcap_handle == NULL) { + RTE_LOG(ERR, PORT, "Failed to open pcap file " + "'%s' for reading\n", file_name); + goto error_exit; + } + + for (i = 0; i < n_pkts; i++) { + pkt = pcap_next(pcap_handle, &pcap_hdr); + port->pkt_len[i] = RTE_MIN(max_len, pcap_hdr.len); + pkt_len_aligns[i] = RTE_CACHE_LINE_ROUNDUP( + port->pkt_len[i]); + total_buff_len += pkt_len_aligns[i]; + } + + pcap_close(pcap_handle); + + /* allocate a big trunk of data for pcap file load */ + buff = rte_zmalloc_socket("PCAP", + total_buff_len, 0, socket_id); + if (buff == NULL) { + RTE_LOG(ERR, PORT, "No enough memory\n"); + goto error_exit; + } 
+ + port->pkt_buff = buff; + + /* open file one last time to copy the pkt content */ + pcap_handle = pcap_open_offline(file_name, pcap_errbuf); + if (pcap_handle == NULL) { + RTE_LOG(ERR, PORT, "Failed to open pcap file " + "'%s' for reading\n", file_name); + goto error_exit; + } + + for (i = 0; i < n_pkts; i++) { + pkt = pcap_next(pcap_handle, &pcap_hdr); + rte_memcpy(buff, pkt, port->pkt_len[i]); + port->pkts[i] = buff; + buff += pkt_len_aligns[i]; + } + + pcap_close(pcap_handle); + + port->n_pkts = n_pkts; + + rte_free(pkt_len_aligns); + + RTE_LOG(INFO, PORT, "Successfully load pcap file " + "'%s' with %u pkts\n", + file_name, port->n_pkts); + + return 0; + +error_exit: + if (pkt_len_aligns) + rte_free(pkt_len_aligns); + if (port->pkt_len) + rte_free(port->pkt_len); + if (port->pkts) + rte_free(port->pkts); + if (port->pkt_buff) + rte_free(port->pkt_buff); + + return -1; +} + +#define PCAP_SOURCE_LOAD(port, file_name, n_bytes, socket_id) \ + pcap_source_load(port, file_name, n_bytes, socket_id) + +#else /* RTE_PORT_PCAP */ + +#define PCAP_SOURCE_LOAD(port, file_name, n_bytes, socket_id) \ +({ \ + int _ret = 0; \ + \ + if (file_name) { \ + RTE_LOG(ERR, PORT, "Source port field " \ + "\"file_name\" is not NULL.\n"); \ + _ret = -1; \ + } \ + \ + _ret; \ +}) + +#endif /* RTE_PORT_PCAP */ + +static void * +rte_port_source_create(void *params, int socket_id) +{ + struct rte_port_source_params *p = + params; + struct rte_port_source *port; + + /* Check input arguments*/ + if ((p == NULL) || (p->mempool == NULL)) { + RTE_LOG(ERR, PORT, "%s: Invalid params\n", __func__); + return NULL; + } + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + /* Initialization */ + port->mempool = (struct rte_mempool *) p->mempool; + + if (p->file_name) { + int status = PCAP_SOURCE_LOAD(port, p->file_name, + p->n_bytes_per_pkt, socket_id); + + if (status < 0) { + rte_free(port); + port = NULL; + } + } + + return port; +} + +static int +rte_port_source_free(void *port) +{ + struct rte_port_source *p = + port; + + /* Check input parameters */ + if (p == NULL) + return 0; + + if (p->pkt_len) + rte_free(p->pkt_len); + if (p->pkts) + rte_free(p->pkts); + if (p->pkt_buff) + rte_free(p->pkt_buff); + + rte_free(p); + + return 0; +} + +static int +rte_port_source_rx(void *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + struct rte_port_source *p = port; + uint32_t i; + + if (rte_pktmbuf_alloc_bulk(p->mempool, pkts, n_pkts) != 0) + return 0; + + if (p->pkt_buff != NULL) { + for (i = 0; i < n_pkts; i++) { + uint8_t *pkt_data = rte_pktmbuf_mtod(pkts[i], + uint8_t *); + + rte_memcpy(pkt_data, p->pkts[p->pkt_index], + p->pkt_len[p->pkt_index]); + pkts[i]->data_len = p->pkt_len[p->pkt_index]; + pkts[i]->pkt_len = pkts[i]->data_len; + + p->pkt_index++; + if (p->pkt_index >= p->n_pkts) + p->pkt_index = 0; + } + } + + RTE_PORT_SOURCE_STATS_PKTS_IN_ADD(p, n_pkts); + + return n_pkts; +} + +static int +rte_port_source_stats_read(void *port, + struct rte_port_in_stats *stats, int clear) +{ + struct rte_port_source *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Port SINK + */ +#ifdef RTE_PORT_STATS_COLLECT + +#define RTE_PORT_SINK_STATS_PKTS_IN_ADD(port, val) \ + (port->stats.n_pkts_in += val) +#define RTE_PORT_SINK_STATS_PKTS_DROP_ADD(port, val) \ 
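A small sketch of the source port above in its simplest mode, with no pcap file, so f_rx just allocates empty mbufs from the pool. The pool is assumed to be a pre-created pktmbuf mempool; names are illustrative:

#include <rte_mempool.h>
#include <rte_mbuf.h>
#include "rte_port_source_sink.h"

/* Sketch: a source port that synthesizes mbufs from a pool. Passing a
 * pcap file name would replay its packets instead (when RTE_PORT_PCAP
 * is enabled); here file_name is left NULL. */
static void
source_port_demo(struct rte_mempool *pool, int socket_id)
{
	struct rte_port_source_params params = {
		.mempool = pool,
		.file_name = NULL,
		.n_bytes_per_pkt = 0,
	};
	struct rte_mbuf *burst[32];
	void *port;
	int n;

	port = rte_port_source_ops.f_create(&params, socket_id);
	if (port == NULL)
		return;

	n = rte_port_source_ops.f_rx(port, burst, 32);	/* allocates up to 32 mbufs */
	while (n > 0)
		rte_pktmbuf_free(burst[--n]);

	rte_port_source_ops.f_free(port);
}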
+ (port->stats.n_pkts_drop += val) + +#else + +#define RTE_PORT_SINK_STATS_PKTS_IN_ADD(port, val) +#define RTE_PORT_SINK_STATS_PKTS_DROP_ADD(port, val) + +#endif + +struct rte_port_sink { + struct rte_port_out_stats stats; + + /* PCAP dumper handle and pkts number */ + void *dumper; + uint32_t max_pkts; + uint32_t pkt_index; + uint32_t dump_finish; +}; + +#ifdef RTE_PORT_PCAP + +static int +pcap_sink_open(struct rte_port_sink *port, + const char *file_name, + uint32_t max_n_pkts) +{ + pcap_t *tx_pcap; + pcap_dumper_t *pcap_dumper; + + /** Open a dead pcap handler for opening dumper file */ + tx_pcap = pcap_open_dead(DLT_EN10MB, 65535); + if (tx_pcap == NULL) { + RTE_LOG(ERR, PORT, "Cannot open pcap dead handler\n"); + return -1; + } + + /* The dumper is created using the previous pcap_t reference */ + pcap_dumper = pcap_dump_open(tx_pcap, file_name); + if (pcap_dumper == NULL) { + RTE_LOG(ERR, PORT, "Failed to open pcap file " + "\"%s\" for writing\n", file_name); + return -1; + } + + port->dumper = pcap_dumper; + port->max_pkts = max_n_pkts; + port->pkt_index = 0; + port->dump_finish = 0; + + RTE_LOG(INFO, PORT, "Ready to dump packets to file \"%s\"\n", + file_name); + + return 0; +} + +static void +pcap_sink_write_pkt(struct rte_port_sink *port, struct rte_mbuf *mbuf) +{ + uint8_t *pcap_dumper = (port->dumper); + struct pcap_pkthdr pcap_hdr; + uint8_t jumbo_pkt_buf[ETHER_MAX_JUMBO_FRAME_LEN]; + uint8_t *pkt; + + /* Maximum num packets already reached */ + if (port->dump_finish) + return; + + pkt = rte_pktmbuf_mtod(mbuf, uint8_t *); + + pcap_hdr.len = mbuf->pkt_len; + pcap_hdr.caplen = pcap_hdr.len; + gettimeofday(&(pcap_hdr.ts), NULL); + + if (mbuf->nb_segs > 1) { + struct rte_mbuf *jumbo_mbuf; + uint32_t pkt_index = 0; + + /* if packet size longer than ETHER_MAX_JUMBO_FRAME_LEN, + * ignore it. 
+ */ + if (mbuf->pkt_len > ETHER_MAX_JUMBO_FRAME_LEN) + return; + + for (jumbo_mbuf = mbuf; jumbo_mbuf != NULL; + jumbo_mbuf = jumbo_mbuf->next) { + rte_memcpy(&jumbo_pkt_buf[pkt_index], + rte_pktmbuf_mtod(jumbo_mbuf, uint8_t *), + jumbo_mbuf->data_len); + pkt_index += jumbo_mbuf->data_len; + } + + jumbo_pkt_buf[pkt_index] = '\0'; + + pkt = jumbo_pkt_buf; + } + + pcap_dump(pcap_dumper, &pcap_hdr, pkt); + + port->pkt_index++; + + if ((port->max_pkts != 0) && (port->pkt_index >= port->max_pkts)) { + port->dump_finish = 1; + RTE_LOG(INFO, PORT, "Dumped %u packets to file\n", + port->pkt_index); + } + +} + +#define PCAP_SINK_OPEN(port, file_name, max_n_pkts) \ + pcap_sink_open(port, file_name, max_n_pkts) + +#define PCAP_SINK_WRITE_PKT(port, mbuf) \ + pcap_sink_write_pkt(port, mbuf) + +#define PCAP_SINK_FLUSH_PKT(dumper) \ +do { \ + if (dumper) \ + pcap_dump_flush((pcap_dumper_t *)dumper); \ +} while (0) + +#define PCAP_SINK_CLOSE(dumper) \ +do { \ + if (dumper) \ + pcap_dump_close((pcap_dumper_t *)dumper); \ +} while (0) + +#else + +#define PCAP_SINK_OPEN(port, file_name, max_n_pkts) \ +({ \ + int _ret = 0; \ + \ + if (file_name) { \ + RTE_LOG(ERR, PORT, "Sink port field " \ + "\"file_name\" is not NULL.\n"); \ + _ret = -1; \ + } \ + \ + _ret; \ +}) + +#define PCAP_SINK_WRITE_PKT(port, mbuf) {} + +#define PCAP_SINK_FLUSH_PKT(dumper) + +#define PCAP_SINK_CLOSE(dumper) + +#endif + +static void * +rte_port_sink_create(void *params, int socket_id) +{ + struct rte_port_sink *port; + struct rte_port_sink_params *p = params; + + /* Memory allocation */ + port = rte_zmalloc_socket("PORT", sizeof(*port), + RTE_CACHE_LINE_SIZE, socket_id); + if (port == NULL) { + RTE_LOG(ERR, PORT, "%s: Failed to allocate port\n", __func__); + return NULL; + } + + if (!p) + return port; + + if (p->file_name) { + int status = PCAP_SINK_OPEN(port, p->file_name, + p->max_n_pkts); + + if (status < 0) { + rte_free(port); + port = NULL; + } + } + + return port; +} + +static int +rte_port_sink_tx(void *port, struct rte_mbuf *pkt) +{ + struct rte_port_sink *p = port; + + RTE_PORT_SINK_STATS_PKTS_IN_ADD(p, 1); + if (p->dumper != NULL) + PCAP_SINK_WRITE_PKT(p, pkt); + rte_pktmbuf_free(pkt); + RTE_PORT_SINK_STATS_PKTS_DROP_ADD(p, 1); + + return 0; +} + +static int +rte_port_sink_tx_bulk(void *port, struct rte_mbuf **pkts, + uint64_t pkts_mask) +{ + struct rte_port_sink *p = port; + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + RTE_PORT_SINK_STATS_PKTS_IN_ADD(p, n_pkts); + RTE_PORT_SINK_STATS_PKTS_DROP_ADD(p, n_pkts); + + if (p->dumper) { + for (i = 0; i < n_pkts; i++) + PCAP_SINK_WRITE_PKT(p, pkts[i]); + } + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = pkts[i]; + + rte_pktmbuf_free(pkt); + } + + } else { + if (p->dumper) { + uint64_t dump_pkts_mask = pkts_mask; + uint32_t pkt_index; + + for ( ; dump_pkts_mask; ) { + pkt_index = __builtin_ctzll( + dump_pkts_mask); + PCAP_SINK_WRITE_PKT(p, pkts[pkt_index]); + dump_pkts_mask &= ~(1LLU << pkt_index); + } + } + + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + + RTE_PORT_SINK_STATS_PKTS_IN_ADD(p, 1); + RTE_PORT_SINK_STATS_PKTS_DROP_ADD(p, 1); + rte_pktmbuf_free(pkt); + pkts_mask &= ~pkt_mask; + } + } + + return 0; +} + +static int +rte_port_sink_flush(void *port) +{ + struct rte_port_sink *p = + port; + + if (p == NULL) + return 0; + + PCAP_SINK_FLUSH_PKT(p->dumper); + + return 0; +} + +static 
int +rte_port_sink_free(void *port) +{ + struct rte_port_sink *p = + port; + + if (p == NULL) + return 0; + + PCAP_SINK_CLOSE(p->dumper); + + rte_free(p); + + return 0; +} + +static int +rte_port_sink_stats_read(void *port, struct rte_port_out_stats *stats, + int clear) +{ + struct rte_port_sink *p = + port; + + if (stats != NULL) + memcpy(stats, &p->stats, sizeof(p->stats)); + + if (clear) + memset(&p->stats, 0, sizeof(p->stats)); + + return 0; +} + +/* + * Summary of port operations + */ +struct rte_port_in_ops rte_port_source_ops = { + .f_create = rte_port_source_create, + .f_free = rte_port_source_free, + .f_rx = rte_port_source_rx, + .f_stats = rte_port_source_stats_read, +}; + +struct rte_port_out_ops rte_port_sink_ops = { + .f_create = rte_port_sink_create, + .f_free = rte_port_sink_free, + .f_tx = rte_port_sink_tx, + .f_tx_bulk = rte_port_sink_tx_bulk, + .f_flush = rte_port_sink_flush, + .f_stats = rte_port_sink_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_source_sink.h b/src/spdk/dpdk/lib/librte_port/rte_port_source_sink.h new file mode 100644 index 00000000..16b8318e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_source_sink.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2016 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_PORT_SOURCE_SINK_H__ +#define __INCLUDE_RTE_PORT_SOURCE_SINK_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Port Source/Sink + * + * source: input port that can be used to generate packets + * sink: output port that drops all packets written to it + * + ***/ + +#include "rte_port.h" + +/** source port parameters */ +struct rte_port_source_params { + /** Pre-initialized buffer pool */ + struct rte_mempool *mempool; + + /** The full path of the pcap file to read packets from */ + const char *file_name; + /** The number of bytes to be read from each packet in the + * pcap file. If this value is 0, the whole packet is read; + * if it is bigger than packet size, the generated packets + * will contain the whole packet */ + uint32_t n_bytes_per_pkt; +}; + +/** source port operations */ +extern struct rte_port_in_ops rte_port_source_ops; + +/** sink port parameters */ +struct rte_port_sink_params { + /** The full path of the pcap file to write the packets to */ + const char *file_name; + /** The maximum number of packets write to the pcap file. + * If this value is 0, the "infinite" write will be carried + * out. 
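+	 * For example (illustrative values only), file_name = "/tmp/sink.pcap"
+	 * together with max_n_pkts = 1000 dumps the first 1000 packets sent to
+	 * the sink port and then stops writing.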
+ */ + uint32_t max_n_pkts; +}; + +/** sink port operations */ +extern struct rte_port_out_ops rte_port_sink_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_port/rte_port_version.map b/src/spdk/dpdk/lib/librte_port/rte_port_version.map new file mode 100644 index 00000000..6470629b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_port/rte_port_version.map @@ -0,0 +1,53 @@ +DPDK_2.0 { + global: + + rte_port_ethdev_reader_ops; + rte_port_ethdev_writer_ops; + rte_port_ring_reader_ipv4_frag_ops; + rte_port_ring_reader_ops; + rte_port_ring_writer_ipv4_ras_ops; + rte_port_ring_writer_ops; + rte_port_sched_reader_ops; + rte_port_sched_writer_ops; + rte_port_sink_ops; + rte_port_source_ops; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_port_ethdev_writer_nodrop_ops; + rte_port_ring_reader_ipv6_frag_ops; + rte_port_ring_writer_ipv6_ras_ops; + rte_port_ring_writer_nodrop_ops; + +} DPDK_2.0; + +DPDK_2.2 { + global: + + rte_port_ring_multi_reader_ops; + rte_port_ring_multi_writer_ops; + rte_port_ring_multi_writer_nodrop_ops; + +} DPDK_2.1; + +DPDK_16.07 { + global: + + rte_port_kni_reader_ops; + rte_port_kni_writer_ops; + rte_port_kni_writer_nodrop_ops; + +} DPDK_2.2; + +DPDK_16.11 { + global: + + rte_port_fd_reader_ops; + rte_port_fd_writer_ops; + rte_port_fd_writer_nodrop_ops; + +} DPDK_16.07; diff --git a/src/spdk/dpdk/lib/librte_power/Makefile b/src/spdk/dpdk/lib/librte_power/Makefile new file mode 100644 index 00000000..6f85e885 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_power.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_power_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_POWER) := rte_power.c power_acpi_cpufreq.c +SRCS-$(CONFIG_RTE_LIBRTE_POWER) += power_kvm_vm.c guest_channel.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_POWER)-include := rte_power.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_power/channel_commands.h b/src/spdk/dpdk/lib/librte_power/channel_commands.h new file mode 100644 index 00000000..ee638eef --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/channel_commands.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef CHANNEL_COMMANDS_H_ +#define CHANNEL_COMMANDS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* Maximum number of channels per VM */ +#define CHANNEL_CMDS_MAX_VM_CHANNELS 64 + +/* Valid Commands */ +#define CPU_POWER 1 +#define CPU_POWER_CONNECT 2 +#define PKT_POLICY 3 + +/* CPU Power Command Scaling */ +#define CPU_POWER_SCALE_UP 1 +#define CPU_POWER_SCALE_DOWN 2 +#define CPU_POWER_SCALE_MAX 3 +#define CPU_POWER_SCALE_MIN 4 +#define CPU_POWER_ENABLE_TURBO 5 +#define CPU_POWER_DISABLE_TURBO 6 +#define HOURS 24 + +#define MAX_VFS 10 +#define VM_MAX_NAME_SZ 32 + +#define MAX_VCPU_PER_VM 8 + +struct t_boost_status { + bool tbEnabled; +}; + +struct timer_profile { + int busy_hours[HOURS]; + int quiet_hours[HOURS]; + int hours_to_use_traffic_profile[HOURS]; +}; + +enum workload {HIGH, MEDIUM, LOW}; +enum policy_to_use { + TRAFFIC, + TIME, + WORKLOAD, + BRANCH_RATIO +}; + +struct traffic { + uint32_t min_packet_thresh; + uint32_t avg_max_packet_thresh; + uint32_t max_max_packet_thresh; +}; + +struct 
channel_packet { + uint64_t resource_id; /**< core_num, device */ + uint32_t unit; /**< scale down/up/min/max */ + uint32_t command; /**< Power, IO, etc */ + char vm_name[VM_MAX_NAME_SZ]; + + uint64_t vfid[MAX_VFS]; + int nb_mac_to_monitor; + struct traffic traffic_policy; + uint8_t vcpu_to_control[MAX_VCPU_PER_VM]; + uint8_t num_vcpu; + struct timer_profile timer_policy; + enum workload workload; + enum policy_to_use policy_to_use; + struct t_boost_status t_boost_status; +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* CHANNEL_COMMANDS_H_ */ diff --git a/src/spdk/dpdk/lib/librte_power/guest_channel.c b/src/spdk/dpdk/lib/librte_power/guest_channel.c new file mode 100644 index 00000000..c17ea46b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/guest_channel.c @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +#include "guest_channel.h" +#include "channel_commands.h" + +#define RTE_LOGTYPE_GUEST_CHANNEL RTE_LOGTYPE_USER1 + +static int global_fds[RTE_MAX_LCORE]; + +int +guest_channel_host_connect(const char *path, unsigned int lcore_id) +{ + int flags, ret; + struct channel_packet pkt; + char fd_path[PATH_MAX]; + int fd = -1; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return -1; + } + /* check if path is already open */ + if (global_fds[lcore_id] != 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is already open with fd %d\n", + lcore_id, global_fds[lcore_id]); + return -1; + } + + snprintf(fd_path, PATH_MAX, "%s.%u", path, lcore_id); + RTE_LOG(INFO, GUEST_CHANNEL, "Opening channel '%s' for lcore %u\n", + fd_path, lcore_id); + fd = open(fd_path, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Unable to to connect to '%s' with error " + "%s\n", fd_path, strerror(errno)); + return -1; + } + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Failed on fcntl get flags for file %s\n", + fd_path); + goto error; + } + + flags |= O_NONBLOCK; + if (fcntl(fd, F_SETFL, flags) < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Failed on setting non-blocking mode for " + "file %s", fd_path); + goto error; + } + /* QEMU needs a delay after connection */ + sleep(1); + + /* Send a test packet, this command is ignored by the host, but a successful + * send indicates that the host endpoint is monitoring. + */ + pkt.command = CPU_POWER_CONNECT; + global_fds[lcore_id] = fd; + ret = guest_channel_send_msg(&pkt, lcore_id); + if (ret != 0) { + RTE_LOG(ERR, GUEST_CHANNEL, + "Error on channel '%s' communications test: %s\n", + fd_path, ret > 0 ? 
strerror(ret) : + "channel not connected"); + goto error; + } + RTE_LOG(INFO, GUEST_CHANNEL, "Channel '%s' is now connected\n", fd_path); + return 0; +error: + close(fd); + global_fds[lcore_id] = 0; + return -1; +} + +int +guest_channel_send_msg(struct channel_packet *pkt, unsigned int lcore_id) +{ + int ret, buffer_len = sizeof(*pkt); + void *buffer = pkt; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return -1; + } + + if (global_fds[lcore_id] == 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel is not connected\n"); + return -1; + } + while (buffer_len > 0) { + ret = write(global_fds[lcore_id], buffer, buffer_len); + if (ret == buffer_len) + return 0; + if (ret == -1) { + if (errno == EINTR) + continue; + return errno; + } + buffer = (char *)buffer + ret; + buffer_len -= ret; + } + return 0; +} + +int rte_power_guest_channel_send_msg(struct channel_packet *pkt, + unsigned int lcore_id) +{ + return guest_channel_send_msg(pkt, lcore_id); +} + + +void +guest_channel_host_disconnect(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return; + } + if (global_fds[lcore_id] == 0) + return; + close(global_fds[lcore_id]); + global_fds[lcore_id] = 0; +} diff --git a/src/spdk/dpdk/lib/librte_power/guest_channel.h b/src/spdk/dpdk/lib/librte_power/guest_channel.h new file mode 100644 index 00000000..373d3989 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/guest_channel.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#ifndef _GUEST_CHANNEL_H +#define _GUEST_CHANNEL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * Connect to the Virtio-Serial VM end-point located in path. It is + * thread safe for unique lcore_ids. This function must be only called once from + * each lcore. + * + * @param path + * The path to the serial device on the filesystem + * @param lcore_id + * lcore_id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int guest_channel_host_connect(const char *path, unsigned int lcore_id); + +/** + * Disconnect from an already connected Virtio-Serial Endpoint. + * + * + * @param lcore_id + * lcore_id. + * + */ +void guest_channel_host_disconnect(unsigned int lcore_id); + +/** + * Send a message contained in pkt over the Virtio-Serial to the host endpoint. + * + * @param pkt + * Pointer to a populated struct guest_agent_pkt + * + * @param lcore_id + * lcore_id. + * + * @return + * - 0 on success. + * - Negative on channel not connected. + * - errno on write to channel error. + */ +int guest_channel_send_msg(struct channel_packet *pkt, unsigned int lcore_id); + +/** + * Send a message contained in pkt over the Virtio-Serial to the host endpoint. + * + * @param pkt + * Pointer to a populated struct channel_packet + * + * @param lcore_id + * lcore_id. + * + * @return + * - 0 on success. + * - Negative on error. 
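+ *
+ * Illustrative usage sketch (not part of the upstream API docs); it assumes
+ * the channel for this lcore has already been connected, e.g. via
+ * guest_channel_host_connect() or rte_power_init() in a KVM guest:
+ *
+ * @code
+ * struct channel_packet pkt = {
+ *	.command = CPU_POWER,
+ *	.resource_id = rte_lcore_id(),
+ *	.unit = CPU_POWER_SCALE_UP,
+ * };
+ *
+ * if (rte_power_guest_channel_send_msg(&pkt, rte_lcore_id()) != 0)
+ *	RTE_LOG(ERR, USER1, "power channel send failed\n");
+ * @endcode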
+ */ +int rte_power_guest_channel_send_msg(struct channel_packet *pkt, + unsigned int lcore_id); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_power/meson.build b/src/spdk/dpdk/lib/librte_power/meson.build new file mode 100644 index 00000000..253173f2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +if host_machine.system() != 'linux' + build = false +endif +sources = files('rte_power.c', 'power_acpi_cpufreq.c', + 'power_kvm_vm.c', 'guest_channel.c') +headers = files('rte_power.h') diff --git a/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.c b/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.c new file mode 100644 index 00000000..cd5978d5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.c @@ -0,0 +1,646 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "power_acpi_cpufreq.h" +#include "power_common.h" + +#ifdef RTE_LIBRTE_POWER_DEBUG +#define POWER_DEBUG_TRACE(fmt, args...) do { \ + RTE_LOG(ERR, POWER, "%s: " fmt, __func__, ## args); \ +} while (0) +#else +#define POWER_DEBUG_TRACE(fmt, args...) +#endif + +#define FOPEN_OR_ERR_RET(f, retval) do { \ + if ((f) == NULL) { \ + RTE_LOG(ERR, POWER, "File not openned\n"); \ + return retval; \ + } \ +} while (0) + +#define FOPS_OR_NULL_GOTO(ret, label) do { \ + if ((ret) == NULL) { \ + RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \ + goto label; \ + } \ +} while (0) + +#define FOPS_OR_ERR_GOTO(ret, label) do { \ + if ((ret) < 0) { \ + RTE_LOG(ERR, POWER, "File operations failed\n"); \ + goto label; \ + } \ +} while (0) + +#define STR_SIZE 1024 +#define POWER_CONVERT_TO_DECIMAL 10 + +#define POWER_GOVERNOR_USERSPACE "userspace" +#define POWER_SYSFILE_GOVERNOR \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor" +#define POWER_SYSFILE_AVAIL_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_available_frequencies" +#define POWER_SYSFILE_SETSPEED \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_setspeed" + +/* + * MSR related + */ +#define PLATFORM_INFO 0x0CE +#define TURBO_RATIO_LIMIT 0x1AD +#define IA32_PERF_CTL 0x199 +#define CORE_TURBO_DISABLE_BIT ((uint64_t)1<<32) + +enum power_state { + POWER_IDLE = 0, + POWER_ONGOING, + POWER_USED, + POWER_UNKNOWN +}; + +/** + * Power info per lcore. + */ +struct rte_power_info { + unsigned int lcore_id; /**< Logical core id */ + uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */ + uint32_t nb_freqs; /**< number of available freqs */ + FILE *f; /**< FD of scaling_setspeed */ + char governor_ori[32]; /**< Original governor name */ + uint32_t curr_idx; /**< Freq index in freqs array */ + volatile uint32_t state; /**< Power in use state */ + uint16_t turbo_available; /**< Turbo Boost available */ + uint16_t turbo_enable; /**< Turbo Boost enable/disable */ +} __rte_cache_aligned; + +static struct rte_power_info lcore_power_info[RTE_MAX_LCORE]; + +/** + * It is to set specific freq for specific logical core, according to the index + * of supported frequencies. 
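+ *
+ * Note: the new frequency is applied by rewinding the already opened
+ * scaling_setspeed file and writing the raw kHz value (e.g. "2300000",
+ * an illustrative value), so no file is opened on the fast path. The
+ * function returns 1 when the frequency actually changes, 0 when the
+ * requested index is already the current one, and -1 on error.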
+ */ +static int +set_freq_internal(struct rte_power_info *pi, uint32_t idx) +{ + if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) { + RTE_LOG(ERR, POWER, "Invalid frequency index %u, which " + "should be less than %u\n", idx, pi->nb_freqs); + return -1; + } + + /* Check if it is the same as current */ + if (idx == pi->curr_idx) + return 0; + + POWER_DEBUG_TRACE("Freqency[%u] %u to be set for lcore %u\n", + idx, pi->freqs[idx], pi->lcore_id); + if (fseek(pi->f, 0, SEEK_SET) < 0) { + RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 " + "for setting frequency for lcore %u\n", pi->lcore_id); + return -1; + } + if (fprintf(pi->f, "%u", pi->freqs[idx]) < 0) { + RTE_LOG(ERR, POWER, "Fail to write new frequency for " + "lcore %u\n", pi->lcore_id); + return -1; + } + fflush(pi->f); + pi->curr_idx = idx; + + return 1; +} + +/** + * It is to check the current scaling governor by reading sys file, and then + * set it into 'userspace' if it is not by writing the sys file. The original + * governor will be saved for rolling back. + */ +static int +power_set_governor_userspace(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *s; + int val; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Check if current governor is userspace */ + if (strncmp(buf, POWER_GOVERNOR_USERSPACE, + sizeof(POWER_GOVERNOR_USERSPACE)) == 0) { + ret = 0; + POWER_DEBUG_TRACE("Power management governor of lcore %u is " + "already userspace\n", pi->lcore_id); + goto out; + } + /* Save the original governor */ + snprintf(pi->governor_ori, sizeof(pi->governor_ori), "%s", buf); + + /* Write 'userspace' to the governor */ + val = fseek(f, 0, SEEK_SET); + FOPS_OR_ERR_GOTO(val, out); + + val = fputs(POWER_GOVERNOR_USERSPACE, f); + FOPS_OR_ERR_GOTO(val, out); + + ret = 0; + RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been " + "set to user space successfully\n", pi->lcore_id); +out: + fclose(f); + + return ret; +} + +/** + * It is to get the available frequencies of the specific lcore by reading the + * sys file. 
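+ *
+ * The sys file contains one space-separated line of kHz values ordered
+ * from highest to lowest, e.g. "2301000 2300000 2100000 1900000"
+ * (illustrative values). The trailing newline is stripped, the line is
+ * split into the freqs array, and if the first entry is exactly 1000 kHz
+ * above the second it is treated as the Turbo Boost frequency.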
+ */ +static int +power_get_available_freqs(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1, i, count; + char *p; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *freqs[RTE_MAX_LCORE_FREQS]; + char *s; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_AVAIL_FREQ, + pi->lcore_id); + f = fopen(fullpath, "r"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Strip the line break if there is */ + p = strchr(buf, '\n'); + if (p != NULL) + *p = 0; + + /* Split string into at most RTE_MAX_LCORE_FREQS frequencies */ + count = rte_strsplit(buf, sizeof(buf), freqs, + RTE_MAX_LCORE_FREQS, ' '); + if (count <= 0) { + RTE_LOG(ERR, POWER, "No available frequency in " + ""POWER_SYSFILE_AVAIL_FREQ"\n", pi->lcore_id); + goto out; + } + if (count >= RTE_MAX_LCORE_FREQS) { + RTE_LOG(ERR, POWER, "Too many available frequencies : %d\n", + count); + goto out; + } + + /* Store the available frequncies into power context */ + for (i = 0, pi->nb_freqs = 0; i < count; i++) { + POWER_DEBUG_TRACE("Lcore %u frequency[%d]: %s\n", pi->lcore_id, + i, freqs[i]); + pi->freqs[pi->nb_freqs++] = strtoul(freqs[i], &p, + POWER_CONVERT_TO_DECIMAL); + } + + if ((pi->freqs[0]-1000) == pi->freqs[1]) { + pi->turbo_available = 1; + pi->turbo_enable = 1; + POWER_DEBUG_TRACE("Lcore %u Can do Turbo Boost\n", + pi->lcore_id); + } else { + pi->turbo_available = 0; + pi->turbo_enable = 0; + POWER_DEBUG_TRACE("Turbo Boost not available on Lcore %u\n", + pi->lcore_id); + } + + ret = 0; + POWER_DEBUG_TRACE("%d frequency(s) of lcore %u are available\n", + count, pi->lcore_id); +out: + fclose(f); + + return ret; +} + +/** + * It is to fopen the sys file for the future setting the lcore frequency. + */ +static int +power_init_for_setting_freq(struct rte_power_info *pi) +{ + FILE *f; + char fullpath[PATH_MAX]; + char buf[BUFSIZ]; + uint32_t i, freq; + char *s; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_SETSPEED, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, -1); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + freq = strtoul(buf, NULL, POWER_CONVERT_TO_DECIMAL); + for (i = 0; i < pi->nb_freqs; i++) { + if (freq == pi->freqs[i]) { + pi->curr_idx = i; + pi->f = f; + return 0; + } + } + +out: + fclose(f); + + return -1; +} + +int +power_acpi_cpufreq_init(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n", + lcore_id, RTE_MAX_LCORE - 1U); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (rte_atomic32_cmpset(&(pi->state), POWER_IDLE, POWER_ONGOING) + == 0) { + RTE_LOG(INFO, POWER, "Power management of lcore %u is " + "in use\n", lcore_id); + return -1; + } + + pi->lcore_id = lcore_id; + /* Check and set the governor */ + if (power_set_governor_userspace(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to " + "userspace\n", lcore_id); + goto fail; + } + + /* Get the available frequencies */ + if (power_get_available_freqs(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot get available frequencies of " + "lcore %u\n", lcore_id); + goto fail; + } + + /* Init for setting lcore frequency */ + if (power_init_for_setting_freq(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot init for setting frequency for " + "lcore %u\n", lcore_id); + goto fail; + } + + /* Set freq to max by default */ + if (power_acpi_cpufreq_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u " + "to max\n", lcore_id); + goto 
fail; + } + + RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u " + "power management\n", lcore_id); + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_USED); + + return 0; + +fail: + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN); + + return -1; +} + +/** + * It is to check the governor and then set the original governor back if + * needed by writing the sys file. + */ +static int +power_set_governor_original(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *s; + int val; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Check if the governor to be set is the same as current */ + if (strncmp(buf, pi->governor_ori, sizeof(pi->governor_ori)) == 0) { + ret = 0; + POWER_DEBUG_TRACE("Power management governor of lcore %u " + "has already been set to %s\n", + pi->lcore_id, pi->governor_ori); + goto out; + } + + /* Write back the original governor */ + val = fseek(f, 0, SEEK_SET); + FOPS_OR_ERR_GOTO(val, out); + + val = fputs(pi->governor_ori, f); + FOPS_OR_ERR_GOTO(val, out); + + ret = 0; + RTE_LOG(INFO, POWER, "Power management governor of lcore %u " + "has been set back to %s successfully\n", + pi->lcore_id, pi->governor_ori); +out: + fclose(f); + + return ret; +} + +int +power_acpi_cpufreq_exit(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n", + lcore_id, RTE_MAX_LCORE - 1U); + return -1; + } + pi = &lcore_power_info[lcore_id]; + if (rte_atomic32_cmpset(&(pi->state), POWER_USED, POWER_ONGOING) + == 0) { + RTE_LOG(INFO, POWER, "Power management of lcore %u is " + "not used\n", lcore_id); + return -1; + } + + /* Close FD of setting freq */ + fclose(pi->f); + pi->f = NULL; + + /* Set the governor back to the original */ + if (power_set_governor_original(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot set the governor of %u back " + "to the original\n", lcore_id); + goto fail; + } + + RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from " + "'userspace' mode and been set back to the " + "original\n", lcore_id); + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_IDLE); + + return 0; + +fail: + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN); + + return -1; +} + +uint32_t +power_acpi_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE || !freqs) { + RTE_LOG(ERR, POWER, "Invalid input parameter\n"); + return 0; + } + + pi = &lcore_power_info[lcore_id]; + if (num < pi->nb_freqs) { + RTE_LOG(ERR, POWER, "Buffer size is not enough\n"); + return 0; + } + rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t)); + + return pi->nb_freqs; +} + +uint32_t +power_acpi_cpufreq_get_freq(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return RTE_POWER_INVALID_FREQ_INDEX; + } + + return lcore_power_info[lcore_id].curr_idx; +} + +int +power_acpi_cpufreq_set_freq(unsigned int lcore_id, uint32_t index) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + return set_freq_internal(&(lcore_power_info[lcore_id]), index); +} + +int +power_acpi_cpufreq_freq_down(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= 
RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (pi->curr_idx + 1 == pi->nb_freqs) + return 0; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->curr_idx + 1); +} + +int +power_acpi_cpufreq_freq_up(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (pi->curr_idx == 0) + return 0; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->curr_idx - 1); +} + +int +power_acpi_cpufreq_freq_max(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + /* Frequencies in the array are from high to low. */ + if (lcore_power_info[lcore_id].turbo_available) { + if (lcore_power_info[lcore_id].turbo_enable) + /* Set to Turbo */ + return set_freq_internal( + &lcore_power_info[lcore_id], 0); + else + /* Set to max non-turbo */ + return set_freq_internal( + &lcore_power_info[lcore_id], 1); + } else + return set_freq_internal(&lcore_power_info[lcore_id], 0); +} + +int +power_acpi_cpufreq_freq_min(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->nb_freqs - 1); +} + + +int +power_acpi_turbo_status(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + return pi->turbo_enable; +} + + +int +power_acpi_enable_turbo(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + if (pi->turbo_available) + pi->turbo_enable = 1; + else { + pi->turbo_enable = 0; + RTE_LOG(ERR, POWER, + "Failed to enable turbo on lcore %u\n", + lcore_id); + return -1; + } + + /* Max may have changed, so call to max function */ + if (power_acpi_cpufreq_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, + "Failed to set frequency of lcore %u to max\n", + lcore_id); + return -1; + } + + return 0; +} + +int +power_acpi_disable_turbo(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + pi->turbo_enable = 0; + + if ((pi->turbo_available) && (pi->curr_idx <= 1)) { + /* Try to set freq to max by default coming out of turbo */ + if (power_acpi_cpufreq_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, + "Failed to set frequency of lcore %u to max\n", + lcore_id); + return -1; + } + } + + return 0; +} + +int power_acpi_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + if (caps == NULL) { + RTE_LOG(ERR, POWER, "Invalid argument\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + caps->capabilities = 0; + caps->turbo = !!(pi->turbo_available); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.h b/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.h 
new file mode 100644 index 00000000..1af74160 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _POWER_ACPI_CPUFREQ_H +#define _POWER_ACPI_CPUFREQ_H + +/** + * @file + * RTE Power Management via userspace ACPI cpufreq + */ + +#include +#include +#include +#include +#include "rte_power.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize power management for a specific lcore. It will check and set the + * governor to userspace for the lcore, get the available frequencies, and + * prepare to set new lcore frequency. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_acpi_cpufreq_init(unsigned int lcore_id); + +/** + * Exit power management on a specific lcore. It will set the governor to which + * is before initialized. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_acpi_cpufreq_exit(unsigned int lcore_id); + +/** + * Get the available frequencies of a specific lcore. The return value will be + * the minimal one of the total number of available frequencies and the number + * of buffer. The index of available frequencies used in other interfaces + * should be in the range of 0 to this return value. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * @param freqs + * The buffer array to save the frequencies. + * @param num + * The number of frequencies to get. + * + * @return + * The number of available frequencies. + */ +uint32_t power_acpi_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, + uint32_t num); + +/** + * Return the current index of available frequencies of a specific lcore. It + * will return 'RTE_POWER_INVALID_FREQ_INDEX = (~0)' if error. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * The current index of available frequencies. + */ +uint32_t power_acpi_cpufreq_get_freq(unsigned int lcore_id); + +/** + * Set the new frequency for a specific lcore by indicating the index of + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * @param index + * The index of available frequencies. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_set_freq(unsigned int lcore_id, uint32_t index); + +/** + * Scale up the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_freq_up(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_freq_down(unsigned int lcore_id); + +/** + * Scale up the frequency of a specific lcore to the highest according to the + * available frequencies. 
+ * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_freq_max(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore to the lowest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_freq_min(unsigned int lcore_id); + +/** + * Get the turbo status of a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 Turbo Boost is enabled on this lcore. + * - 0 Turbo Boost is disabled on this lcore. + * - Negative on error. + */ +int power_acpi_turbo_status(unsigned int lcore_id); + +/** + * Enable Turbo Boost on a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 Turbo Boost is enabled successfully on this lcore. + * - Negative on error. + */ +int power_acpi_enable_turbo(unsigned int lcore_id); + +/** + * Disable Turbo Boost on a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 Turbo Boost disabled successfully on this lcore. + * - Negative on error. + */ +int power_acpi_disable_turbo(unsigned int lcore_id); + +/** + * Returns power capabilities for a specific lcore. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. 
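+ *
+ * Minimal usage sketch (illustrative only; assumes the lcore has been set
+ * up with power_acpi_cpufreq_init()):
+ *
+ * @code
+ * struct rte_power_core_capabilities caps;
+ *
+ * if (power_acpi_get_capabilities(rte_lcore_id(), &caps) == 0 && caps.turbo)
+ *	power_acpi_enable_turbo(rte_lcore_id());
+ * @endcode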
+ */ +int power_acpi_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_power/power_common.h b/src/spdk/dpdk/lib/librte_power/power_common.h new file mode 100644 index 00000000..feeb5777 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_common.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _POWER_COMMON_H_ +#define _POWER_COMMON_H_ + +#define RTE_POWER_INVALID_FREQ_INDEX (~0) + +#endif /* _POWER_COMMON_H_ */ diff --git a/src/spdk/dpdk/lib/librte_power/power_kvm_vm.c b/src/spdk/dpdk/lib/librte_power/power_kvm_vm.c new file mode 100644 index 00000000..20659b72 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_kvm_vm.c @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include + +#include + +#include "guest_channel.h" +#include "channel_commands.h" +#include "power_kvm_vm.h" +#include "power_common.h" + +#define FD_PATH "/dev/virtio-ports/virtio.serial.port.poweragent" + +static struct channel_packet pkt[CHANNEL_CMDS_MAX_VM_CHANNELS]; + + +int +power_kvm_vm_init(unsigned int lcore_id) +{ + if (lcore_id >= CHANNEL_CMDS_MAX_VM_CHANNELS) { + RTE_LOG(ERR, POWER, "Core(%u) is out of range 0...%d\n", + lcore_id, CHANNEL_CMDS_MAX_VM_CHANNELS-1); + return -1; + } + pkt[lcore_id].command = CPU_POWER; + pkt[lcore_id].resource_id = lcore_id; + return guest_channel_host_connect(FD_PATH, lcore_id); +} + +int +power_kvm_vm_exit(unsigned int lcore_id) +{ + guest_channel_host_disconnect(lcore_id); + return 0; +} + +uint32_t +power_kvm_vm_freqs(__attribute__((unused)) unsigned int lcore_id, + __attribute__((unused)) uint32_t *freqs, + __attribute__((unused)) uint32_t num) +{ + RTE_LOG(ERR, POWER, "rte_power_freqs is not implemented " + "for Virtual Machine Power Management\n"); + return -ENOTSUP; +} + +uint32_t +power_kvm_vm_get_freq(__attribute__((unused)) unsigned int lcore_id) +{ + RTE_LOG(ERR, POWER, "rte_power_get_freq is not implemented " + "for Virtual Machine Power Management\n"); + return -ENOTSUP; +} + +int +power_kvm_vm_set_freq(__attribute__((unused)) unsigned int lcore_id, + __attribute__((unused)) uint32_t index) +{ + RTE_LOG(ERR, POWER, "rte_power_set_freq is not implemented " + "for Virtual Machine Power Management\n"); + return -ENOTSUP; +} + +static inline int +send_msg(unsigned int lcore_id, uint32_t scale_direction) +{ + int ret; + + if (lcore_id >= CHANNEL_CMDS_MAX_VM_CHANNELS) { + RTE_LOG(ERR, POWER, "Core(%u) is out of range 0...%d\n", + lcore_id, CHANNEL_CMDS_MAX_VM_CHANNELS-1); + return -1; + } + pkt[lcore_id].unit = scale_direction; + ret = guest_channel_send_msg(&pkt[lcore_id], lcore_id); + if (ret == 0) + return 1; + RTE_LOG(DEBUG, POWER, "Error sending message: %s\n", + ret > 0 ? 
strerror(ret) : "channel not connected"); + return -1; +} + +int +power_kvm_vm_freq_up(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_SCALE_UP); +} + +int +power_kvm_vm_freq_down(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_SCALE_DOWN); +} + +int +power_kvm_vm_freq_max(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_SCALE_MAX); +} + +int +power_kvm_vm_freq_min(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_SCALE_MIN); +} + +int +power_kvm_vm_turbo_status(__attribute__((unused)) unsigned int lcore_id) +{ + RTE_LOG(ERR, POWER, "rte_power_turbo_status is not implemented for Virtual Machine Power Management\n"); + return -ENOTSUP; +} + +int +power_kvm_vm_enable_turbo(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_ENABLE_TURBO); +} + +int +power_kvm_vm_disable_turbo(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_DISABLE_TURBO); +} + +struct rte_power_core_capabilities; +int power_kvm_vm_get_capabilities(__rte_unused unsigned int lcore_id, + __rte_unused struct rte_power_core_capabilities *caps) +{ + RTE_LOG(ERR, POWER, "rte_power_get_capabilities is not implemented for Virtual Machine Power Management\n"); + return -ENOTSUP; +} diff --git a/src/spdk/dpdk/lib/librte_power/power_kvm_vm.h b/src/spdk/dpdk/lib/librte_power/power_kvm_vm.h new file mode 100644 index 00000000..94d4aa12 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_kvm_vm.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _POWER_KVM_VM_H +#define _POWER_KVM_VM_H + +/** + * @file + * RTE Power Management KVM VM + */ + +#include +#include +#include +#include +#include "rte_power.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize power management for a specific lcore. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_kvm_vm_init(unsigned int lcore_id); + +/** + * Exit power management on a specific lcore. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_kvm_vm_exit(unsigned int lcore_id); + +/** + * Get the available frequencies of a specific lcore. + * It is not currently supported for VM Power Management. + * + * @param lcore_id + * lcore id. + * @param freqs + * The buffer array to save the frequencies. + * @param num + * The number of frequencies to get. + * + * @return + * -ENOTSUP + */ +uint32_t power_kvm_vm_freqs(unsigned int lcore_id, uint32_t *freqs, + uint32_t num); + +/** + * Return the current index of available frequencies of a specific lcore. + * It is not currently supported for VM Power Management. + * + * @param lcore_id + * lcore id. + * + * @return + * -ENOTSUP + */ +uint32_t power_kvm_vm_get_freq(unsigned int lcore_id); + +/** + * Set the new frequency for a specific lcore by indicating the index of + * available frequencies. + * It is not currently supported for VM Power Management. + * + * @param lcore_id + * lcore id. + * @param index + * The index of available frequencies. + * + * @return + * -ENOTSUP + */ +int power_kvm_vm_set_freq(unsigned int lcore_id, uint32_t index); + +/** + * Scale up the frequency of a specific lcore. This request is forwarded to the + * host monitor. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. 
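+ *
+ * Note: this only forwards a CPU_POWER_SCALE_UP request over the
+ * virtio-serial guest channel; the frequency change itself is carried out
+ * by the VM power manager on the host, so a return value of 1 means the
+ * request was sent successfully, not that the host has applied it.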
+ */ +int power_kvm_vm_freq_up(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_freq_down(unsigned int lcore_id); + +/** + * Scale up the frequency of a specific lcore to the highest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_freq_max(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore to the lowest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_freq_min(unsigned int lcore_id); + +/** + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * -ENOTSUP + */ +int power_kvm_vm_turbo_status(unsigned int lcore_id); + +/** + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_enable_turbo(unsigned int lcore_id); + +/** + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_disable_turbo(unsigned int lcore_id); + +/** + * Returns power capabilities for a specific lcore. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. 
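+ *
+ * Note: capability reporting is not implemented for the KVM VM
+ * environment; the current implementation always returns -ENOTSUP.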
+ */ +int power_kvm_vm_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/spdk/dpdk/lib/librte_power/rte_power.c b/src/spdk/dpdk/lib/librte_power/rte_power.c new file mode 100644 index 00000000..208b7919 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/rte_power.c @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include + +#include "rte_power.h" +#include "power_acpi_cpufreq.h" +#include "power_kvm_vm.h" +#include "power_common.h" + +enum power_management_env global_default_env = PM_ENV_NOT_SET; + +volatile uint32_t global_env_cfg_status = 0; + +/* function pointers */ +rte_power_freqs_t rte_power_freqs = NULL; +rte_power_get_freq_t rte_power_get_freq = NULL; +rte_power_set_freq_t rte_power_set_freq = NULL; +rte_power_freq_change_t rte_power_freq_up = NULL; +rte_power_freq_change_t rte_power_freq_down = NULL; +rte_power_freq_change_t rte_power_freq_max = NULL; +rte_power_freq_change_t rte_power_freq_min = NULL; +rte_power_freq_change_t rte_power_turbo_status; +rte_power_freq_change_t rte_power_freq_enable_turbo; +rte_power_freq_change_t rte_power_freq_disable_turbo; +rte_power_get_capabilities_t rte_power_get_capabilities; + +int +rte_power_set_env(enum power_management_env env) +{ + if (rte_atomic32_cmpset(&global_env_cfg_status, 0, 1) == 0) { + return 0; + } + if (env == PM_ENV_ACPI_CPUFREQ) { + rte_power_freqs = power_acpi_cpufreq_freqs; + rte_power_get_freq = power_acpi_cpufreq_get_freq; + rte_power_set_freq = power_acpi_cpufreq_set_freq; + rte_power_freq_up = power_acpi_cpufreq_freq_up; + rte_power_freq_down = power_acpi_cpufreq_freq_down; + rte_power_freq_min = power_acpi_cpufreq_freq_min; + rte_power_freq_max = power_acpi_cpufreq_freq_max; + rte_power_turbo_status = power_acpi_turbo_status; + rte_power_freq_enable_turbo = power_acpi_enable_turbo; + rte_power_freq_disable_turbo = power_acpi_disable_turbo; + rte_power_get_capabilities = power_acpi_get_capabilities; + } else if (env == PM_ENV_KVM_VM) { + rte_power_freqs = power_kvm_vm_freqs; + rte_power_get_freq = power_kvm_vm_get_freq; + rte_power_set_freq = power_kvm_vm_set_freq; + rte_power_freq_up = power_kvm_vm_freq_up; + rte_power_freq_down = power_kvm_vm_freq_down; + rte_power_freq_min = power_kvm_vm_freq_min; + rte_power_freq_max = power_kvm_vm_freq_max; + rte_power_turbo_status = power_kvm_vm_turbo_status; + rte_power_freq_enable_turbo = power_kvm_vm_enable_turbo; + rte_power_freq_disable_turbo = power_kvm_vm_disable_turbo; + rte_power_get_capabilities = power_kvm_vm_get_capabilities; + } else { + RTE_LOG(ERR, POWER, "Invalid Power Management Environment(%d) set\n", + env); + rte_power_unset_env(); + return -1; + } + global_default_env = env; + return 0; + +} + +void +rte_power_unset_env(void) +{ + if (rte_atomic32_cmpset(&global_env_cfg_status, 1, 0) != 0) + global_default_env = PM_ENV_NOT_SET; +} + +enum power_management_env +rte_power_get_env(void) { + return global_default_env; +} + +int +rte_power_init(unsigned int lcore_id) +{ + int ret = -1; + + if (global_default_env == PM_ENV_ACPI_CPUFREQ) { + return power_acpi_cpufreq_init(lcore_id); + } + if (global_default_env == PM_ENV_KVM_VM) { + return power_kvm_vm_init(lcore_id); + } + /* Auto detect Environment */ + RTE_LOG(INFO, POWER, "Attempting to initialise ACPI cpufreq power " + "management...\n"); + ret = power_acpi_cpufreq_init(lcore_id); + if (ret == 0) { + rte_power_set_env(PM_ENV_ACPI_CPUFREQ); + goto out; + } 
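+	/* ACPI cpufreq could not be initialised (for example, the userspace
+	 * scaling governor is unavailable), so fall back to the KVM VM
+	 * virtio-serial channel below.
+	 */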
+ + RTE_LOG(INFO, POWER, "Attempting to initialise VM power management...\n"); + ret = power_kvm_vm_init(lcore_id); + if (ret == 0) { + rte_power_set_env(PM_ENV_KVM_VM); + goto out; + } + RTE_LOG(ERR, POWER, "Unable to set Power Management Environment for lcore " + "%u\n", lcore_id); +out: + return ret; +} + +int +rte_power_exit(unsigned int lcore_id) +{ + if (global_default_env == PM_ENV_ACPI_CPUFREQ) + return power_acpi_cpufreq_exit(lcore_id); + if (global_default_env == PM_ENV_KVM_VM) + return power_kvm_vm_exit(lcore_id); + + RTE_LOG(ERR, POWER, "Environment has not been set, unable to exit " + "gracefully\n"); + return -1; + +} diff --git a/src/spdk/dpdk/lib/librte_power/rte_power.h b/src/spdk/dpdk/lib/librte_power/rte_power.h new file mode 100644 index 00000000..d70bc0b3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/rte_power.h @@ -0,0 +1,287 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_POWER_H +#define _RTE_POWER_H + +/** + * @file + * RTE Power Management + */ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Power Management Environment State */ +enum power_management_env {PM_ENV_NOT_SET, PM_ENV_ACPI_CPUFREQ, PM_ENV_KVM_VM}; + +/** + * Set the default power management implementation. If this is not called prior + * to rte_power_init(), then auto-detect of the environment will take place. + * It is not thread safe. + * + * @param env + * env. The environment in which to initialise Power Management for. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_set_env(enum power_management_env env); + +/** + * Unset the global environment configuration. + * This can only be called after all threads have completed. + */ +void rte_power_unset_env(void); + +/** + * Get the default power management implementation. + * + * @return + * power_management_env The configured environment. + */ +enum power_management_env rte_power_get_env(void); + +/** + * Initialize power management for a specific lcore. If rte_power_set_env() has + * not been called then an auto-detect of the environment will start and + * initialise the corresponding resources. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_init(unsigned int lcore_id); + +/** + * Exit power management on a specific lcore. This will call the environment + * dependent exit function. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_exit(unsigned int lcore_id); + +/** + * Get the available frequencies of a specific lcore. + * Function pointer definition. Review each environments + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * @param freqs + * The buffer array to save the frequencies. + * @param num + * The number of frequencies to get. + * + * @return + * The number of available frequencies. + */ +typedef uint32_t (*rte_power_freqs_t)(unsigned int lcore_id, uint32_t *freqs, + uint32_t num); + +extern rte_power_freqs_t rte_power_freqs; + +/** + * Return the current index of available frequencies of a specific lcore. + * Function pointer definition. Review each environments + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * The current index of available frequencies. 
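+ *
+ * Illustrative sketch (assumes rte_power_init() succeeded for this lcore
+ * and selected the ACPI cpufreq environment, where frequencies are
+ * reported in kHz):
+ *
+ * @code
+ * uint32_t freqs[RTE_MAX_LCORE_FREQS];
+ * uint32_t num = rte_power_freqs(rte_lcore_id(), freqs, RTE_MAX_LCORE_FREQS);
+ * uint32_t idx = rte_power_get_freq(rte_lcore_id());
+ *
+ * if (num > 0 && idx < num)
+ *	printf("lcore %u is running at %u kHz\n", rte_lcore_id(), freqs[idx]);
+ * @endcode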
+ */ +typedef uint32_t (*rte_power_get_freq_t)(unsigned int lcore_id); + +extern rte_power_get_freq_t rte_power_get_freq; + +/** + * Set the new frequency for a specific lcore by indicating the index of + * available frequencies. + * Function pointer definition. Review each environments + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * @param index + * The index of available frequencies. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +typedef int (*rte_power_set_freq_t)(unsigned int lcore_id, uint32_t index); + +extern rte_power_set_freq_t rte_power_set_freq; + +/** + * Function pointer definition for generic frequency change functions. Review + * each environments specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +typedef int (*rte_power_freq_change_t)(unsigned int lcore_id); + +/** + * Scale up the frequency of a specific lcore according to the available + * frequencies. + * Review each environments specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_up; + +/** + * Scale down the frequency of a specific lcore according to the available + * frequencies. + * Review each environments specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ + +extern rte_power_freq_change_t rte_power_freq_down; + +/** + * Scale up the frequency of a specific lcore to the highest according to the + * available frequencies. + * Review each environments specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_max; + +/** + * Scale down the frequency of a specific lcore to the lowest according to the + * available frequencies. + * Review each environments specific documentation for usage.. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_min; + +/** + * Query the Turbo Boost status of a specific lcore. + * Review each environments specific documentation for usage.. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 Turbo Boost is enabled for this lcore. + * - 0 Turbo Boost is disabled for this lcore. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_turbo_status; + +/** + * Enable Turbo Boost for this lcore. + * Review each environments specific documentation for usage.. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_enable_turbo; + +/** + * Disable Turbo Boost for this lcore. + * Review each environments specific documentation for usage.. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. 
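+ *
+ * Illustrative sketch (assumes rte_power_init() succeeded for this lcore):
+ *
+ * @code
+ * if (rte_power_turbo_status(rte_lcore_id()) == 1)
+ *	rte_power_freq_disable_turbo(rte_lcore_id());
+ * @endcode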
+ */ +extern rte_power_freq_change_t rte_power_freq_disable_turbo; + +/** + * Power capabilities summary. + */ +struct rte_power_core_capabilities { + RTE_STD_C11 + union { + uint64_t capabilities; + RTE_STD_C11 + struct { + uint64_t turbo:1; /**< Turbo can be enabled. */ + }; + }; +}; + +/** + * Returns power capabilities for a specific lcore. + * Function pointer definition. Review each environments + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. + */ +typedef int (*rte_power_get_capabilities_t)(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + +extern rte_power_get_capabilities_t rte_power_get_capabilities; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_power/rte_power_version.map b/src/spdk/dpdk/lib/librte_power/rte_power_version.map new file mode 100644 index 00000000..dd587dfb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/rte_power_version.map @@ -0,0 +1,35 @@ +DPDK_2.0 { + global: + + rte_power_exit; + rte_power_freq_down; + rte_power_freq_max; + rte_power_freq_min; + rte_power_freq_up; + rte_power_freqs; + rte_power_get_env; + rte_power_get_freq; + rte_power_init; + rte_power_set_env; + rte_power_set_freq; + rte_power_unset_env; + + local: *; +}; + +DPDK_17.11 { + global: + + rte_power_guest_channel_send_msg; + rte_power_freq_disable_turbo; + rte_power_freq_enable_turbo; + rte_power_turbo_status; + +} DPDK_2.0; + +DPDK_18.08 { + global: + + rte_power_get_capabilities; + +} DPDK_17.11; diff --git a/src/spdk/dpdk/lib/librte_rawdev/Makefile b/src/spdk/dpdk/lib/librte_rawdev/Makefile new file mode 100644 index 00000000..addb288d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_rawdev/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2017 NXP + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_rawdev.a + +# library version +LIBABIVER := 1 + +# build flags +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +LDLIBS += -lrte_eal + +# library source files +SRCS-y += rte_rawdev.c + +# export include files +SYMLINK-y-include += rte_rawdev.h +SYMLINK-y-include += rte_rawdev_pmd.h + +# versioning export map +EXPORT_MAP := rte_rawdev_version.map + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_rawdev/meson.build b/src/spdk/dpdk/lib/librte_rawdev/meson.build new file mode 100644 index 00000000..a20fbdc0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_rawdev/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Intel Corporation + +sources = files('rte_rawdev.c') +headers = files('rte_rawdev.h', 'rte_rawdev_pmd.h') diff --git a/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.c b/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.c new file mode 100644 index 00000000..62b6b97e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.c @@ -0,0 +1,552 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_rawdev.h" +#include "rte_rawdev_pmd.h" + +/* dynamic log identifier */ +int librawdev_logtype; + +struct rte_rawdev rte_rawdevices[RTE_RAWDEV_MAX_DEVS]; + +struct rte_rawdev *rte_rawdevs = &rte_rawdevices[0]; + +static 
struct rte_rawdev_global rawdev_globals = { + .nb_devs = 0 +}; + +struct rte_rawdev_global *rte_rawdev_globals = &rawdev_globals; + +/* Raw device, northbound API implementation */ +uint8_t +rte_rawdev_count(void) +{ + return rte_rawdev_globals->nb_devs; +} + +uint16_t +rte_rawdev_get_dev_id(const char *name) +{ + uint16_t i; + + if (!name) + return -EINVAL; + + for (i = 0; i < rte_rawdev_globals->nb_devs; i++) + if ((strcmp(rte_rawdevices[i].name, name) + == 0) && + (rte_rawdevices[i].attached == + RTE_RAWDEV_ATTACHED)) + return i; + return -ENODEV; +} + +int +rte_rawdev_socket_id(uint16_t dev_id) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + return dev->socket_id; +} + +int +rte_rawdev_info_get(uint16_t dev_id, struct rte_rawdev_info *dev_info) +{ + struct rte_rawdev *rawdev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL); + + rawdev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*rawdev->dev_ops->dev_info_get, -ENOTSUP); + (*rawdev->dev_ops->dev_info_get)(rawdev, dev_info->dev_private); + + if (dev_info) { + + dev_info->driver_name = rawdev->driver_name; + dev_info->device = rawdev->device; + } + + return 0; +} + +int +rte_rawdev_configure(uint16_t dev_id, struct rte_rawdev_info *dev_conf) +{ + struct rte_rawdev *dev; + int diag; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL); + + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP); + + if (dev->started) { + RTE_RDEV_ERR( + "device %d must be stopped to allow configuration", dev_id); + return -EBUSY; + } + + /* Configure the device */ + diag = (*dev->dev_ops->dev_configure)(dev, dev_conf->dev_private); + if (diag != 0) + RTE_RDEV_ERR("dev%d dev_configure = %d", dev_id, diag); + else + dev->attached = 1; + + return diag; +} + +int +rte_rawdev_queue_conf_get(uint16_t dev_id, + uint16_t queue_id, + rte_rawdev_obj_t queue_conf) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_def_conf, -ENOTSUP); + (*dev->dev_ops->queue_def_conf)(dev, queue_id, queue_conf); + return 0; +} + +int +rte_rawdev_queue_setup(uint16_t dev_id, + uint16_t queue_id, + rte_rawdev_obj_t queue_conf) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_setup, -ENOTSUP); + return (*dev->dev_ops->queue_setup)(dev, queue_id, queue_conf); +} + +int +rte_rawdev_queue_release(uint16_t dev_id, uint16_t queue_id) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -ENOTSUP); + return (*dev->dev_ops->queue_release)(dev, queue_id); +} + +uint16_t +rte_rawdev_queue_count(uint16_t dev_id) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_count, -ENOTSUP); + return (*dev->dev_ops->queue_count)(dev); +} + +int +rte_rawdev_get_attr(uint16_t dev_id, + const char *attr_name, + uint64_t *attr_value) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->attr_get, -ENOTSUP); + return 
(*dev->dev_ops->attr_get)(dev, attr_name, attr_value); +} + +int +rte_rawdev_set_attr(uint16_t dev_id, + const char *attr_name, + const uint64_t attr_value) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->attr_set, -ENOTSUP); + return (*dev->dev_ops->attr_set)(dev, attr_name, attr_value); +} + +int +rte_rawdev_enqueue_buffers(uint16_t dev_id, + struct rte_rawdev_buf **buffers, + unsigned int count, + rte_rawdev_obj_t context) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->enqueue_bufs, -ENOTSUP); + return (*dev->dev_ops->enqueue_bufs)(dev, buffers, count, context); +} + +int +rte_rawdev_dequeue_buffers(uint16_t dev_id, + struct rte_rawdev_buf **buffers, + unsigned int count, + rte_rawdev_obj_t context) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dequeue_bufs, -ENOTSUP); + return (*dev->dev_ops->dequeue_bufs)(dev, buffers, count, context); +} + +int +rte_rawdev_dump(uint16_t dev_id, FILE *f) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dump, -ENOTSUP); + return (*dev->dev_ops->dump)(dev, f); +} + +static int +xstats_get_count(uint16_t dev_id) +{ + struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP); + return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0); +} + +int +rte_rawdev_xstats_names_get(uint16_t dev_id, + struct rte_rawdev_xstats_name *xstats_names, + unsigned int size) +{ + const struct rte_rawdev *dev; + int cnt_expected_entries; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -ENODEV); + + cnt_expected_entries = xstats_get_count(dev_id); + + if (xstats_names == NULL || cnt_expected_entries < 0 || + (int)size < cnt_expected_entries || size <= 0) + return cnt_expected_entries; + + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP); + return (*dev->dev_ops->xstats_get_names)(dev, xstats_names, size); +} + +/* retrieve rawdev extended statistics */ +int +rte_rawdev_xstats_get(uint16_t dev_id, + const unsigned int ids[], + uint64_t values[], + unsigned int n) +{ + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -ENODEV); + const struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get, -ENOTSUP); + return (*dev->dev_ops->xstats_get)(dev, ids, values, n); +} + +uint64_t +rte_rawdev_xstats_by_name_get(uint16_t dev_id, + const char *name, + unsigned int *id) +{ + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, 0); + const struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + unsigned int temp = -1; + + if (id != NULL) + *id = (unsigned int)-1; + else + id = &temp; /* driver never gets a NULL value */ + + /* implemented by driver */ + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_by_name, -ENOTSUP); + return (*dev->dev_ops->xstats_get_by_name)(dev, name, id); +} + +int +rte_rawdev_xstats_reset(uint16_t dev_id, + const uint32_t ids[], uint32_t nb_ids) +{ + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_reset, -ENOTSUP); + return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids); +} + +int 
+rte_rawdev_firmware_status_get(uint16_t dev_id, rte_rawdev_obj_t status_info) +{ + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->firmware_status_get, -ENOTSUP); + return (*dev->dev_ops->firmware_status_get)(dev, status_info); +} + +int +rte_rawdev_firmware_version_get(uint16_t dev_id, rte_rawdev_obj_t version_info) +{ + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->firmware_version_get, -ENOTSUP); + return (*dev->dev_ops->firmware_version_get)(dev, version_info); +} + +int +rte_rawdev_firmware_load(uint16_t dev_id, rte_rawdev_obj_t firmware_image) +{ + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + + if (!firmware_image) + return -EINVAL; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->firmware_load, -ENOTSUP); + return (*dev->dev_ops->firmware_load)(dev, firmware_image); +} + +int +rte_rawdev_firmware_unload(uint16_t dev_id) +{ + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->firmware_load, -ENOTSUP); + return (*dev->dev_ops->firmware_unload)(dev); +} + +int +rte_rawdev_selftest(uint16_t dev_id) +{ + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + struct rte_rawdev *dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -ENOTSUP); + return (*dev->dev_ops->dev_selftest)(); +} + +int +rte_rawdev_start(uint16_t dev_id) +{ + struct rte_rawdev *dev; + int diag; + + RTE_RDEV_DEBUG("Start dev_id=%" PRIu8, dev_id); + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_start, -ENOTSUP); + + if (dev->started != 0) { + RTE_RDEV_ERR("Device with dev_id=%" PRIu8 "already started", + dev_id); + return 0; + } + + diag = (*dev->dev_ops->dev_start)(dev); + if (diag == 0) + dev->started = 1; + else + return diag; + + return 0; +} + +void +rte_rawdev_stop(uint16_t dev_id) +{ + struct rte_rawdev *dev; + + RTE_RDEV_DEBUG("Stop dev_id=%" PRIu8, dev_id); + + RTE_RAWDEV_VALID_DEVID_OR_RET(dev_id); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_stop); + + if (dev->started == 0) { + RTE_RDEV_ERR("Device with dev_id=%" PRIu8 "already stopped", + dev_id); + return; + } + + (*dev->dev_ops->dev_stop)(dev); + dev->started = 0; +} + +int +rte_rawdev_close(uint16_t dev_id) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP); + /* Device must be stopped before it can be closed */ + if (dev->started == 1) { + RTE_RDEV_ERR("Device %u must be stopped before closing", + dev_id); + return -EBUSY; + } + + return (*dev->dev_ops->dev_close)(dev); +} + +int +rte_rawdev_reset(uint16_t dev_id) +{ + struct rte_rawdev *dev; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP); + /* Reset is not dependent on state of the device */ + return (*dev->dev_ops->dev_reset)(dev); +} + +static inline uint8_t +rte_rawdev_find_free_device_index(void) +{ + uint16_t dev_id; + + for (dev_id = 0; dev_id < RTE_RAWDEV_MAX_DEVS; dev_id++) { + if (rte_rawdevs[dev_id].attached == + RTE_RAWDEV_DETACHED) + return dev_id; + } + + return RTE_RAWDEV_MAX_DEVS; +} + 
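/*
 * Editor's note: a minimal usage sketch, not part of the imported DPDK
 * sources, showing how an application might drive the northbound rawdev
 * API implemented in this file. The device name "my_rawdev" and the opaque
 * driver configuration object are hypothetical; buffer contents and the
 * enqueue/dequeue context argument are defined by each driver.
 */
#include <errno.h>
#include <rte_rawdev.h>

static int
example_rawdev_roundtrip(void *drv_conf, struct rte_rawdev_buf **bufs,
			 unsigned int nb_bufs)
{
	/* driver-specific configuration travels in the opaque dev_private */
	struct rte_rawdev_info info = { .dev_private = drv_conf };
	uint16_t dev_id;
	int ret;

	/* lookup failures are returned as a negative errno cast to uint16_t,
	 * so anything outside the valid device range means "not found".
	 */
	dev_id = rte_rawdev_get_dev_id("my_rawdev");
	if (dev_id >= RTE_RAWDEV_MAX_DEVS)
		return -ENODEV;

	ret = rte_rawdev_configure(dev_id, &info);	/* must precede start */
	if (ret < 0)
		return ret;
	ret = rte_rawdev_start(dev_id);
	if (ret < 0)
		return ret;

	/* context is NULL here; a driver may expect a queue handle instead */
	(void)rte_rawdev_enqueue_buffers(dev_id, bufs, nb_bufs, NULL);
	(void)rte_rawdev_dequeue_buffers(dev_id, bufs, nb_bufs, NULL);

	rte_rawdev_stop(dev_id);
	return rte_rawdev_close(dev_id);
}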
+struct rte_rawdev * +rte_rawdev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id) +{ + struct rte_rawdev *rawdev; + uint16_t dev_id; + + if (rte_rawdev_pmd_get_named_dev(name) != NULL) { + RTE_RDEV_ERR("Event device with name %s already allocated!", + name); + return NULL; + } + + dev_id = rte_rawdev_find_free_device_index(); + if (dev_id == RTE_RAWDEV_MAX_DEVS) { + RTE_RDEV_ERR("Reached maximum number of raw devices"); + return NULL; + } + + rawdev = &rte_rawdevs[dev_id]; + + rawdev->dev_private = rte_zmalloc_socket("rawdev private", + dev_priv_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (!rawdev->dev_private) { + RTE_RDEV_ERR("Unable to allocate memory to Skeleton dev"); + return NULL; + } + + + rawdev->dev_id = dev_id; + rawdev->socket_id = socket_id; + rawdev->started = 0; + snprintf(rawdev->name, RTE_RAWDEV_NAME_MAX_LEN, "%s", name); + + rawdev->attached = RTE_RAWDEV_ATTACHED; + rawdev_globals.nb_devs++; + + return rawdev; +} + +int +rte_rawdev_pmd_release(struct rte_rawdev *rawdev) +{ + int ret; + + if (rawdev == NULL) + return -EINVAL; + + ret = rte_rawdev_close(rawdev->dev_id); + if (ret < 0) + return ret; + + rawdev->attached = RTE_RAWDEV_DETACHED; + rawdev_globals.nb_devs--; + + rawdev->dev_id = 0; + rawdev->socket_id = 0; + rawdev->dev_ops = NULL; + if (rawdev->dev_private) { + rte_free(rawdev->dev_private); + rawdev->dev_private = NULL; + } + + return 0; +} + +RTE_INIT(librawdev_init_log) +{ + librawdev_logtype = rte_log_register("lib.rawdev"); + if (librawdev_logtype >= 0) + rte_log_set_level(librawdev_logtype, RTE_LOG_INFO); +} diff --git a/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.h b/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.h new file mode 100644 index 00000000..684bfdb8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev.h @@ -0,0 +1,610 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP + */ + +#ifndef _RTE_RAWDEV_H_ +#define _RTE_RAWDEV_H_ + +/** + * @file rte_rawdev.h + * + * Generic device abstraction APIs. + * + * This API allow applications to configure and use generic devices having + * no specific type already available in DPDK. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/* Rawdevice object - essentially a void to be typecasted by implementation */ +typedef void *rte_rawdev_obj_t; + +/** + * Get the total number of raw devices that have been successfully + * initialised. + * + * @return + * The total number of usable raw devices. + */ +uint8_t +rte_rawdev_count(void); + +/** + * Get the device identifier for the named raw device. + * + * @param name + * Raw device name to select the raw device identifier. + * + * @return + * Returns raw device identifier on success. + * - <0: Failure to find named raw device. + */ +uint16_t +rte_rawdev_get_dev_id(const char *name); + +/** + * Return the NUMA socket to which a device is connected. + * + * @param dev_id + * The identifier of the device. + * @return + * The NUMA socket id to which the device is connected or + * a default of zero if the socket could not be determined. + * -(-EINVAL) dev_id value is out of range. + */ +int +rte_rawdev_socket_id(uint16_t dev_id); + +/** + * Raw device information forward declaration + */ +struct rte_rawdev_info; + +/** + * Retrieve the contextual information of a raw device. + * + * @param dev_id + * The identifier of the device. 
+ * + * @param[out] dev_info + * A pointer to a structure of type *rte_rawdev_info* to be filled with the + * contextual information of the device. + * + * @return + * - 0: Success, driver updates the contextual information of the raw device + * - <0: Error code returned by the driver info get function. + * + */ +int +rte_rawdev_info_get(uint16_t dev_id, struct rte_rawdev_info *dev_info); + +/** + * Configure a raw device. + * + * This function must be invoked first before any other function in the + * API. This function can also be re-invoked when a device is in the + * stopped state. + * + * The caller may use rte_rawdev_info_get() to get the capability of each + * resources available for this raw device. + * + * @param dev_id + * The identifier of the device to configure. + * @param dev_conf + * The raw device configuration structure encapsulated into rte_rawdev_info + * object. + * It is assumed that the opaque object has enough information which the + * driver/implementation can use to configure the device. It is also assumed + * that once the configuration is done, a `queue_id` type field can be used + * to refer to some arbitrary internal representation of a queue. + * + * @return + * - 0: Success, device configured. + * - <0: Error code returned by the driver configuration function. + */ +int +rte_rawdev_configure(uint16_t dev_id, struct rte_rawdev_info *dev_conf); + + +/** + * Retrieve the current configuration information of a raw queue designated + * by its *queue_id* from the raw driver for a raw device. + * + * This function intended to be used in conjunction with rte_raw_queue_setup() + * where caller needs to set up the queue by overriding few default values. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the raw queue to get the configuration information. + * The value must be in the range [0, nb_raw_queues - 1] + * previously supplied to rte_rawdev_configure(). + * @param[out] queue_conf + * The pointer to the default raw queue configuration data. + * @return + * - 0: Success, driver updates the default raw queue configuration data. + * - <0: Error code returned by the driver info get function. + * + * @see rte_raw_queue_setup() + * + */ +int +rte_rawdev_queue_conf_get(uint16_t dev_id, + uint16_t queue_id, + rte_rawdev_obj_t queue_conf); + +/** + * Allocate and set up a raw queue for a raw device. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the raw queue to setup. The value must be in the range + * [0, nb_raw_queues - 1] previously supplied to rte_rawdev_configure(). + * @param queue_conf + * The pointer to the configuration data to be used for the raw queue. + * NULL value is allowed, in which case default configuration used. + * + * @see rte_rawdev_queue_conf_get() + * + * @return + * - 0: Success, raw queue correctly set up. + * - <0: raw queue configuration failed + */ +int +rte_rawdev_queue_setup(uint16_t dev_id, + uint16_t queue_id, + rte_rawdev_obj_t queue_conf); + +/** + * Release and deallocate a raw queue from a raw device. + * + * @param dev_id + * The identifier of the device. + * @param queue_id + * The index of the raw queue to release. The value must be in the range + * [0, nb_raw_queues - 1] previously supplied to rte_rawdev_configure(). + * + * @see rte_rawdev_queue_conf_get() + * + * @return + * - 0: Success, raw queue released. 
+ * - <0: raw queue configuration failed + */ +int +rte_rawdev_queue_release(uint16_t dev_id, uint16_t queue_id); + +/** + * Get the number of raw queues on a specific raw device + * + * @param dev_id + * Raw device identifier. + * @return + * - The number of configured raw queues + */ +uint16_t +rte_rawdev_queue_count(uint16_t dev_id); + +/** + * Start a raw device. + * + * The device start step is the last one and consists of setting the raw + * queues to start accepting the raws and schedules to raw ports. + * + * On success, all basic functions exported by the API (raw enqueue, + * raw dequeue and so on) can be invoked. + * + * @param dev_id + * Raw device identifier + * @return + * - 0: Success, device started. + * < 0: Failure + */ +int +rte_rawdev_start(uint16_t dev_id); + +/** + * Stop a raw device. The device can be restarted with a call to + * rte_rawdev_start() + * + * @param dev_id + * Raw device identifier. + */ +void +rte_rawdev_stop(uint16_t dev_id); + +/** + * Close a raw device. The device cannot be restarted after this call. + * + * @param dev_id + * Raw device identifier + * + * @return + * - 0 on successfully closing device + * - <0 on failure to close device + * - (-EAGAIN) if device is busy + */ +int +rte_rawdev_close(uint16_t dev_id); + +/** + * Reset a raw device. + * This is different from cycle of rte_rawdev_start->rte_rawdev_stop in the + * sense similar to hard or soft reset. + * + * @param dev_id + * Raw device identifiers + * @return + * 0 for sucessful reset, + * !0 for failure in resetting + */ +int +rte_rawdev_reset(uint16_t dev_id); + +#define RTE_RAWDEV_NAME_MAX_LEN (64) +/**< @internal Max length of name of raw PMD */ + + + +/** @internal + * The data structure associated with each raw device. + * It is a placeholder for PMD specific data, encapsulating only information + * related to framework. + */ +struct rte_rawdev { + /**< Socket ID where memory is allocated */ + int socket_id; + /**< Device ID for this instance */ + uint16_t dev_id; + /**< Functions exported by PMD */ + const struct rte_rawdev_ops *dev_ops; + /**< Device info. supplied during device initialization */ + struct rte_device *device; + /**< Driver info. supplied by probing */ + const char *driver_name; + + RTE_STD_C11 + /**< Flag indicating the device is attached */ + uint8_t attached : 1; + /**< Device state: STARTED(1)/STOPPED(0) */ + uint8_t started : 1; + + /**< PMD-specific private data */ + rte_rawdev_obj_t dev_private; + /**< Device name */ + char name[RTE_RAWDEV_NAME_MAX_LEN]; +} __rte_cache_aligned; + +/** @internal The pool of rte_rawdev structures. */ +extern struct rte_rawdev *rte_rawdevs; + + +struct rte_rawdev_info { + /**< Name of driver handling this device */ + const char *driver_name; + /**< Device encapsulation */ + struct rte_device *device; + /**< Socket ID where memory is allocated */ + int socket_id; + /**< PMD-specific private data */ + rte_rawdev_obj_t dev_private; +}; + +struct rte_rawdev_buf { + /**< Opaque buffer reference */ + void *buf_addr; +}; + +/** + * Dump internal information about *dev_id* to the FILE* provided in *f*. + * + * @param dev_id + * The identifier of the device. + * + * @param f + * A pointer to a file for output + * + * @return + * - 0: on success + * - <0: on failure. + */ +int +rte_rawdev_dump(uint16_t dev_id, FILE *f); + +/** + * Get an attribute value from implementation. + * Attribute is an opaque handle agreed upon between application and PMD. 
+ * + * Implementations are expected to maintain an array of attribute-value pairs + * based on application calls. Memory management for this structure is + * shared responsibility of implementation and application. + * + * @param dev_id + * The identifier of the device to configure. + * @param attr_name + * Opaque object representing an attribute in implementation. + * @param attr_value [out] + * Opaque response to the attribute value. In case of error, this remains + * untouched. This is double pointer of void type. + * @return + * 0 for success + * !0 Error; attr_value remains untouched in case of error. + */ +int +rte_rawdev_get_attr(uint16_t dev_id, + const char *attr_name, + uint64_t *attr_value); + +/** + * Set an attribute value. + * Attribute is an opaque handle agreed upon between application and PMD. + * + * @param dev_id + * The identifier of the device to configure. + * @param attr_name + * Opaque object representing an attribute in implementation. + * @param attr_value + * Value of the attribute represented by attr_name + * @return + * 0 for success + * !0 Error + */ +int +rte_rawdev_set_attr(uint16_t dev_id, + const char *attr_name, + const uint64_t attr_value); + +/** + * Enqueue a stream of buffers to the device. + * + * Rather than specifying a queue, this API passes along an opaque object + * to the driver implementation. That object can be a queue or any other + * contextual information necessary for the device to enqueue buffers. + * + * @param dev_id + * The identifier of the device to configure. + * @param buffers + * Collection of buffers for enqueueing + * @param count + * Count of buffers to enqueue + * @param context + * Opaque context information. + * @return + * >=0 for buffers enqueued + * !0 for failure. + * Whether partial enqueue is failure or success is defined between app + * and driver implementation. + */ +int +rte_rawdev_enqueue_buffers(uint16_t dev_id, + struct rte_rawdev_buf **buffers, + unsigned int count, + rte_rawdev_obj_t context); + +/** + * Dequeue a stream of buffers from the device. + * + * Rather than specifying a queue, this API passes along an opaque object + * to the driver implementation. That object can be a queue or any other + * contextual information necessary for the device to dequeue buffers. + * + * Application should have allocated enough space to store `count` response + * buffers. + * Releasing buffers dequeued is responsibility of the application. + * + * @param dev_id + * The identifier of the device to configure. + * @param buffers + * Collection of buffers dequeued + * @param count + * Max buffers expected to be dequeued + * @param context + * Opaque context information. + * @return + * >=0 for buffers dequeued + * !0 for failure. + * Whether partial enqueue is failure or success is defined between app + * and driver implementation. + */ +int +rte_rawdev_dequeue_buffers(uint16_t dev_id, + struct rte_rawdev_buf **buffers, + unsigned int count, + rte_rawdev_obj_t context); + +/** Maximum name length for extended statistics counters */ +#define RTE_RAW_DEV_XSTATS_NAME_SIZE 64 + +/** + * A name-key lookup element for extended statistics. + * + * This structure is used to map between names and ID numbers + * for extended ethdev statistics. + */ +struct rte_rawdev_xstats_name { + char name[RTE_RAW_DEV_XSTATS_NAME_SIZE]; +}; + +/** + * Retrieve names of extended statistics of a raw device. + * + * @param dev_id + * The identifier of the raw device. + * @param[out] xstats_names + * Block of memory to insert names into. 
Must be at least size in capacity. + * If set to NULL, function returns required capacity. + * @param size + * Capacity of xstats_names (number of names). + * @return + * - positive value lower or equal to size: success. The return value + * is the number of entries filled in the stats table. + * - positive value higher than size: error, the given statistics table + * is too small. The return value corresponds to the size that should + * be given to succeed. The entries in the table are not valid and + * shall not be used by the caller. + * - negative value on error: + * -ENODEV for invalid *dev_id* + * -ENOTSUP if the device doesn't support this function. + */ +int +rte_rawdev_xstats_names_get(uint16_t dev_id, + struct rte_rawdev_xstats_name *xstats_names, + unsigned int size); + +/** + * Retrieve extended statistics of a raw device. + * + * @param dev_id + * The identifier of the device. + * @param ids + * The id numbers of the stats to get. The ids can be got from the stat + * position in the stat list from rte_rawdev_get_xstats_names(), or + * by using rte_rawdev_get_xstats_by_name() + * @param[out] values + * The values for each stats request by ID. + * @param n + * The number of stats requested + * @return + * - positive value: number of stat entries filled into the values array + * - negative value on error: + * -ENODEV for invalid *dev_id* + * -ENOTSUP if the device doesn't support this function. + */ +int +rte_rawdev_xstats_get(uint16_t dev_id, + const unsigned int ids[], + uint64_t values[], + unsigned int n); + +/** + * Retrieve the value of a single stat by requesting it by name. + * + * @param dev_id + * The identifier of the device + * @param name + * The stat name to retrieve + * @param[out] id + * If non-NULL, the numerical id of the stat will be returned, so that further + * requests for the stat can be got using rte_rawdev_xstats_get, which will + * be faster as it doesn't need to scan a list of names for the stat. + * If the stat cannot be found, the id returned will be (unsigned)-1. + * @return + * - positive value or zero: the stat value + * - negative value: -EINVAL if stat not found, -ENOTSUP if not supported. + */ +uint64_t +rte_rawdev_xstats_by_name_get(uint16_t dev_id, + const char *name, + unsigned int *id); + +/** + * Reset the values of the xstats of the selected component in the device. + * + * @param dev_id + * The identifier of the device + * @param ids + * Selects specific statistics to be reset. When NULL, all statistics + * will be reset. If non-NULL, must point to array of at least + * *nb_ids* size. + * @param nb_ids + * The number of ids available from the *ids* array. Ignored when ids is NULL. + * @return + * - zero: successfully reset the statistics to zero + * - negative value: -EINVAL invalid parameters, -ENOTSUP if not supported. + */ +int +rte_rawdev_xstats_reset(uint16_t dev_id, + const uint32_t ids[], + uint32_t nb_ids); + +/** + * Get Firmware status of the device.. + * Returns a memory allocated by driver/implementation containing status + * information block. It is responsibility of caller to release the buffer. + * + * @param dev_id + * Raw device identifier + * @param status_info + * Pointer to status information area. Caller is responsible for releasing + * the memory associated. + * @return + * 0 for success, + * !0 for failure, `status_info` argument state is undefined + */ +int +rte_rawdev_firmware_status_get(uint16_t dev_id, + rte_rawdev_obj_t status_info); + +/** + * Get Firmware version of the device. 
+ * Returns a memory allocated by driver/implementation containing version + * information block. It is responsibility of caller to release the buffer. + * + * @param dev_id + * Raw device identifier + * @param version_info + * Pointer to version information area. Caller is responsible for releasing + * the memory associated. + * @return + * 0 for success, + * !0 for failure, `version_info` argument state is undefined + */ +int +rte_rawdev_firmware_version_get(uint16_t dev_id, + rte_rawdev_obj_t version_info); + +/** + * Load firmware on the device. + * TODO: In future, methods like directly flashing from file too can be + * supported. + * + * @param dev_id + * Raw device identifier + * @param firmware_image + * Pointer to buffer containing image binary data + * @return + * 0 for successful load + * !0 for failure to load the provided image, or image incorrect. + */ +int +rte_rawdev_firmware_load(uint16_t dev_id, rte_rawdev_obj_t firmware_image); + +/** + * Unload firmware from the device. + * + * @param dev_id + * Raw device identifiers + * @return + * 0 for successful Unload + * !0 for failure in unloading + */ +int +rte_rawdev_firmware_unload(uint16_t dev_id); + +/** + * Trigger the rawdev self test. + * + * @param dev_id + * The identifier of the device + * @return + * - 0: Selftest successful + * - -ENOTSUP if the device doesn't support selftest + * - other values < 0 on failure. + */ +int +rte_rawdev_selftest(uint16_t dev_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RAWDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_pmd.h b/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_pmd.h new file mode 100644 index 00000000..bb9bbc35 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_pmd.h @@ -0,0 +1,627 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP + */ + +#ifndef _RTE_RAWDEV_PMD_H_ +#define _RTE_RAWDEV_PMD_H_ + +/** @file + * RTE RAW PMD APIs + * + * @note + * Driver facing APIs for a raw device. These are not to be called directly by + * any application. + * + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include +#include + +#include "rte_rawdev.h" + +extern int librawdev_logtype; + +/* Logging Macros */ +#define RTE_RDEV_LOG(level, fmt, args...) \ + rte_log(RTE_LOG_ ## level, librawdev_logtype, "%s(): " fmt "\n", \ + __func__, ##args) + +#define RTE_RDEV_ERR(fmt, args...) \ + RTE_RDEV_LOG(ERR, fmt, ## args) +#define RTE_RDEV_DEBUG(fmt, args...) \ + RTE_RDEV_LOG(DEBUG, fmt, ## args) +#define RTE_RDEV_INFO(fmt, args...) \ + RTE_RDEV_LOG(INFO, fmt, ## args) + + +/* Macros to check for valid device */ +#define RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \ + if (!rte_rawdev_pmd_is_valid_dev((dev_id))) { \ + RTE_RDEV_ERR("Invalid dev_id=%d", dev_id); \ + return retval; \ + } \ +} while (0) + +#define RTE_RAWDEV_VALID_DEVID_OR_RET(dev_id) do { \ + if (!rte_rawdev_pmd_is_valid_dev((dev_id))) { \ + RTE_RDEV_ERR("Invalid dev_id=%d", dev_id); \ + return; \ + } \ +} while (0) + +#define RTE_RAWDEV_DETACHED (0) +#define RTE_RAWDEV_ATTACHED (1) + +/* Global structure used for maintaining state of allocated raw devices. + * + * TODO: Can be expanded to : in future. + * Applications should be able to select from a number of type of raw + * devices which were detected or attached to this DPDK instance. 
+ */ +struct rte_rawdev_global { + /**< Number of devices found */ + uint16_t nb_devs; +}; + +extern struct rte_rawdev_global *rte_rawdev_globals; +/** Pointer to global raw devices data structure. */ +extern struct rte_rawdev *rte_rawdevs; +/** The pool of rte_rawdev structures. */ + +/** + * Get the rte_rawdev structure device pointer for the named device. + * + * @param name + * device name to select the device structure. + * + * @return + * - The rte_rawdev structure pointer for the given device ID. + */ +static inline struct rte_rawdev * +rte_rawdev_pmd_get_named_dev(const char *name) +{ + struct rte_rawdev *dev; + unsigned int i; + + if (name == NULL) + return NULL; + + for (i = 0; i < RTE_RAWDEV_MAX_DEVS; i++) { + dev = &rte_rawdevs[i]; + if ((dev->attached == RTE_RAWDEV_ATTACHED) && + (strcmp(dev->name, name) == 0)) + return dev; + } + + return NULL; +} + +/** + * Validate if the raw device index is a valid attached raw device. + * + * @param dev_id + * raw device index. + * + * @return + * - If the device index is valid (1) or not (0). + */ +static inline unsigned +rte_rawdev_pmd_is_valid_dev(uint8_t dev_id) +{ + struct rte_rawdev *dev; + + if (dev_id >= RTE_RAWDEV_MAX_DEVS) + return 0; + + dev = &rte_rawdevs[dev_id]; + if (dev->attached != RTE_RAWDEV_ATTACHED) + return 0; + else + return 1; +} + +/** + * Definitions of all functions exported by a driver through the + * the generic structure of type *rawdev_ops* supplied in the + * *rte_rawdev* structure associated with a device. + */ + +/** + * Get device information of a device. + * + * @param dev + * Raw device pointer + * @param dev_info + * Raw device information structure + * + * @return + * Returns 0 on success + */ +typedef void (*rawdev_info_get_t)(struct rte_rawdev *dev, + rte_rawdev_obj_t dev_info); + +/** + * Configure a device. + * + * @param dev + * Raw device pointer + * @param config + * Void object containing device specific configuration + * + * @return + * Returns 0 on success + */ +typedef int (*rawdev_configure_t)(const struct rte_rawdev *dev, + rte_rawdev_obj_t config); + +/** + * Start a configured device. + * + * @param dev + * Raw device pointer + * + * @return + * Returns 0 on success + */ +typedef int (*rawdev_start_t)(struct rte_rawdev *dev); + +/** + * Stop a configured device. + * + * @param dev + * Raw device pointer + */ +typedef void (*rawdev_stop_t)(struct rte_rawdev *dev); + +/** + * Close a configured device. + * + * @param dev + * Raw device pointer + * + * @return + * - 0 on success + * - (-EAGAIN) if can't close as device is busy + */ +typedef int (*rawdev_close_t)(struct rte_rawdev *dev); + +/** + * Reset a configured device. + * + * @param dev + * Raw device pointer + * @return + * 0 for success + * !0 for failure + */ +typedef int (*rawdev_reset_t)(struct rte_rawdev *dev); + +/** + * Retrieve the current raw queue configuration. + * + * @param dev + * Raw device pointer + * @param queue_id + * Raw device queue index + * @param[out] queue_conf + * Raw device queue configuration structure + * + */ +typedef void (*rawdev_queue_conf_get_t)(struct rte_rawdev *dev, + uint16_t queue_id, + rte_rawdev_obj_t queue_conf); + +/** + * Setup an raw queue. + * + * @param dev + * Raw device pointer + * @param queue_id + * Rawqueue index + * @param queue_conf + * Rawqueue configuration structure + * + * @return + * Returns 0 on success. 
+ */ +typedef int (*rawdev_queue_setup_t)(struct rte_rawdev *dev, + uint16_t queue_id, + rte_rawdev_obj_t queue_conf); + +/** + * Release resources allocated by given raw queue. + * + * @param dev + * Raw device pointer + * @param queue_id + * Raw queue index + * + */ +typedef int (*rawdev_queue_release_t)(struct rte_rawdev *dev, + uint16_t queue_id); + +/** + * Get the count of number of queues configured on this device. + * + * Another way to fetch this information is to fetch the device configuration. + * But, that assumes that the device configuration managed by the driver has + * that kind of information. + * + * This function helps in getting queue count supported, independently. It + * can help in cases where iterator needs to be implemented. + * + * @param + * Raw device pointer + * @return + * Number of queues; 0 is assumed to be a valid response. + * + */ +typedef uint16_t (*rawdev_queue_count_t)(struct rte_rawdev *dev); + +/** + * Enqueue an array of raw buffers to the device. + * + * Buffer being used is opaque - it can be obtained from mempool or from + * any other source. Interpretation of buffer is responsibility of driver. + * + * @param dev + * Raw device pointer + * @param bufs + * array of buffers + * @param count + * number of buffers passed + * @param context + * an opaque object representing context of the call; for example, an + * application can pass information about the queues on which enqueue needs + * to be done. Or, the enqueue operation might be passed reference to an + * object containing a callback (agreed upon between applicatio and driver). + * + * @return + * >=0 Count of buffers successfully enqueued (0: no buffers enqueued) + * <0 Error count in case of error + */ +typedef int (*rawdev_enqueue_bufs_t)(struct rte_rawdev *dev, + struct rte_rawdev_buf **buffers, + unsigned int count, + rte_rawdev_obj_t context); + +/** + * Dequeue an array of raw buffers from the device. + * + * @param dev + * Raw device pointer + * @param bufs + * array of buffers + * @param count + * Max buffers expected to be dequeued + * @param context + * an opaque object representing context of the call. Based on this object, + * the application and driver can coordinate for dequeue operation involving + * agreed upon semantics. For example, queue information/id on which Dequeue + * needs to be performed. + * @return + * >0, ~0: Count of buffers returned + * <0: Error + * Whether short dequeue is success or failure is decided between app and + * driver. + */ +typedef int (*rawdev_dequeue_bufs_t)(struct rte_rawdev *dev, + struct rte_rawdev_buf **buffers, + unsigned int count, + rte_rawdev_obj_t context); + +/** + * Dump internal information + * + * @param dev + * Raw device pointer + * @param f + * A pointer to a file for output + * @return + * 0 for success, + * !0 Error + * + */ +typedef int (*rawdev_dump_t)(struct rte_rawdev *dev, FILE *f); + +/** + * Get an attribute value from implementation. + * Attribute is an opaque handle agreed upon between application and PMD. + * + * @param dev + * Raw device pointer + * @param attr_name + * Opaque object representing an attribute in implementation. + * @param attr_value [out] + * Opaque response to the attribute value. In case of error, this remains + * untouched. This is double pointer of void type. + * @return + * 0 for success + * !0 Error; attr_value remains untouched in case of error. + */ +typedef int (*rawdev_get_attr_t)(struct rte_rawdev *dev, + const char *attr_name, + uint64_t *attr_value); + +/** + * Set an attribute value. 
+ * Attribute is an opaque handle agreed upon between application and PMD. + * + * @param dev + * Raw device pointer + * @param attr_name + * Opaque object representing an attribute in implementation. + * @param attr_value + * Value of the attribute represented by attr_name + * @return + * 0 for success + * !0 Error + */ +typedef int (*rawdev_set_attr_t)(struct rte_rawdev *dev, + const char *attr_name, + const uint64_t attr_value); + +/** + * Retrieve a set of statistics from device. + * Note: Being a raw device, the stats are specific to the device being + * implemented thus represented as xstats. + * + * @param dev + * Raw device pointer + * @param ids + * The stat ids to retrieve + * @param values + * The returned stat values + * @param n + * The number of id values and entries in the values array + * @return + * The number of stat values successfully filled into the values array + */ +typedef int (*rawdev_xstats_get_t)(const struct rte_rawdev *dev, + const unsigned int ids[], uint64_t values[], unsigned int n); + +/** + * Resets the statistic values in xstats for the device. + */ +typedef int (*rawdev_xstats_reset_t)(struct rte_rawdev *dev, + const uint32_t ids[], + uint32_t nb_ids); + +/** + * Get names of extended stats of an raw device + * + * @param dev + * Raw device pointer + * @param xstats_names + * Array of name values to be filled in + * @param size + * Number of values in the xstats_names array + * @return + * When size >= the number of stats, return the number of stat values filled + * into the array. + * When size < the number of available stats, return the number of stats + * values, and do not fill in any data into xstats_names. + */ +typedef int (*rawdev_xstats_get_names_t)(const struct rte_rawdev *dev, + struct rte_rawdev_xstats_name *xstats_names, + unsigned int size); + +/** + * Get value of one stats and optionally return its id + * + * @param dev + * Raw device pointer + * @param name + * The name of the stat to retrieve + * @param id + * Pointer to an unsigned int where we store the stat-id. + * This pointer may be null if the id is not required. + * @return + * The value of the stat, or (uint64_t)-1 if the stat is not found. + * If the stat is not found, the id value will be returned as (unsigned)-1, + * if id pointer is non-NULL + */ +typedef uint64_t (*rawdev_xstats_get_by_name_t)(const struct rte_rawdev *dev, + const char *name, + unsigned int *id); + +/** + * Get firmware/device-stack status. + * Implementation to allocate buffer for returning information. 
+ * + * @param dev + * Raw device pointer + * @param status + * void block containing device specific status information + * @return + * 0 for success, + * !0 for failure, with undefined value in `status_info` + */ +typedef int (*rawdev_firmware_status_get_t)(struct rte_rawdev *dev, + rte_rawdev_obj_t status_info); + +/** + * Get firmware version information + * + * @param dev + * Raw device pointer + * @param version_info + * void pointer to version information returned by device + * @return + * 0 for success, + * !0 for failure, with undefined value in `version_info` + */ +typedef int (*rawdev_firmware_version_get_t)(struct rte_rawdev *dev, + rte_rawdev_obj_t version_info); + +/** + * Load firmware from a buffer (DMA'able) + * + * @param dev + * Raw device pointer + * @param firmware_file + * file pointer to firmware area + * @return + * >0, ~0: for successful load + * <0: for failure + * + * @see Application may use `firmware_version_get` for ascertaining successful + * load + */ +typedef int (*rawdev_firmware_load_t)(struct rte_rawdev *dev, + rte_rawdev_obj_t firmware_buf); + +/** + * Unload firmware + * + * @param dev + * Raw device pointer + * @return + * >0, ~0 for successful unloading + * <0 for failure in unloading + * + * Note: Application can use the `firmware_status_get` or + * `firmware_version_get` to get result of unload. + */ +typedef int (*rawdev_firmware_unload_t)(struct rte_rawdev *dev); + +/** + * Start rawdev selftest + * + * @return + * Return 0 on success + */ +typedef int (*rawdev_selftest_t)(void); + +/** Rawdevice operations function pointer table */ +struct rte_rawdev_ops { + /**< Get device info. */ + rawdev_info_get_t dev_info_get; + /**< Configure device. */ + rawdev_configure_t dev_configure; + /**< Start device. */ + rawdev_start_t dev_start; + /**< Stop device. */ + rawdev_stop_t dev_stop; + /**< Close device. */ + rawdev_close_t dev_close; + /**< Reset device. */ + rawdev_reset_t dev_reset; + + /**< Get raw queue configuration. */ + rawdev_queue_conf_get_t queue_def_conf; + /**< Set up a raw queue. */ + rawdev_queue_setup_t queue_setup; + /**< Release a raw queue. */ + rawdev_queue_release_t queue_release; + /**< Get the number of queues attached to the device */ + rawdev_queue_count_t queue_count; + + /**< Enqueue an array of raw buffers to device. */ + rawdev_enqueue_bufs_t enqueue_bufs; + /**< Dequeue an array of raw buffers from device. */ + /** TODO: Callback based enqueue and dequeue support */ + rawdev_dequeue_bufs_t dequeue_bufs; + + /* Dump internal information */ + rawdev_dump_t dump; + + /**< Get an attribute managed by the implementation */ + rawdev_get_attr_t attr_get; + /**< Set an attribute managed by the implementation */ + rawdev_set_attr_t attr_set; + + /**< Get extended device statistics. */ + rawdev_xstats_get_t xstats_get; + /**< Get names of extended stats. */ + rawdev_xstats_get_names_t xstats_get_names; + /**< Get one value by name. */ + rawdev_xstats_get_by_name_t xstats_get_by_name; + /**< Reset the statistics values in xstats.
*/ + rawdev_xstats_reset_t xstats_reset; + + /**< Obtainer firmware status */ + rawdev_firmware_status_get_t firmware_status_get; + /**< Obtain firmware version information */ + rawdev_firmware_version_get_t firmware_version_get; + /**< Load firmware */ + rawdev_firmware_load_t firmware_load; + /**< Unload firmware */ + rawdev_firmware_unload_t firmware_unload; + + /**< Device selftest function */ + rawdev_selftest_t dev_selftest; +}; + +/** + * Allocates a new rawdev slot for an raw device and returns the pointer + * to that slot for the driver to use. + * + * @param name + * Unique identifier name for each device + * @param dev_private_size + * Private data allocated within rte_rawdev object. + * @param socket_id + * Socket to allocate resources on. + * @return + * - Slot in the rte_dev_devices array for a new device; + */ +struct rte_rawdev * +rte_rawdev_pmd_allocate(const char *name, size_t dev_private_size, + int socket_id); + +/** + * Release the specified rawdev device. + * + * @param rawdev + * The *rawdev* pointer is the address of the *rte_rawdev* structure. + * @return + * - 0 on success, negative on error + */ +int +rte_rawdev_pmd_release(struct rte_rawdev *rawdev); + +/** + * Creates a new raw device and returns the pointer to that device. + * + * @param name + * Pointer to a character array containing name of the device + * @param dev_private_size + * Size of raw PMDs private data + * @param socket_id + * Socket to allocate resources on. + * + * @return + * - Raw device pointer if device is successfully created. + * - NULL if device cannot be created. + */ +struct rte_rawdev * +rte_rawdev_pmd_init(const char *name, size_t dev_private_size, + int socket_id); + +/** + * Destroy a raw device + * + * @param name + * Name of the device + * @return + * - 0 on success, negative on error + */ +int +rte_rawdev_pmd_uninit(const char *name); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RAWDEV_PMD_H_ */ diff --git a/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_version.map b/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_version.map new file mode 100644 index 00000000..b61dbff1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_rawdev/rte_rawdev_version.map @@ -0,0 +1,35 @@ +DPDK_18.08 { + global: + + rte_rawdev_close; + rte_rawdev_configure; + rte_rawdev_count; + rte_rawdev_dequeue_buffers; + rte_rawdev_enqueue_buffers; + rte_rawdev_firmware_load; + rte_rawdev_firmware_status_get; + rte_rawdev_firmware_unload; + rte_rawdev_firmware_version_get; + rte_rawdev_get_attr; + rte_rawdev_get_dev_id; + rte_rawdev_info_get; + rte_rawdev_pmd_allocate; + rte_rawdev_pmd_release; + rte_rawdev_queue_conf_get; + rte_rawdev_queue_count; + rte_rawdev_queue_setup; + rte_rawdev_queue_release; + rte_rawdev_reset; + rte_rawdev_selftest; + rte_rawdev_set_attr; + rte_rawdev_socket_id; + rte_rawdev_start; + rte_rawdev_stop; + rte_rawdev_xstats_by_name_get; + rte_rawdev_xstats_get; + rte_rawdev_xstats_names_get; + rte_rawdev_xstats_reset; + rte_rawdevs; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_reorder/Makefile b/src/spdk/dpdk/lib/librte_reorder/Makefile new file mode 100644 index 00000000..f19c624b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_reorder/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_reorder.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf + +EXPORT_MAP := rte_reorder_version.map + +LIBABIVER := 1 + +# all 
source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_REORDER) := rte_reorder.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_REORDER)-include := rte_reorder.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_reorder/meson.build b/src/spdk/dpdk/lib/librte_reorder/meson.build new file mode 100644 index 00000000..03aed53d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_reorder/meson.build @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_reorder.c') +headers = files('rte_reorder.h') +deps += ['mbuf'] diff --git a/src/spdk/dpdk/lib/librte_reorder/rte_reorder.c b/src/spdk/dpdk/lib/librte_reorder/rte_reorder.c new file mode 100644 index 00000000..ecf539d2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_reorder/rte_reorder.c @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_reorder.h" + +TAILQ_HEAD(rte_reorder_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_reorder_tailq = { + .name = "RTE_REORDER", +}; +EAL_REGISTER_TAILQ(rte_reorder_tailq) + +#define NO_FLAGS 0 +#define RTE_REORDER_PREFIX "RO_" +#define RTE_REORDER_NAMESIZE 32 + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_REORDER RTE_LOGTYPE_USER1 + +/* A generic circular buffer */ +struct cir_buffer { + unsigned int size; /**< Number of entries that can be stored */ + unsigned int mask; /**< [buffer_size - 1]: used for wrap-around */ + unsigned int head; /**< insertion point in buffer */ + unsigned int tail; /**< extraction point in buffer */ + struct rte_mbuf **entries; +} __rte_cache_aligned; + +/* The reorder buffer data structure itself */ +struct rte_reorder_buffer { + char name[RTE_REORDER_NAMESIZE]; + uint32_t min_seqn; /**< Lowest seq. 
number that can be in the buffer */ + unsigned int memsize; /**< memory area size of reorder buffer */ + struct cir_buffer ready_buf; /**< temp buffer for dequeued entries */ + struct cir_buffer order_buf; /**< buffer used to reorder entries */ + int is_initialized; +} __rte_cache_aligned; + +static void +rte_reorder_free_mbufs(struct rte_reorder_buffer *b); + +struct rte_reorder_buffer * +rte_reorder_init(struct rte_reorder_buffer *b, unsigned int bufsize, + const char *name, unsigned int size) +{ + const unsigned int min_bufsize = sizeof(*b) + + (2 * size * sizeof(struct rte_mbuf *)); + + if (b == NULL) { + RTE_LOG(ERR, REORDER, "Invalid reorder buffer parameter:" + " NULL\n"); + rte_errno = EINVAL; + return NULL; + } + if (!rte_is_power_of_2(size)) { + RTE_LOG(ERR, REORDER, "Invalid reorder buffer size" + " - Not a power of 2\n"); + rte_errno = EINVAL; + return NULL; + } + if (name == NULL) { + RTE_LOG(ERR, REORDER, "Invalid reorder buffer name ptr:" + " NULL\n"); + rte_errno = EINVAL; + return NULL; + } + if (bufsize < min_bufsize) { + RTE_LOG(ERR, REORDER, "Invalid reorder buffer memory size: %u, " + "minimum required: %u\n", bufsize, min_bufsize); + rte_errno = EINVAL; + return NULL; + } + + memset(b, 0, bufsize); + snprintf(b->name, sizeof(b->name), "%s", name); + b->memsize = bufsize; + b->order_buf.size = b->ready_buf.size = size; + b->order_buf.mask = b->ready_buf.mask = size - 1; + b->ready_buf.entries = (void *)&b[1]; + b->order_buf.entries = RTE_PTR_ADD(&b[1], + size * sizeof(b->ready_buf.entries[0])); + + return b; +} + +struct rte_reorder_buffer* +rte_reorder_create(const char *name, unsigned socket_id, unsigned int size) +{ + struct rte_reorder_buffer *b = NULL; + struct rte_tailq_entry *te; + struct rte_reorder_list *reorder_list; + const unsigned int bufsize = sizeof(struct rte_reorder_buffer) + + (2 * size * sizeof(struct rte_mbuf *)); + + reorder_list = RTE_TAILQ_CAST(rte_reorder_tailq.head, rte_reorder_list); + + /* Check user arguments. */ + if (!rte_is_power_of_2(size)) { + RTE_LOG(ERR, REORDER, "Invalid reorder buffer size" + " - Not a power of 2\n"); + rte_errno = EINVAL; + return NULL; + } + if (name == NULL) { + RTE_LOG(ERR, REORDER, "Invalid reorder buffer name ptr:" + " NULL\n"); + rte_errno = EINVAL; + return NULL; + } + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* guarantee there's no existing */ + TAILQ_FOREACH(te, reorder_list, next) { + b = (struct rte_reorder_buffer *) te->data; + if (strncmp(name, b->name, RTE_REORDER_NAMESIZE) == 0) + break; + } + if (te != NULL) + goto exit; + + /* allocate tailq entry */ + te = rte_zmalloc("REORDER_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, REORDER, "Failed to allocate tailq entry\n"); + rte_errno = ENOMEM; + b = NULL; + goto exit; + } + + /* Allocate memory to store the reorder buffer structure. 
*/ + b = rte_zmalloc_socket("REORDER_BUFFER", bufsize, 0, socket_id); + if (b == NULL) { + RTE_LOG(ERR, REORDER, "Memzone allocation failed\n"); + rte_errno = ENOMEM; + rte_free(te); + } else { + rte_reorder_init(b, bufsize, name, size); + te->data = (void *)b; + TAILQ_INSERT_TAIL(reorder_list, te, next); + } + +exit: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return b; +} + +void +rte_reorder_reset(struct rte_reorder_buffer *b) +{ + char name[RTE_REORDER_NAMESIZE]; + + rte_reorder_free_mbufs(b); + snprintf(name, sizeof(name), "%s", b->name); + /* No error checking as current values should be valid */ + rte_reorder_init(b, b->memsize, name, b->order_buf.size); +} + +static void +rte_reorder_free_mbufs(struct rte_reorder_buffer *b) +{ + unsigned i; + + /* Free up the mbufs of order buffer & ready buffer */ + for (i = 0; i < b->order_buf.size; i++) { + if (b->order_buf.entries[i]) + rte_pktmbuf_free(b->order_buf.entries[i]); + if (b->ready_buf.entries[i]) + rte_pktmbuf_free(b->ready_buf.entries[i]); + } +} + +void +rte_reorder_free(struct rte_reorder_buffer *b) +{ + struct rte_reorder_list *reorder_list; + struct rte_tailq_entry *te; + + /* Check user arguments. */ + if (b == NULL) + return; + + reorder_list = RTE_TAILQ_CAST(rte_reorder_tailq.head, rte_reorder_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find our tailq entry */ + TAILQ_FOREACH(te, reorder_list, next) { + if (te->data == (void *) b) + break; + } + if (te == NULL) { + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return; + } + + TAILQ_REMOVE(reorder_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_reorder_free_mbufs(b); + + rte_free(b); + rte_free(te); +} + +struct rte_reorder_buffer * +rte_reorder_find_existing(const char *name) +{ + struct rte_reorder_buffer *b = NULL; + struct rte_tailq_entry *te; + struct rte_reorder_list *reorder_list; + + reorder_list = RTE_TAILQ_CAST(rte_reorder_tailq.head, rte_reorder_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, reorder_list, next) { + b = (struct rte_reorder_buffer *) te->data; + if (strncmp(name, b->name, RTE_REORDER_NAMESIZE) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + + return b; +} + +static unsigned +rte_reorder_fill_overflow(struct rte_reorder_buffer *b, unsigned n) +{ + /* + * 1. Move all ready entries that fit to the ready_buf + * 2. check if we meet the minimum needed (n). + * 3. If not, then skip any gaps and keep moving. + * 4. If at any point the ready buffer is full, stop + * 5. Return the number of positions the order_buf head has moved + */ + + struct cir_buffer *order_buf = &b->order_buf, + *ready_buf = &b->ready_buf; + + unsigned int order_head_adv = 0; + + /* + * move at least n packets to ready buffer, assuming ready buffer + * has room for those packets. 
+ */ + while (order_head_adv < n && + ((ready_buf->head + 1) & ready_buf->mask) != ready_buf->tail) { + + /* if we are blocked waiting on a packet, skip it */ + if (order_buf->entries[order_buf->head] == NULL) { + order_buf->head = (order_buf->head + 1) & order_buf->mask; + order_head_adv++; + } + + /* Move all ready entries that fit to the ready_buf */ + while (order_buf->entries[order_buf->head] != NULL) { + ready_buf->entries[ready_buf->head] = + order_buf->entries[order_buf->head]; + + order_buf->entries[order_buf->head] = NULL; + order_head_adv++; + + order_buf->head = (order_buf->head + 1) & order_buf->mask; + + if (((ready_buf->head + 1) & ready_buf->mask) == ready_buf->tail) + break; + + ready_buf->head = (ready_buf->head + 1) & ready_buf->mask; + } + } + + b->min_seqn += order_head_adv; + /* Return the number of positions the order_buf head has moved */ + return order_head_adv; +} + +int +rte_reorder_insert(struct rte_reorder_buffer *b, struct rte_mbuf *mbuf) +{ + uint32_t offset, position; + struct cir_buffer *order_buf = &b->order_buf; + + if (!b->is_initialized) { + b->min_seqn = mbuf->seqn; + b->is_initialized = 1; + } + + /* + * calculate the offset from the head pointer we need to go. + * The subtraction takes care of the sequence number wrapping. + * For example (using 16-bit for brevity): + * min_seqn = 0xFFFD + * mbuf_seqn = 0x0010 + * offset = 0x0010 - 0xFFFD = 0x13 + */ + offset = mbuf->seqn - b->min_seqn; + + /* + * action to take depends on offset. + * offset < buffer->size: the mbuf fits within the current window of + * sequence numbers we can reorder. EXPECTED CASE. + * offset > buffer->size: the mbuf is outside the current window. There + * are a number of cases to consider: + * 1. The packet sequence is just outside the window, then we need + * to see about shifting the head pointer and taking any ready + * to return packets out of the ring. If there was a delayed + * or dropped packet preventing drains from shifting the window + * this case will skip over the dropped packet instead, and any + * packets dequeued here will be returned on the next drain call. + * 2. The packet sequence number is vastly outside our window, taken + * here as having offset greater than twice the buffer size. In + * this case, the packet is probably an old or late packet that + * was previously skipped, so just enqueue the packet for + * immediate return on the next drain call, or else return error. 
+	 */
+	if (offset < b->order_buf.size) {
+		position = (order_buf->head + offset) & order_buf->mask;
+		order_buf->entries[position] = mbuf;
+	} else if (offset < 2 * b->order_buf.size) {
+		if (rte_reorder_fill_overflow(b, offset + 1 - order_buf->size)
+				< (offset + 1 - order_buf->size)) {
+			/* Put in handling for enqueue straight to output */
+			rte_errno = ENOSPC;
+			return -1;
+		}
+		offset = mbuf->seqn - b->min_seqn;
+		position = (order_buf->head + offset) & order_buf->mask;
+		order_buf->entries[position] = mbuf;
+	} else {
+		/* Put in handling for enqueue straight to output */
+		rte_errno = ERANGE;
+		return -1;
+	}
+	return 0;
+}
+
+unsigned int
+rte_reorder_drain(struct rte_reorder_buffer *b, struct rte_mbuf **mbufs,
+		unsigned max_mbufs)
+{
+	unsigned int drain_cnt = 0;
+
+	struct cir_buffer *order_buf = &b->order_buf,
+			*ready_buf = &b->ready_buf;
+
+	/* Try to fetch requested number of mbufs from ready buffer */
+	while ((drain_cnt < max_mbufs) && (ready_buf->tail != ready_buf->head)) {
+		mbufs[drain_cnt++] = ready_buf->entries[ready_buf->tail];
+		ready_buf->tail = (ready_buf->tail + 1) & ready_buf->mask;
+	}
+
+	/*
+	 * If requested number of buffers not fetched from ready buffer, fetch
+	 * remaining buffers from order buffer
+	 */
+	while ((drain_cnt < max_mbufs) &&
+			(order_buf->entries[order_buf->head] != NULL)) {
+		mbufs[drain_cnt++] = order_buf->entries[order_buf->head];
+		order_buf->entries[order_buf->head] = NULL;
+		b->min_seqn++;
+		order_buf->head = (order_buf->head + 1) & order_buf->mask;
+	}
+
+	return drain_cnt;
+}
diff --git a/src/spdk/dpdk/lib/librte_reorder/rte_reorder.h b/src/spdk/dpdk/lib/librte_reorder/rte_reorder.h
new file mode 100644
index 00000000..1bcc2e32
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_reorder/rte_reorder.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifndef _RTE_REORDER_H_
+#define _RTE_REORDER_H_
+
+/**
+ * @file
+ * RTE reorder
+ *
+ * The reorder library provides ordering of out-of-order packets,
+ * based on the sequence number present in the mbuf.
+ *
+ */
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rte_reorder_buffer;
+
+/**
+ * Create a new reorder buffer instance
+ *
+ * Allocate memory and initialize a new reorder buffer in that
+ * memory, returning the reorder buffer pointer to the user
+ *
+ * @param name
+ *   The name to be given to the reorder buffer instance.
+ * @param socket_id
+ *   The NUMA node on which the memory for the reorder buffer
+ *   instance is to be reserved.
+ * @param size
+ *   Max number of elements that can be stored in the reorder buffer
+ * @return
+ *   The initialized reorder buffer instance, or NULL on error
+ *   In case of error, rte_errno will be set appropriately:
+ *    - ENOMEM - no appropriate memory area found in which to create memzone
+ *    - EINVAL - invalid parameters
+ */
+struct rte_reorder_buffer *
+rte_reorder_create(const char *name, unsigned socket_id, unsigned int size);
+
+/**
+ * Initializes given reorder buffer instance
+ *
+ * @param b
+ *   Reorder buffer instance to initialize
+ * @param bufsize
+ *   Size of the reorder buffer
+ * @param name
+ *   The name to be given to the reorder buffer
+ * @param size
+ *   Number of elements that can be stored in reorder buffer
+ * @return
+ *   The initialized reorder buffer instance, or NULL on error
+ *   In case of error, rte_errno will be set appropriately:
+ *    - EINVAL - invalid parameters
+ */
+struct rte_reorder_buffer *
+rte_reorder_init(struct rte_reorder_buffer *b, unsigned int bufsize,
+		const char *name, unsigned int size);
+
+/**
+ * Find an existing reorder buffer instance
+ * and return a pointer to it.
+ *
+ * @param name
+ *   Name of the reorder buffer instance as passed to rte_reorder_create()
+ * @return
+ *   Pointer to reorder buffer instance or NULL if object not found with rte_errno
+ *   set appropriately. Possible rte_errno values include:
+ *    - ENOENT - required entry not found in the reorder instance list.
+ */
+struct rte_reorder_buffer *
+rte_reorder_find_existing(const char *name);
+
+/**
+ * Reset the given reorder buffer instance with initial values.
+ *
+ * @param b
+ *   Reorder buffer instance which has to be reset
+ */
+void
+rte_reorder_reset(struct rte_reorder_buffer *b);
+
+/**
+ * Free reorder buffer instance.
+ *
+ * @param b
+ *   reorder buffer instance
+ * @return
+ *   None
+ */
+void
+rte_reorder_free(struct rte_reorder_buffer *b);
+
+/**
+ * Insert given mbuf in reorder buffer in its correct position
+ *
+ * The given mbuf is to be reordered relative to other mbufs in the system.
+ * The mbuf must contain a sequence number which is then used to place
+ * the buffer in the correct position in the reorder buffer. Reordered
+ * packets can later be taken from the buffer using the rte_reorder_drain()
+ * API.
+ *
+ * @param b
+ *   Reorder buffer where the mbuf has to be inserted.
+ * @param mbuf
+ *   mbuf of packet that needs to be inserted in reorder buffer.
+ * @return
+ *   0 on success
+ *   -1 on error
+ *   In case of error, rte_errno will be set appropriately:
+ *    - ENOSPC - Cannot move existing mbufs from reorder buffer to accommodate
+ *      the early mbuf, but it can be accommodated by performing drain and then insert.
+ *    - ERANGE - Too early or late mbuf which is vastly out of range of expected
+ *      window should be ignored without any handling.
+ */
+int
+rte_reorder_insert(struct rte_reorder_buffer *b, struct rte_mbuf *mbuf);
+
+/**
+ * Fetch reordered buffers
+ *
+ * Returns a set of in-order buffers from the reorder buffer structure. Gaps
+ * may be present in the sequence numbers of the mbuf if packets have been
+ * delayed too long before reaching the reorder window, or have been previously
+ * dropped by the system.
+ *
+ * @param b
+ *   Reorder buffer instance from which packets are to be drained
+ * @param mbufs
+ *   array of mbufs where reordered packets will be inserted from reorder buffer
+ * @param max_mbufs
+ *   the number of elements in the mbufs array.
+ * @return
+ *   number of mbuf pointers written to mbufs. 0 <= N <= max_mbufs.
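+ *
+ * A minimal usage sketch (pkts/nb_rx stand for a received burst and are
+ * placeholders; buffer and burst sizes are arbitrary; error handling omitted):
+ *
+ *   struct rte_reorder_buffer *bufr =
+ *       rte_reorder_create("reorder_buf", rte_socket_id(), 8192);
+ *   struct rte_mbuf *out[64];
+ *   unsigned int i, nb;
+ *
+ *   for (i = 0; i < nb_rx; i++)
+ *       rte_reorder_insert(bufr, pkts[i]); // pkts[i]->seqn set by the producer
+ *   nb = rte_reorder_drain(bufr, out, 64); // out[0..nb-1] are now in order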
+ */ +unsigned int +rte_reorder_drain(struct rte_reorder_buffer *b, struct rte_mbuf **mbufs, + unsigned max_mbufs); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_REORDER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_reorder/rte_reorder_version.map b/src/spdk/dpdk/lib/librte_reorder/rte_reorder_version.map new file mode 100644 index 00000000..0a8a54de --- /dev/null +++ b/src/spdk/dpdk/lib/librte_reorder/rte_reorder_version.map @@ -0,0 +1,13 @@ +DPDK_2.0 { + global: + + rte_reorder_create; + rte_reorder_init; + rte_reorder_find_existing; + rte_reorder_reset; + rte_reorder_free; + rte_reorder_insert; + rte_reorder_drain; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_ring/Makefile b/src/spdk/dpdk/lib/librte_ring/Makefile new file mode 100644 index 00000000..21a36770 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ring/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_ring.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_ring_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_RING) := rte_ring.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_RING)-include := rte_ring.h \ + rte_ring_generic.h \ + rte_ring_c11_mem.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_ring/meson.build b/src/spdk/dpdk/lib/librte_ring/meson.build new file mode 100644 index 00000000..ca8a435e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ring/meson.build @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_ring.c') +headers = files('rte_ring.h', + 'rte_ring_c11_mem.h', + 'rte_ring_generic.h') diff --git a/src/spdk/dpdk/lib/librte_ring/rte_ring.c b/src/spdk/dpdk/lib/librte_ring/rte_ring.c new file mode 100644 index 00000000..d215acec --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ring/rte_ring.c @@ -0,0 +1,282 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2010-2015 Intel Corporation + * Copyright (c) 2007,2008 Kip Macy kmacy@freebsd.org + * All rights reserved. + * Derived from FreeBSD's bufring.h + * Used as BSD-3 Licensed with permission from Kip Macy. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_ring.h" + +TAILQ_HEAD(rte_ring_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_ring_tailq = { + .name = RTE_TAILQ_RING_NAME, +}; +EAL_REGISTER_TAILQ(rte_ring_tailq) + +/* true if x is a power of 2 */ +#define POWEROF2(x) ((((x)-1) & (x)) == 0) + +/* return the size of memory occupied by a ring */ +ssize_t +rte_ring_get_memsize(unsigned count) +{ + ssize_t sz; + + /* count must be a power of 2 */ + if ((!POWEROF2(count)) || (count > RTE_RING_SZ_MASK )) { + RTE_LOG(ERR, RING, + "Requested size is invalid, must be power of 2, and " + "do not exceed the size limit %u\n", RTE_RING_SZ_MASK); + return -EINVAL; + } + + sz = sizeof(struct rte_ring) + count * sizeof(void *); + sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE); + return sz; +} + +int +rte_ring_init(struct rte_ring *r, const char *name, unsigned count, + unsigned flags) +{ + int ret; + + /* compilation-time checks */ + RTE_BUILD_BUG_ON((sizeof(struct rte_ring) & + RTE_CACHE_LINE_MASK) != 0); + RTE_BUILD_BUG_ON((offsetof(struct rte_ring, cons) & + RTE_CACHE_LINE_MASK) != 0); + RTE_BUILD_BUG_ON((offsetof(struct rte_ring, prod) & + RTE_CACHE_LINE_MASK) != 0); + + /* init the ring structure */ + memset(r, 0, sizeof(*r)); + ret = snprintf(r->name, sizeof(r->name), "%s", name); + if (ret < 0 || ret >= (int)sizeof(r->name)) + return -ENAMETOOLONG; + r->flags = flags; + r->prod.single = (flags & RING_F_SP_ENQ) ? __IS_SP : __IS_MP; + r->cons.single = (flags & RING_F_SC_DEQ) ? __IS_SC : __IS_MC; + + if (flags & RING_F_EXACT_SZ) { + r->size = rte_align32pow2(count + 1); + r->mask = r->size - 1; + r->capacity = count; + } else { + if ((!POWEROF2(count)) || (count > RTE_RING_SZ_MASK)) { + RTE_LOG(ERR, RING, + "Requested size is invalid, must be power of 2, and not exceed the size limit %u\n", + RTE_RING_SZ_MASK); + return -EINVAL; + } + r->size = count; + r->mask = count - 1; + r->capacity = r->mask; + } + r->prod.head = r->cons.head = 0; + r->prod.tail = r->cons.tail = 0; + + return 0; +} + +/* create the ring */ +struct rte_ring * +rte_ring_create(const char *name, unsigned count, int socket_id, + unsigned flags) +{ + char mz_name[RTE_MEMZONE_NAMESIZE]; + struct rte_ring *r; + struct rte_tailq_entry *te; + const struct rte_memzone *mz; + ssize_t ring_size; + int mz_flags = 0; + struct rte_ring_list* ring_list = NULL; + const unsigned int requested_count = count; + int ret; + + ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list); + + /* for an exact size ring, round up from count to a power of two */ + if (flags & RING_F_EXACT_SZ) + count = rte_align32pow2(count + 1); + + ring_size = rte_ring_get_memsize(count); + if (ring_size < 0) { + rte_errno = ring_size; + return NULL; + } + + ret = snprintf(mz_name, sizeof(mz_name), "%s%s", + RTE_RING_MZ_PREFIX, name); + if (ret < 0 || ret >= (int)sizeof(mz_name)) { + rte_errno = ENAMETOOLONG; + return NULL; + } + + te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, RING, "Cannot reserve memory for tailq\n"); + rte_errno = ENOMEM; + return NULL; + } + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* reserve a memory zone for this ring. 
If we can't get rte_config or
+	 * we are a secondary process, the memzone_reserve function will set
+	 * rte_errno for us appropriately - hence no check in this function */
+	mz = rte_memzone_reserve_aligned(mz_name, ring_size, socket_id,
+					 mz_flags, __alignof__(*r));
+	if (mz != NULL) {
+		r = mz->addr;
+		/* no need to check return value here, we already checked the
+		 * arguments above */
+		rte_ring_init(r, name, requested_count, flags);
+
+		te->data = (void *) r;
+		r->memzone = mz;
+
+		TAILQ_INSERT_TAIL(ring_list, te, next);
+	} else {
+		r = NULL;
+		RTE_LOG(ERR, RING, "Cannot reserve memory\n");
+		rte_free(te);
+	}
+	rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+
+	return r;
+}
+
+/* free the ring */
+void
+rte_ring_free(struct rte_ring *r)
+{
+	struct rte_ring_list *ring_list = NULL;
+	struct rte_tailq_entry *te;
+
+	if (r == NULL)
+		return;
+
+	/*
+	 * Ring was not created with rte_ring_create,
+	 * therefore, there is no memzone to free.
+	 */
+	if (r->memzone == NULL) {
+		RTE_LOG(ERR, RING, "Cannot free ring (not created with rte_ring_create())\n");
+		return;
+	}
+
+	if (rte_memzone_free(r->memzone) != 0) {
+		RTE_LOG(ERR, RING, "Cannot free memory\n");
+		return;
+	}
+
+	ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list);
+	rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
+
+	/* find our tailq entry */
+	TAILQ_FOREACH(te, ring_list, next) {
+		if (te->data == (void *) r)
+			break;
+	}
+
+	if (te == NULL) {
+		rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+		return;
+	}
+
+	TAILQ_REMOVE(ring_list, te, next);
+
+	rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+
+	rte_free(te);
+}
+
+/* dump the status of the ring on the console */
+void
+rte_ring_dump(FILE *f, const struct rte_ring *r)
+{
+	fprintf(f, "ring <%s>@%p\n", r->name, r);
+	fprintf(f, "  flags=%x\n", r->flags);
+	fprintf(f, "  size=%"PRIu32"\n", r->size);
+	fprintf(f, "  capacity=%"PRIu32"\n", r->capacity);
+	fprintf(f, "  ct=%"PRIu32"\n", r->cons.tail);
+	fprintf(f, "  ch=%"PRIu32"\n", r->cons.head);
+	fprintf(f, "  pt=%"PRIu32"\n", r->prod.tail);
+	fprintf(f, "  ph=%"PRIu32"\n", r->prod.head);
+	fprintf(f, "  used=%u\n", rte_ring_count(r));
+	fprintf(f, "  avail=%u\n", rte_ring_free_count(r));
}
+
+/* dump the status of all rings on the console */
+void
+rte_ring_list_dump(FILE *f)
+{
+	const struct rte_tailq_entry *te;
+	struct rte_ring_list *ring_list;
+
+	ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list);
+
+	rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK);
+
+	TAILQ_FOREACH(te, ring_list, next) {
+		rte_ring_dump(f, (struct rte_ring *) te->data);
+	}
+
+	rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK);
+}
+
+/* search a ring from its name */
+struct rte_ring *
+rte_ring_lookup(const char *name)
+{
+	struct rte_tailq_entry *te;
+	struct rte_ring *r = NULL;
+	struct rte_ring_list *ring_list;
+
+	ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list);
+
+	rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK);
+
+	TAILQ_FOREACH(te, ring_list, next) {
+		r = (struct rte_ring *) te->data;
+		if (strncmp(name, r->name, RTE_RING_NAMESIZE) == 0)
+			break;
+	}
+
+	rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK);
+
+	if (te == NULL) {
+		rte_errno = ENOENT;
+		return NULL;
+	}
+
+	return r;
+}
diff --git a/src/spdk/dpdk/lib/librte_ring/rte_ring.h b/src/spdk/dpdk/lib/librte_ring/rte_ring.h
new file mode 100644
index 00000000..7a731d07
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_ring/rte_ring.h
@@ -0,0 +1,945 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2010-2017 Intel Corporation
+ * Copyright (c) 2007-2009 Kip Macy
kmacy@freebsd.org + * All rights reserved. + * Derived from FreeBSD's bufring.h + * Used as BSD-3 Licensed with permission from Kip Macy. + */ + +#ifndef _RTE_RING_H_ +#define _RTE_RING_H_ + +/** + * @file + * RTE Ring + * + * The Ring Manager is a fixed-size queue, implemented as a table of + * pointers. Head and tail pointers are modified atomically, allowing + * concurrent access to it. It has the following features: + * + * - FIFO (First In First Out) + * - Maximum size is fixed; the pointers are stored in a table. + * - Lockless implementation. + * - Multi- or single-consumer dequeue. + * - Multi- or single-producer enqueue. + * - Bulk dequeue. + * - Bulk enqueue. + * + * Note: the ring implementation is not preemptible. Refer to Programmer's + * guide/Environment Abstraction Layer/Multiple pthread/Known Issues/rte_ring + * for more information. + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RTE_TAILQ_RING_NAME "RTE_RING" + +enum rte_ring_queue_behavior { + RTE_RING_QUEUE_FIXED = 0, /* Enq/Deq a fixed number of items from a ring */ + RTE_RING_QUEUE_VARIABLE /* Enq/Deq as many items as possible from ring */ +}; + +#define RTE_RING_MZ_PREFIX "RG_" +/**< The maximum length of a ring name. */ +#define RTE_RING_NAMESIZE (RTE_MEMZONE_NAMESIZE - \ + sizeof(RTE_RING_MZ_PREFIX) + 1) + +struct rte_memzone; /* forward declaration, so as not to require memzone.h */ + +/* structure to hold a pair of head/tail values and other metadata */ +struct rte_ring_headtail { + volatile uint32_t head; /**< Prod/consumer head. */ + volatile uint32_t tail; /**< Prod/consumer tail. */ + uint32_t single; /**< True if single prod/cons */ +}; + +/** + * An RTE ring structure. + * + * The producer and the consumer have a head and a tail index. The particularity + * of these index is that they are not between 0 and size(ring). These indexes + * are between 0 and 2^32, and we mask their value when we access the ring[] + * field. Thanks to this assumption, we can do subtractions between 2 index + * values in a modulo-32bit base: that's why the overflow of the indexes is not + * a problem. + */ +struct rte_ring { + /* + * Note: this field kept the RTE_MEMZONE_NAMESIZE size due to ABI + * compatibility requirements, it could be changed to RTE_RING_NAMESIZE + * next time the ABI changes + */ + char name[RTE_MEMZONE_NAMESIZE] __rte_cache_aligned; /**< Name of the ring. */ + int flags; /**< Flags supplied at creation. */ + const struct rte_memzone *memzone; + /**< Memzone, if any, containing the rte_ring */ + uint32_t size; /**< Size of ring. */ + uint32_t mask; /**< Mask (size-1) of ring. */ + uint32_t capacity; /**< Usable size of ring */ + + char pad0 __rte_cache_aligned; /**< empty cache line */ + + /** Ring producer status. */ + struct rte_ring_headtail prod __rte_cache_aligned; + char pad1 __rte_cache_aligned; /**< empty cache line */ + + /** Ring consumer status. */ + struct rte_ring_headtail cons __rte_cache_aligned; + char pad2 __rte_cache_aligned; /**< empty cache line */ +}; + +#define RING_F_SP_ENQ 0x0001 /**< The default enqueue is "single-producer". */ +#define RING_F_SC_DEQ 0x0002 /**< The default dequeue is "single-consumer". */ +/** + * Ring is to hold exactly requested number of entries. + * Without this flag set, the ring size requested must be a power of 2, and the + * usable space will be that size - 1. 
With the flag, the requested size will + * be rounded up to the next power of two, but the usable space will be exactly + * that requested. Worst case, if a power-of-2 size is requested, half the + * ring space will be wasted. + */ +#define RING_F_EXACT_SZ 0x0004 +#define RTE_RING_SZ_MASK (0x7fffffffU) /**< Ring size mask */ + +/* @internal defines for passing to the enqueue dequeue worker functions */ +#define __IS_SP 1 +#define __IS_MP 0 +#define __IS_SC 1 +#define __IS_MC 0 + +/** + * Calculate the memory size needed for a ring + * + * This function returns the number of bytes needed for a ring, given + * the number of elements in it. This value is the sum of the size of + * the structure rte_ring and the size of the memory needed by the + * objects pointers. The value is aligned to a cache line size. + * + * @param count + * The number of elements in the ring (must be a power of 2). + * @return + * - The memory size needed for the ring on success. + * - -EINVAL if count is not a power of 2. + */ +ssize_t rte_ring_get_memsize(unsigned count); + +/** + * Initialize a ring structure. + * + * Initialize a ring structure in memory pointed by "r". The size of the + * memory area must be large enough to store the ring structure and the + * object table. It is advised to use rte_ring_get_memsize() to get the + * appropriate size. + * + * The ring size is set to *count*, which must be a power of two. Water + * marking is disabled by default. The real usable ring size is + * *count-1* instead of *count* to differentiate a free ring from an + * empty ring. + * + * The ring is not added in RTE_TAILQ_RING global list. Indeed, the + * memory given by the caller may not be shareable among dpdk + * processes. + * + * @param r + * The pointer to the ring structure followed by the objects table. + * @param name + * The name of the ring. + * @param count + * The number of elements in the ring (must be a power of 2). + * @param flags + * An OR of the following: + * - RING_F_SP_ENQ: If this flag is set, the default behavior when + * using ``rte_ring_enqueue()`` or ``rte_ring_enqueue_bulk()`` + * is "single-producer". Otherwise, it is "multi-producers". + * - RING_F_SC_DEQ: If this flag is set, the default behavior when + * using ``rte_ring_dequeue()`` or ``rte_ring_dequeue_bulk()`` + * is "single-consumer". Otherwise, it is "multi-consumers". + * @return + * 0 on success, or a negative value on error. + */ +int rte_ring_init(struct rte_ring *r, const char *name, unsigned count, + unsigned flags); + +/** + * Create a new ring named *name* in memory. + * + * This function uses ``memzone_reserve()`` to allocate memory. Then it + * calls rte_ring_init() to initialize an empty ring. + * + * The new ring size is set to *count*, which must be a power of + * two. Water marking is disabled by default. The real usable ring size + * is *count-1* instead of *count* to differentiate a free ring from an + * empty ring. + * + * The ring is added in RTE_TAILQ_RING list. + * + * @param name + * The name of the ring. + * @param count + * The size of the ring (must be a power of 2). + * @param socket_id + * The *socket_id* argument is the socket identifier in case of + * NUMA. The value can be *SOCKET_ID_ANY* if there is no NUMA + * constraint for the reserved zone. + * @param flags + * An OR of the following: + * - RING_F_SP_ENQ: If this flag is set, the default behavior when + * using ``rte_ring_enqueue()`` or ``rte_ring_enqueue_bulk()`` + * is "single-producer". Otherwise, it is "multi-producers". 
+ * - RING_F_SC_DEQ: If this flag is set, the default behavior when + * using ``rte_ring_dequeue()`` or ``rte_ring_dequeue_bulk()`` + * is "single-consumer". Otherwise, it is "multi-consumers". + * @return + * On success, the pointer to the new allocated ring. NULL on error with + * rte_errno set appropriately. Possible errno values include: + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure + * - E_RTE_SECONDARY - function was called from a secondary process instance + * - EINVAL - count provided is not a power of 2 + * - ENOSPC - the maximum number of memzones has already been allocated + * - EEXIST - a memzone with the same name already exists + * - ENOMEM - no appropriate memory area found in which to create memzone + */ +struct rte_ring *rte_ring_create(const char *name, unsigned count, + int socket_id, unsigned flags); +/** + * De-allocate all memory used by the ring. + * + * @param r + * Ring to free + */ +void rte_ring_free(struct rte_ring *r); + +/** + * Dump the status of the ring to a file. + * + * @param f + * A pointer to a file for output + * @param r + * A pointer to the ring structure. + */ +void rte_ring_dump(FILE *f, const struct rte_ring *r); + +/* the actual enqueue of pointers on the ring. + * Placed here since identical code needed in both + * single and multi producer enqueue functions */ +#define ENQUEUE_PTRS(r, ring_start, prod_head, obj_table, n, obj_type) do { \ + unsigned int i; \ + const uint32_t size = (r)->size; \ + uint32_t idx = prod_head & (r)->mask; \ + obj_type *ring = (obj_type *)ring_start; \ + if (likely(idx + n < size)) { \ + for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \ + ring[idx] = obj_table[i]; \ + ring[idx+1] = obj_table[i+1]; \ + ring[idx+2] = obj_table[i+2]; \ + ring[idx+3] = obj_table[i+3]; \ + } \ + switch (n & 0x3) { \ + case 3: \ + ring[idx++] = obj_table[i++]; /* fallthrough */ \ + case 2: \ + ring[idx++] = obj_table[i++]; /* fallthrough */ \ + case 1: \ + ring[idx++] = obj_table[i++]; \ + } \ + } else { \ + for (i = 0; idx < size; i++, idx++)\ + ring[idx] = obj_table[i]; \ + for (idx = 0; i < n; i++, idx++) \ + ring[idx] = obj_table[i]; \ + } \ +} while (0) + +/* the actual copy of pointers on the ring to obj_table. + * Placed here since identical code needed in both + * single and multi consumer dequeue functions */ +#define DEQUEUE_PTRS(r, ring_start, cons_head, obj_table, n, obj_type) do { \ + unsigned int i; \ + uint32_t idx = cons_head & (r)->mask; \ + const uint32_t size = (r)->size; \ + obj_type *ring = (obj_type *)ring_start; \ + if (likely(idx + n < size)) { \ + for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\ + obj_table[i] = ring[idx]; \ + obj_table[i+1] = ring[idx+1]; \ + obj_table[i+2] = ring[idx+2]; \ + obj_table[i+3] = ring[idx+3]; \ + } \ + switch (n & 0x3) { \ + case 3: \ + obj_table[i++] = ring[idx++]; /* fallthrough */ \ + case 2: \ + obj_table[i++] = ring[idx++]; /* fallthrough */ \ + case 1: \ + obj_table[i++] = ring[idx++]; \ + } \ + } else { \ + for (i = 0; idx < size; i++, idx++) \ + obj_table[i] = ring[idx]; \ + for (idx = 0; i < n; i++, idx++) \ + obj_table[i] = ring[idx]; \ + } \ +} while (0) + +/* Between load and load. there might be cpu reorder in weak model + * (powerpc/arm). + * There are 2 choices for the users + * 1.use rmb() memory barrier + * 2.use one-direcion load_acquire/store_release barrier,defined by + * CONFIG_RTE_RING_USE_C11_MEM_MODEL=y + * It depends on performance test results. 
+ * By default, move common functions to rte_ring_generic.h + */ +#ifdef RTE_RING_USE_C11_MEM_MODEL +#include "rte_ring_c11_mem.h" +#else +#include "rte_ring_generic.h" +#endif + +/** + * @internal Enqueue several objects on the ring + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the ring from the obj_table. + * @param behavior + * RTE_RING_QUEUE_FIXED: Enqueue a fixed number of items from a ring + * RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible from ring + * @param is_sp + * Indicates whether to use single producer or multi-producer head update + * @param free_space + * returns the amount of space after the enqueue operation has finished + * @return + * Actual number of objects enqueued. + * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only. + */ +static __rte_always_inline unsigned int +__rte_ring_do_enqueue(struct rte_ring *r, void * const *obj_table, + unsigned int n, enum rte_ring_queue_behavior behavior, + unsigned int is_sp, unsigned int *free_space) +{ + uint32_t prod_head, prod_next; + uint32_t free_entries; + + n = __rte_ring_move_prod_head(r, is_sp, n, behavior, + &prod_head, &prod_next, &free_entries); + if (n == 0) + goto end; + + ENQUEUE_PTRS(r, &r[1], prod_head, obj_table, n, void *); + + update_tail(&r->prod, prod_head, prod_next, is_sp, 1); +end: + if (free_space != NULL) + *free_space = free_entries - n; + return n; +} + +/** + * @internal Dequeue several objects from the ring + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to pull from the ring. + * @param behavior + * RTE_RING_QUEUE_FIXED: Dequeue a fixed number of items from a ring + * RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring + * @param is_sc + * Indicates whether to use single consumer or multi-consumer head update + * @param available + * returns the number of remaining ring entries after the dequeue has finished + * @return + * - Actual number of objects dequeued. + * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only. + */ +static __rte_always_inline unsigned int +__rte_ring_do_dequeue(struct rte_ring *r, void **obj_table, + unsigned int n, enum rte_ring_queue_behavior behavior, + unsigned int is_sc, unsigned int *available) +{ + uint32_t cons_head, cons_next; + uint32_t entries; + + n = __rte_ring_move_cons_head(r, (int)is_sc, n, behavior, + &cons_head, &cons_next, &entries); + if (n == 0) + goto end; + + DEQUEUE_PTRS(r, &r[1], cons_head, obj_table, n, void *); + + update_tail(&r->cons, cons_head, cons_next, is_sc, 0); + +end: + if (available != NULL) + *available = entries - n; + return n; +} + +/** + * Enqueue several objects on the ring (multi-producers safe). + * + * This function uses a "compare and set" instruction to move the + * producer index atomically. + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the ring from the obj_table. + * @param free_space + * if non-NULL, returns the amount of space in the ring after the + * enqueue operation has finished. 
+ * @return + * The number of objects enqueued, either 0 or n + */ +static __rte_always_inline unsigned int +rte_ring_mp_enqueue_bulk(struct rte_ring *r, void * const *obj_table, + unsigned int n, unsigned int *free_space) +{ + return __rte_ring_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_FIXED, + __IS_MP, free_space); +} + +/** + * Enqueue several objects on a ring (NOT multi-producers safe). + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the ring from the obj_table. + * @param free_space + * if non-NULL, returns the amount of space in the ring after the + * enqueue operation has finished. + * @return + * The number of objects enqueued, either 0 or n + */ +static __rte_always_inline unsigned int +rte_ring_sp_enqueue_bulk(struct rte_ring *r, void * const *obj_table, + unsigned int n, unsigned int *free_space) +{ + return __rte_ring_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_FIXED, + __IS_SP, free_space); +} + +/** + * Enqueue several objects on a ring. + * + * This function calls the multi-producer or the single-producer + * version depending on the default behavior that was specified at + * ring creation time (see flags). + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the ring from the obj_table. + * @param free_space + * if non-NULL, returns the amount of space in the ring after the + * enqueue operation has finished. + * @return + * The number of objects enqueued, either 0 or n + */ +static __rte_always_inline unsigned int +rte_ring_enqueue_bulk(struct rte_ring *r, void * const *obj_table, + unsigned int n, unsigned int *free_space) +{ + return __rte_ring_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_FIXED, + r->prod.single, free_space); +} + +/** + * Enqueue one object on a ring (multi-producers safe). + * + * This function uses a "compare and set" instruction to move the + * producer index atomically. + * + * @param r + * A pointer to the ring structure. + * @param obj + * A pointer to the object to be added. + * @return + * - 0: Success; objects enqueued. + * - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued. + */ +static __rte_always_inline int +rte_ring_mp_enqueue(struct rte_ring *r, void *obj) +{ + return rte_ring_mp_enqueue_bulk(r, &obj, 1, NULL) ? 0 : -ENOBUFS; +} + +/** + * Enqueue one object on a ring (NOT multi-producers safe). + * + * @param r + * A pointer to the ring structure. + * @param obj + * A pointer to the object to be added. + * @return + * - 0: Success; objects enqueued. + * - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued. + */ +static __rte_always_inline int +rte_ring_sp_enqueue(struct rte_ring *r, void *obj) +{ + return rte_ring_sp_enqueue_bulk(r, &obj, 1, NULL) ? 0 : -ENOBUFS; +} + +/** + * Enqueue one object on a ring. + * + * This function calls the multi-producer or the single-producer + * version, depending on the default behaviour that was specified at + * ring creation time (see flags). + * + * @param r + * A pointer to the ring structure. + * @param obj + * A pointer to the object to be added. + * @return + * - 0: Success; objects enqueued. + * - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued. 
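+ *
+ * A minimal usage sketch (the ring name, size and flags are arbitrary, and
+ * some_object, handle_full_ring and use_object are placeholders; a real
+ * application would also check that creation succeeded):
+ *
+ *   struct rte_ring *ring = rte_ring_create("example_ring", 1024,
+ *                                           rte_socket_id(), 0);
+ *   void *obj = some_object;
+ *
+ *   if (rte_ring_enqueue(ring, obj) != 0)
+ *       handle_full_ring();              // -ENOBUFS: the ring is full
+ *   if (rte_ring_dequeue(ring, &obj) == 0)
+ *       use_object(obj);                 // obj now holds the dequeued pointer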
+ */ +static __rte_always_inline int +rte_ring_enqueue(struct rte_ring *r, void *obj) +{ + return rte_ring_enqueue_bulk(r, &obj, 1, NULL) ? 0 : -ENOBUFS; +} + +/** + * Dequeue several objects from a ring (multi-consumers safe). + * + * This function uses a "compare and set" instruction to move the + * consumer index atomically. + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. + * @param n + * The number of objects to dequeue from the ring to the obj_table. + * @param available + * If non-NULL, returns the number of remaining ring entries after the + * dequeue has finished. + * @return + * The number of objects dequeued, either 0 or n + */ +static __rte_always_inline unsigned int +rte_ring_mc_dequeue_bulk(struct rte_ring *r, void **obj_table, + unsigned int n, unsigned int *available) +{ + return __rte_ring_do_dequeue(r, obj_table, n, RTE_RING_QUEUE_FIXED, + __IS_MC, available); +} + +/** + * Dequeue several objects from a ring (NOT multi-consumers safe). + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. + * @param n + * The number of objects to dequeue from the ring to the obj_table, + * must be strictly positive. + * @param available + * If non-NULL, returns the number of remaining ring entries after the + * dequeue has finished. + * @return + * The number of objects dequeued, either 0 or n + */ +static __rte_always_inline unsigned int +rte_ring_sc_dequeue_bulk(struct rte_ring *r, void **obj_table, + unsigned int n, unsigned int *available) +{ + return __rte_ring_do_dequeue(r, obj_table, n, RTE_RING_QUEUE_FIXED, + __IS_SC, available); +} + +/** + * Dequeue several objects from a ring. + * + * This function calls the multi-consumers or the single-consumer + * version, depending on the default behaviour that was specified at + * ring creation time (see flags). + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. + * @param n + * The number of objects to dequeue from the ring to the obj_table. + * @param available + * If non-NULL, returns the number of remaining ring entries after the + * dequeue has finished. + * @return + * The number of objects dequeued, either 0 or n + */ +static __rte_always_inline unsigned int +rte_ring_dequeue_bulk(struct rte_ring *r, void **obj_table, unsigned int n, + unsigned int *available) +{ + return __rte_ring_do_dequeue(r, obj_table, n, RTE_RING_QUEUE_FIXED, + r->cons.single, available); +} + +/** + * Dequeue one object from a ring (multi-consumers safe). + * + * This function uses a "compare and set" instruction to move the + * consumer index atomically. + * + * @param r + * A pointer to the ring structure. + * @param obj_p + * A pointer to a void * pointer (object) that will be filled. + * @return + * - 0: Success; objects dequeued. + * - -ENOENT: Not enough entries in the ring to dequeue; no object is + * dequeued. + */ +static __rte_always_inline int +rte_ring_mc_dequeue(struct rte_ring *r, void **obj_p) +{ + return rte_ring_mc_dequeue_bulk(r, obj_p, 1, NULL) ? 0 : -ENOENT; +} + +/** + * Dequeue one object from a ring (NOT multi-consumers safe). + * + * @param r + * A pointer to the ring structure. + * @param obj_p + * A pointer to a void * pointer (object) that will be filled. + * @return + * - 0: Success; objects dequeued. 
+ * - -ENOENT: Not enough entries in the ring to dequeue, no object is + * dequeued. + */ +static __rte_always_inline int +rte_ring_sc_dequeue(struct rte_ring *r, void **obj_p) +{ + return rte_ring_sc_dequeue_bulk(r, obj_p, 1, NULL) ? 0 : -ENOENT; +} + +/** + * Dequeue one object from a ring. + * + * This function calls the multi-consumers or the single-consumer + * version depending on the default behaviour that was specified at + * ring creation time (see flags). + * + * @param r + * A pointer to the ring structure. + * @param obj_p + * A pointer to a void * pointer (object) that will be filled. + * @return + * - 0: Success, objects dequeued. + * - -ENOENT: Not enough entries in the ring to dequeue, no object is + * dequeued. + */ +static __rte_always_inline int +rte_ring_dequeue(struct rte_ring *r, void **obj_p) +{ + return rte_ring_dequeue_bulk(r, obj_p, 1, NULL) ? 0 : -ENOENT; +} + +/** + * Return the number of entries in a ring. + * + * @param r + * A pointer to the ring structure. + * @return + * The number of entries in the ring. + */ +static inline unsigned +rte_ring_count(const struct rte_ring *r) +{ + uint32_t prod_tail = r->prod.tail; + uint32_t cons_tail = r->cons.tail; + uint32_t count = (prod_tail - cons_tail) & r->mask; + return (count > r->capacity) ? r->capacity : count; +} + +/** + * Return the number of free entries in a ring. + * + * @param r + * A pointer to the ring structure. + * @return + * The number of free entries in the ring. + */ +static inline unsigned +rte_ring_free_count(const struct rte_ring *r) +{ + return r->capacity - rte_ring_count(r); +} + +/** + * Test if a ring is full. + * + * @param r + * A pointer to the ring structure. + * @return + * - 1: The ring is full. + * - 0: The ring is not full. + */ +static inline int +rte_ring_full(const struct rte_ring *r) +{ + return rte_ring_free_count(r) == 0; +} + +/** + * Test if a ring is empty. + * + * @param r + * A pointer to the ring structure. + * @return + * - 1: The ring is empty. + * - 0: The ring is not empty. + */ +static inline int +rte_ring_empty(const struct rte_ring *r) +{ + return rte_ring_count(r) == 0; +} + +/** + * Return the size of the ring. + * + * @param r + * A pointer to the ring structure. + * @return + * The size of the data store used by the ring. + * NOTE: this is not the same as the usable space in the ring. To query that + * use ``rte_ring_get_capacity()``. + */ +static inline unsigned int +rte_ring_get_size(const struct rte_ring *r) +{ + return r->size; +} + +/** + * Return the number of elements which can be stored in the ring. + * + * @param r + * A pointer to the ring structure. + * @return + * The usable size of the ring. + */ +static inline unsigned int +rte_ring_get_capacity(const struct rte_ring *r) +{ + return r->capacity; +} + +/** + * Dump the status of all rings on the console + * + * @param f + * A pointer to a file for output + */ +void rte_ring_list_dump(FILE *f); + +/** + * Search a ring from its name + * + * @param name + * The name of the ring. + * @return + * The pointer to the ring matching the name, or NULL if not found, + * with rte_errno set appropriately. Possible rte_errno values include: + * - ENOENT - required entry not available to return. + */ +struct rte_ring *rte_ring_lookup(const char *name); + +/** + * Enqueue several objects on the ring (multi-producers safe). + * + * This function uses a "compare and set" instruction to move the + * producer index atomically. + * + * @param r + * A pointer to the ring structure. 
+ * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the ring from the obj_table. + * @param free_space + * if non-NULL, returns the amount of space in the ring after the + * enqueue operation has finished. + * @return + * - n: Actual number of objects enqueued. + */ +static __rte_always_inline unsigned +rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table, + unsigned int n, unsigned int *free_space) +{ + return __rte_ring_do_enqueue(r, obj_table, n, + RTE_RING_QUEUE_VARIABLE, __IS_MP, free_space); +} + +/** + * Enqueue several objects on a ring (NOT multi-producers safe). + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the ring from the obj_table. + * @param free_space + * if non-NULL, returns the amount of space in the ring after the + * enqueue operation has finished. + * @return + * - n: Actual number of objects enqueued. + */ +static __rte_always_inline unsigned +rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table, + unsigned int n, unsigned int *free_space) +{ + return __rte_ring_do_enqueue(r, obj_table, n, + RTE_RING_QUEUE_VARIABLE, __IS_SP, free_space); +} + +/** + * Enqueue several objects on a ring. + * + * This function calls the multi-producer or the single-producer + * version depending on the default behavior that was specified at + * ring creation time (see flags). + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects). + * @param n + * The number of objects to add in the ring from the obj_table. + * @param free_space + * if non-NULL, returns the amount of space in the ring after the + * enqueue operation has finished. + * @return + * - n: Actual number of objects enqueued. + */ +static __rte_always_inline unsigned +rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table, + unsigned int n, unsigned int *free_space) +{ + return __rte_ring_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_VARIABLE, + r->prod.single, free_space); +} + +/** + * Dequeue several objects from a ring (multi-consumers safe). When the request + * objects are more than the available objects, only dequeue the actual number + * of objects + * + * This function uses a "compare and set" instruction to move the + * consumer index atomically. + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. + * @param n + * The number of objects to dequeue from the ring to the obj_table. + * @param available + * If non-NULL, returns the number of remaining ring entries after the + * dequeue has finished. + * @return + * - n: Actual number of objects dequeued, 0 if ring is empty + */ +static __rte_always_inline unsigned +rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table, + unsigned int n, unsigned int *available) +{ + return __rte_ring_do_dequeue(r, obj_table, n, + RTE_RING_QUEUE_VARIABLE, __IS_MC, available); +} + +/** + * Dequeue several objects from a ring (NOT multi-consumers safe).When the + * request objects are more than the available objects, only dequeue the + * actual number of objects + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. 
+ * @param n + * The number of objects to dequeue from the ring to the obj_table. + * @param available + * If non-NULL, returns the number of remaining ring entries after the + * dequeue has finished. + * @return + * - n: Actual number of objects dequeued, 0 if ring is empty + */ +static __rte_always_inline unsigned +rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table, + unsigned int n, unsigned int *available) +{ + return __rte_ring_do_dequeue(r, obj_table, n, + RTE_RING_QUEUE_VARIABLE, __IS_SC, available); +} + +/** + * Dequeue multiple objects from a ring up to a maximum number. + * + * This function calls the multi-consumers or the single-consumer + * version, depending on the default behaviour that was specified at + * ring creation time (see flags). + * + * @param r + * A pointer to the ring structure. + * @param obj_table + * A pointer to a table of void * pointers (objects) that will be filled. + * @param n + * The number of objects to dequeue from the ring to the obj_table. + * @param available + * If non-NULL, returns the number of remaining ring entries after the + * dequeue has finished. + * @return + * - Number of objects dequeued + */ +static __rte_always_inline unsigned +rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table, + unsigned int n, unsigned int *available) +{ + return __rte_ring_do_dequeue(r, obj_table, n, + RTE_RING_QUEUE_VARIABLE, + r->cons.single, available); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_RING_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ring/rte_ring_c11_mem.h b/src/spdk/dpdk/lib/librte_ring/rte_ring_c11_mem.h new file mode 100644 index 00000000..94df3c4a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ring/rte_ring_c11_mem.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2017,2018 HXT-semitech Corporation. + * Copyright (c) 2007-2009 Kip Macy kmacy@freebsd.org + * All rights reserved. + * Derived from FreeBSD's bufring.h + * Used as BSD-3 Licensed with permission from Kip Macy. + */ + +#ifndef _RTE_RING_C11_MEM_H_ +#define _RTE_RING_C11_MEM_H_ + +static __rte_always_inline void +update_tail(struct rte_ring_headtail *ht, uint32_t old_val, uint32_t new_val, + uint32_t single, uint32_t enqueue) +{ + RTE_SET_USED(enqueue); + + /* + * If there are other enqueues/dequeues in progress that preceded us, + * we need to wait for them to complete + */ + if (!single) + while (unlikely(ht->tail != old_val)) + rte_pause(); + + __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE); +} + +/** + * @internal This function updates the producer head for enqueue + * + * @param r + * A pointer to the ring structure + * @param is_sp + * Indicates whether multi-producer path is needed or not + * @param n + * The number of elements we will want to enqueue, i.e. how far should the + * head be moved + * @param behavior + * RTE_RING_QUEUE_FIXED: Enqueue a fixed number of items from a ring + * RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible from ring + * @param old_head + * Returns head value as it was before the move, i.e. where enqueue starts + * @param new_head + * Returns the current/new head value i.e. where enqueue finishes + * @param free_entries + * Returns the amount of free space in the ring BEFORE head was moved + * @return + * Actual number of objects enqueued. + * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only. 
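+ *
+ * Worked example of the unsigned wrap-around arithmetic used below (the
+ * numbers are arbitrary): with capacity = 1024, cons.tail = 0xFFFFFFFE and
+ * a producer head of 6 (the indexes have wrapped past 2^32), the ring holds
+ * 6 - 0xFFFFFFFE = 8 entries modulo 2^32, so
+ * free_entries = capacity + cons.tail - head = 1024 - 8 = 1016.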
+ */ +static __rte_always_inline unsigned int +__rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp, + unsigned int n, enum rte_ring_queue_behavior behavior, + uint32_t *old_head, uint32_t *new_head, + uint32_t *free_entries) +{ + const uint32_t capacity = r->capacity; + unsigned int max = n; + int success; + + do { + /* Reset n to the initial burst count */ + n = max; + + *old_head = __atomic_load_n(&r->prod.head, + __ATOMIC_ACQUIRE); + + /* + * The subtraction is done between two unsigned 32bits value + * (the result is always modulo 32 bits even if we have + * *old_head > cons_tail). So 'free_entries' is always between 0 + * and capacity (which is < size). + */ + *free_entries = (capacity + r->cons.tail - *old_head); + + /* check that we have enough room in ring */ + if (unlikely(n > *free_entries)) + n = (behavior == RTE_RING_QUEUE_FIXED) ? + 0 : *free_entries; + + if (n == 0) + return 0; + + *new_head = *old_head + n; + if (is_sp) + r->prod.head = *new_head, success = 1; + else + success = __atomic_compare_exchange_n(&r->prod.head, + old_head, *new_head, + 0, __ATOMIC_ACQUIRE, + __ATOMIC_RELAXED); + } while (unlikely(success == 0)); + return n; +} + +/** + * @internal This function updates the consumer head for dequeue + * + * @param r + * A pointer to the ring structure + * @param is_sc + * Indicates whether multi-consumer path is needed or not + * @param n + * The number of elements we will want to enqueue, i.e. how far should the + * head be moved + * @param behavior + * RTE_RING_QUEUE_FIXED: Dequeue a fixed number of items from a ring + * RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring + * @param old_head + * Returns head value as it was before the move, i.e. where dequeue starts + * @param new_head + * Returns the current/new head value i.e. where dequeue finishes + * @param entries + * Returns the number of entries in the ring BEFORE head was moved + * @return + * - Actual number of objects dequeued. + * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only. + */ +static __rte_always_inline unsigned int +__rte_ring_move_cons_head(struct rte_ring *r, int is_sc, + unsigned int n, enum rte_ring_queue_behavior behavior, + uint32_t *old_head, uint32_t *new_head, + uint32_t *entries) +{ + unsigned int max = n; + int success; + + /* move cons.head atomically */ + do { + /* Restore n as it may change every loop */ + n = max; + *old_head = __atomic_load_n(&r->cons.head, + __ATOMIC_ACQUIRE); + + /* The subtraction is done between two unsigned 32bits value + * (the result is always modulo 32 bits even if we have + * cons_head > prod_tail). So 'entries' is always between 0 + * and size(ring)-1. + */ + *entries = (r->prod.tail - *old_head); + + /* Set the actual entries for dequeue */ + if (n > *entries) + n = (behavior == RTE_RING_QUEUE_FIXED) ? 
0 : *entries; + + if (unlikely(n == 0)) + return 0; + + *new_head = *old_head + n; + if (is_sc) + r->cons.head = *new_head, success = 1; + else + success = __atomic_compare_exchange_n(&r->cons.head, + old_head, *new_head, + 0, __ATOMIC_ACQUIRE, + __ATOMIC_RELAXED); + } while (unlikely(success == 0)); + return n; +} + +#endif /* _RTE_RING_C11_MEM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ring/rte_ring_generic.h b/src/spdk/dpdk/lib/librte_ring/rte_ring_generic.h new file mode 100644 index 00000000..ea7dbe5b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ring/rte_ring_generic.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2010-2017 Intel Corporation + * Copyright (c) 2007-2009 Kip Macy kmacy@freebsd.org + * All rights reserved. + * Derived from FreeBSD's bufring.h + * Used as BSD-3 Licensed with permission from Kip Macy. + */ + +#ifndef _RTE_RING_GENERIC_H_ +#define _RTE_RING_GENERIC_H_ + +static __rte_always_inline void +update_tail(struct rte_ring_headtail *ht, uint32_t old_val, uint32_t new_val, + uint32_t single, uint32_t enqueue) +{ + if (enqueue) + rte_smp_wmb(); + else + rte_smp_rmb(); + /* + * If there are other enqueues/dequeues in progress that preceded us, + * we need to wait for them to complete + */ + if (!single) + while (unlikely(ht->tail != old_val)) + rte_pause(); + + ht->tail = new_val; +} + +/** + * @internal This function updates the producer head for enqueue + * + * @param r + * A pointer to the ring structure + * @param is_sp + * Indicates whether multi-producer path is needed or not + * @param n + * The number of elements we will want to enqueue, i.e. how far should the + * head be moved + * @param behavior + * RTE_RING_QUEUE_FIXED: Enqueue a fixed number of items from a ring + * RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible from ring + * @param old_head + * Returns head value as it was before the move, i.e. where enqueue starts + * @param new_head + * Returns the current/new head value i.e. where enqueue finishes + * @param free_entries + * Returns the amount of free space in the ring BEFORE head was moved + * @return + * Actual number of objects enqueued. + * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only. + */ +static __rte_always_inline unsigned int +__rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp, + unsigned int n, enum rte_ring_queue_behavior behavior, + uint32_t *old_head, uint32_t *new_head, + uint32_t *free_entries) +{ + const uint32_t capacity = r->capacity; + unsigned int max = n; + int success; + + do { + /* Reset n to the initial burst count */ + n = max; + + *old_head = r->prod.head; + + /* add rmb barrier to avoid load/load reorder in weak + * memory model. It is noop on x86 + */ + rte_smp_rmb(); + + /* + * The subtraction is done between two unsigned 32bits value + * (the result is always modulo 32 bits even if we have + * *old_head > cons_tail). So 'free_entries' is always between 0 + * and capacity (which is < size). + */ + *free_entries = (capacity + r->cons.tail - *old_head); + + /* check that we have enough room in ring */ + if (unlikely(n > *free_entries)) + n = (behavior == RTE_RING_QUEUE_FIXED) ? 
+ 0 : *free_entries; + + if (n == 0) + return 0; + + *new_head = *old_head + n; + if (is_sp) + r->prod.head = *new_head, success = 1; + else + success = rte_atomic32_cmpset(&r->prod.head, + *old_head, *new_head); + } while (unlikely(success == 0)); + return n; +} + +/** + * @internal This function updates the consumer head for dequeue + * + * @param r + * A pointer to the ring structure + * @param is_sc + * Indicates whether multi-consumer path is needed or not + * @param n + * The number of elements we will want to enqueue, i.e. how far should the + * head be moved + * @param behavior + * RTE_RING_QUEUE_FIXED: Dequeue a fixed number of items from a ring + * RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring + * @param old_head + * Returns head value as it was before the move, i.e. where dequeue starts + * @param new_head + * Returns the current/new head value i.e. where dequeue finishes + * @param entries + * Returns the number of entries in the ring BEFORE head was moved + * @return + * - Actual number of objects dequeued. + * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only. + */ +static __rte_always_inline unsigned int +__rte_ring_move_cons_head(struct rte_ring *r, unsigned int is_sc, + unsigned int n, enum rte_ring_queue_behavior behavior, + uint32_t *old_head, uint32_t *new_head, + uint32_t *entries) +{ + unsigned int max = n; + int success; + + /* move cons.head atomically */ + do { + /* Restore n as it may change every loop */ + n = max; + + *old_head = r->cons.head; + + /* add rmb barrier to avoid load/load reorder in weak + * memory model. It is noop on x86 + */ + rte_smp_rmb(); + + /* The subtraction is done between two unsigned 32bits value + * (the result is always modulo 32 bits even if we have + * cons_head > prod_tail). So 'entries' is always between 0 + * and size(ring)-1. + */ + *entries = (r->prod.tail - *old_head); + + /* Set the actual entries for dequeue */ + if (n > *entries) + n = (behavior == RTE_RING_QUEUE_FIXED) ? 
0 : *entries; + + if (unlikely(n == 0)) + return 0; + + *new_head = *old_head + n; + if (is_sc) + r->cons.head = *new_head, success = 1; + else + success = rte_atomic32_cmpset(&r->cons.head, *old_head, + *new_head); + } while (unlikely(success == 0)); + return n; +} + +#endif /* _RTE_RING_GENERIC_H_ */ diff --git a/src/spdk/dpdk/lib/librte_ring/rte_ring_version.map b/src/spdk/dpdk/lib/librte_ring/rte_ring_version.map new file mode 100644 index 00000000..d935efd0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_ring/rte_ring_version.map @@ -0,0 +1,19 @@ +DPDK_2.0 { + global: + + rte_ring_create; + rte_ring_dump; + rte_ring_get_memsize; + rte_ring_init; + rte_ring_list_dump; + rte_ring_lookup; + + local: *; +}; + +DPDK_2.2 { + global: + + rte_ring_free; + +} DPDK_2.0; diff --git a/src/spdk/dpdk/lib/librte_sched/Makefile b/src/spdk/dpdk/lib/librte_sched/Makefile new file mode 100644 index 00000000..55d9c698 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_sched.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) + +CFLAGS_rte_red.o := -D_GNU_SOURCE + +LDLIBS += -lm +LDLIBS += -lrt +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_net +LDLIBS += -lrte_timer + +EXPORT_MAP := rte_sched_version.map + +LIBABIVER := 1 + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_SCHED) += rte_sched.c rte_red.c rte_approx.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_SCHED)-include := rte_sched.h rte_sched_common.h rte_red.h rte_approx.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_sched/meson.build b/src/spdk/dpdk/lib/librte_sched/meson.build new file mode 100644 index 00000000..f85d64df --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/meson.build @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_sched.c', 'rte_red.c', 'rte_approx.c') +headers = files('rte_sched.h', 'rte_sched_common.h', + 'rte_red.h', 'rte_approx.h') +deps += ['mbuf', 'meter'] diff --git a/src/spdk/dpdk/lib/librte_sched/rte_approx.c b/src/spdk/dpdk/lib/librte_sched/rte_approx.c new file mode 100644 index 00000000..30620b83 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/rte_approx.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include + +#include "rte_approx.h" + +/* + * Based on paper "Approximating Rational Numbers by Fractions" by Michal + * Forisek forisek@dcs.fmph.uniba.sk + * + * Given a rational number alpha with 0 < alpha < 1 and a precision d, the goal + * is to find positive integers p, q such that alpha - d < p/q < alpha + d, and + * q is minimal. 
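+ *
+ * For example (an illustrative case, not taken from the paper):
+ * alpha = 0.333 with precision d = 0.005 is approximated by p/q = 1/3,
+ * since 0.328 < 1/3 < 0.338 and no smaller denominator satisfies the bound.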
+ * + * http://people.ksp.sk/~misof/publications/2007approx.pdf + */ + +/* fraction comparison: compare (a/b) and (c/d) */ +static inline uint32_t +less(uint32_t a, uint32_t b, uint32_t c, uint32_t d) +{ + return a*d < b*c; +} + +static inline uint32_t +less_or_equal(uint32_t a, uint32_t b, uint32_t c, uint32_t d) +{ + return a*d <= b*c; +} + +/* check whether a/b is a valid approximation */ +static inline uint32_t +matches(uint32_t a, uint32_t b, + uint32_t alpha_num, uint32_t d_num, uint32_t denum) +{ + if (less_or_equal(a, b, alpha_num - d_num, denum)) + return 0; + + if (less(a ,b, alpha_num + d_num, denum)) + return 1; + + return 0; +} + +static inline void +find_exact_solution_left(uint32_t p_a, uint32_t q_a, uint32_t p_b, uint32_t q_b, + uint32_t alpha_num, uint32_t d_num, uint32_t denum, uint32_t *p, uint32_t *q) +{ + uint32_t k_num = denum * p_b - (alpha_num + d_num) * q_b; + uint32_t k_denum = (alpha_num + d_num) * q_a - denum * p_a; + uint32_t k = (k_num / k_denum) + 1; + + *p = p_b + k * p_a; + *q = q_b + k * q_a; +} + +static inline void +find_exact_solution_right(uint32_t p_a, uint32_t q_a, uint32_t p_b, uint32_t q_b, + uint32_t alpha_num, uint32_t d_num, uint32_t denum, uint32_t *p, uint32_t *q) +{ + uint32_t k_num = - denum * p_b + (alpha_num - d_num) * q_b; + uint32_t k_denum = - (alpha_num - d_num) * q_a + denum * p_a; + uint32_t k = (k_num / k_denum) + 1; + + *p = p_b + k * p_a; + *q = q_b + k * q_a; +} + +static int +find_best_rational_approximation(uint32_t alpha_num, uint32_t d_num, uint32_t denum, uint32_t *p, uint32_t *q) +{ + uint32_t p_a, q_a, p_b, q_b; + + /* check assumptions on the inputs */ + if (!((0 < d_num) && (d_num < alpha_num) && (alpha_num < denum) && (d_num + alpha_num < denum))) { + return -1; + } + + /* set initial bounds for the search */ + p_a = 0; + q_a = 1; + p_b = 1; + q_b = 1; + + while (1) { + uint32_t new_p_a, new_q_a, new_p_b, new_q_b; + uint32_t x_num, x_denum, x; + int aa, bb; + + /* compute the number of steps to the left */ + x_num = denum * p_b - alpha_num * q_b; + x_denum = - denum * p_a + alpha_num * q_a; + x = (x_num + x_denum - 1) / x_denum; /* x = ceil(x_num / x_denum) */ + + /* check whether we have a valid approximation */ + aa = matches(p_b + x * p_a, q_b + x * q_a, alpha_num, d_num, denum); + bb = matches(p_b + (x-1) * p_a, q_b + (x - 1) * q_a, alpha_num, d_num, denum); + if (aa || bb) { + find_exact_solution_left(p_a, q_a, p_b, q_b, alpha_num, d_num, denum, p, q); + return 0; + } + + /* update the interval */ + new_p_a = p_b + (x - 1) * p_a ; + new_q_a = q_b + (x - 1) * q_a; + new_p_b = p_b + x * p_a ; + new_q_b = q_b + x * q_a; + + p_a = new_p_a ; + q_a = new_q_a; + p_b = new_p_b ; + q_b = new_q_b; + + /* compute the number of steps to the right */ + x_num = alpha_num * q_b - denum * p_b; + x_denum = - alpha_num * q_a + denum * p_a; + x = (x_num + x_denum - 1) / x_denum; /* x = ceil(x_num / x_denum) */ + + /* check whether we have a valid approximation */ + aa = matches(p_b + x * p_a, q_b + x * q_a, alpha_num, d_num, denum); + bb = matches(p_b + (x - 1) * p_a, q_b + (x - 1) * q_a, alpha_num, d_num, denum); + if (aa || bb) { + find_exact_solution_right(p_a, q_a, p_b, q_b, alpha_num, d_num, denum, p, q); + return 0; + } + + /* update the interval */ + new_p_a = p_b + (x - 1) * p_a; + new_q_a = q_b + (x - 1) * q_a; + new_p_b = p_b + x * p_a; + new_q_b = q_b + x * q_a; + + p_a = new_p_a; + q_a = new_q_a; + p_b = new_p_b; + q_b = new_q_b; + } +} + +int rte_approx(double alpha, double d, uint32_t *p, uint32_t *q) +{ + uint32_t 
alpha_num, d_num, denum; + + /* Check input arguments */ + if (!((0.0 < d) && (d < alpha) && (alpha < 1.0))) { + return -1; + } + + if ((p == NULL) || (q == NULL)) { + return -2; + } + + /* Compute alpha_num, d_num and denum */ + denum = 1; + while (d < 1) { + alpha *= 10; + d *= 10; + denum *= 10; + } + alpha_num = (uint32_t) alpha; + d_num = (uint32_t) d; + + /* Perform approximation */ + return find_best_rational_approximation(alpha_num, d_num, denum, p, q); +} diff --git a/src/spdk/dpdk/lib/librte_sched/rte_approx.h b/src/spdk/dpdk/lib/librte_sched/rte_approx.h new file mode 100644 index 00000000..0244d98f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/rte_approx.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_APPROX_H__ +#define __INCLUDE_RTE_APPROX_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Rational Approximation + * + * Given a rational number alpha with 0 < alpha < 1 and a precision d, the goal + * is to find positive integers p, q such that alpha - d < p/q < alpha + d, and + * q is minimal. + * + ***/ + +#include + +/** + * Find best rational approximation + * + * @param alpha + * Rational number to approximate + * @param d + * Precision for the rational approximation + * @param p + * Pointer to pre-allocated space where the numerator of the rational + * approximation will be stored when operation is successful + * @param q + * Pointer to pre-allocated space where the denominator of the rational + * approximation will be stored when operation is successful + * @return + * 0 upon success, error code otherwise + */ +int rte_approx(double alpha, double d, uint32_t *p, uint32_t *q); + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_APPROX_H__ */ diff --git a/src/spdk/dpdk/lib/librte_sched/rte_red.c b/src/spdk/dpdk/lib/librte_sched/rte_red.c new file mode 100644 index 00000000..45a452f6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/rte_red.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include "rte_red.h" +#include +#include + +#ifdef __INTEL_COMPILER +#pragma warning(disable:2259) /* conversion may lose significant bits */ +#endif + +static int rte_red_init_done = 0; /**< Flag to indicate that global initialisation is done */ +uint32_t rte_red_rand_val = 0; /**< Random value cache */ +uint32_t rte_red_rand_seed = 0; /**< Seed for random number generation */ + +/** + * table[i] = log2(1-Wq) * Scale * -1 + * Wq = 1/(2^i) + */ +uint16_t rte_red_log2_1_minus_Wq[RTE_RED_WQ_LOG2_NUM]; + +/** + * table[i] = 2^(i/16) * Scale + */ +uint16_t rte_red_pow2_frac_inv[16]; + +/** + * @brief Initialize tables used to compute average + * queue size when queue is empty. 
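The tables filled in below encode, in fixed point, the identity (1 - Wq)^m = 2^(m * log2(1 - Wq)) with Wq = 1/2^wq_log2. That identity can be sanity-checked in plain floating point with the illustrative snippet below (standalone, libm only, not part of the library):

#include <math.h>
#include <stdio.h>

/* Compare the direct evaluation of (1 - Wq)^m with the logarithmic form
 * that the scaled lookup tables implement.
 */
static void
qempty_factor_check(unsigned int wq_log2, unsigned int m)
{
	double wq = 1.0 / (double)(1u << wq_log2);
	double direct = pow(1.0 - wq, (double)m);
	double via_log2 = pow(2.0, (double)m * log2(1.0 - wq));

	printf("wq_log2=%u m=%u direct=%.6f via_log2=%.6f\n",
	       wq_log2, m, direct, via_log2);
}

int main(void)
{
	qempty_factor_check(9, 100); /* (1 - 1/512)^100 is roughly 0.82 */
	return 0;
}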
+ */ +static void +__rte_red_init_tables(void) +{ + uint32_t i = 0; + double scale = 0.0; + double table_size = 0.0; + + scale = (double)(1 << RTE_RED_SCALING); + table_size = (double)(RTE_DIM(rte_red_pow2_frac_inv)); + + for (i = 0; i < RTE_DIM(rte_red_pow2_frac_inv); i++) { + double m = (double)i; + + rte_red_pow2_frac_inv[i] = (uint16_t) round(scale / pow(2, m / table_size)); + } + + scale = 1024.0; + + RTE_ASSERT(RTE_RED_WQ_LOG2_NUM == RTE_DIM(rte_red_log2_1_minus_Wq)); + + for (i = RTE_RED_WQ_LOG2_MIN; i <= RTE_RED_WQ_LOG2_MAX; i++) { + double n = (double)i; + double Wq = pow(2, -n); + uint32_t index = i - RTE_RED_WQ_LOG2_MIN; + + rte_red_log2_1_minus_Wq[index] = (uint16_t) round(-1.0 * scale * log2(1.0 - Wq)); + /** + * Table entry of zero, corresponds to a Wq of zero + * which is not valid (avg would remain constant no + * matter how long the queue is empty). So we have + * to check for zero and round up to one. + */ + if (rte_red_log2_1_minus_Wq[index] == 0) { + rte_red_log2_1_minus_Wq[index] = 1; + } + } +} + +int +rte_red_rt_data_init(struct rte_red *red) +{ + if (red == NULL) + return -1; + + red->avg = 0; + red->count = 0; + red->q_time = 0; + return 0; +} + +int +rte_red_config_init(struct rte_red_config *red_cfg, + const uint16_t wq_log2, + const uint16_t min_th, + const uint16_t max_th, + const uint16_t maxp_inv) +{ + if (red_cfg == NULL) { + return -1; + } + if (max_th > RTE_RED_MAX_TH_MAX) { + return -2; + } + if (min_th >= max_th) { + return -3; + } + if (wq_log2 > RTE_RED_WQ_LOG2_MAX) { + return -4; + } + if (wq_log2 < RTE_RED_WQ_LOG2_MIN) { + return -5; + } + if (maxp_inv < RTE_RED_MAXP_INV_MIN) { + return -6; + } + if (maxp_inv > RTE_RED_MAXP_INV_MAX) { + return -7; + } + + /** + * Initialize the RED module if not already done + */ + if (!rte_red_init_done) { + rte_red_rand_seed = rte_rand(); + rte_red_rand_val = rte_fast_rand(); + __rte_red_init_tables(); + rte_red_init_done = 1; + } + + red_cfg->min_th = ((uint32_t) min_th) << (wq_log2 + RTE_RED_SCALING); + red_cfg->max_th = ((uint32_t) max_th) << (wq_log2 + RTE_RED_SCALING); + red_cfg->pa_const = (2 * (max_th - min_th) * maxp_inv) << RTE_RED_SCALING; + red_cfg->maxp_inv = maxp_inv; + red_cfg->wq_log2 = wq_log2; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_sched/rte_red.h b/src/spdk/dpdk/lib/librte_sched/rte_red.h new file mode 100644 index 00000000..36273cac --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/rte_red.h @@ -0,0 +1,411 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __RTE_RED_H_INCLUDED__ +#define __RTE_RED_H_INCLUDED__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Random Early Detection (RED) + * + * + ***/ + +#include +#include +#include +#include +#include +#include + +#define RTE_RED_SCALING 10 /**< Fraction size for fixed-point */ +#define RTE_RED_S (1 << 22) /**< Packet size multiplied by number of leaf queues */ +#define RTE_RED_MAX_TH_MAX 1023 /**< Max threshold limit in fixed point format */ +#define RTE_RED_WQ_LOG2_MIN 1 /**< Min inverse filter weight value */ +#define RTE_RED_WQ_LOG2_MAX 12 /**< Max inverse filter weight value */ +#define RTE_RED_MAXP_INV_MIN 1 /**< Min inverse mark probability value */ +#define RTE_RED_MAXP_INV_MAX 255 /**< Max inverse mark probability value */ +#define RTE_RED_2POW16 (1<<16) /**< 2 power 16 */ +#define RTE_RED_INT16_NBITS (sizeof(uint16_t) * CHAR_BIT) +#define RTE_RED_WQ_LOG2_NUM (RTE_RED_WQ_LOG2_MAX - RTE_RED_WQ_LOG2_MIN + 1) + +/** + * Externs + * + */ +extern 
uint32_t rte_red_rand_val; +extern uint32_t rte_red_rand_seed; +extern uint16_t rte_red_log2_1_minus_Wq[RTE_RED_WQ_LOG2_NUM]; +extern uint16_t rte_red_pow2_frac_inv[16]; + +/** + * RED configuration parameters passed by user + * + */ +struct rte_red_params { + uint16_t min_th; /**< Minimum threshold for queue (max_th) */ + uint16_t max_th; /**< Maximum threshold for queue (max_th) */ + uint16_t maxp_inv; /**< Inverse of packet marking probability maximum value (maxp = 1 / maxp_inv) */ + uint16_t wq_log2; /**< Negated log2 of queue weight (wq = 1 / (2 ^ wq_log2)) */ +}; + +/** + * RED configuration parameters + */ +struct rte_red_config { + uint32_t min_th; /**< min_th scaled in fixed-point format */ + uint32_t max_th; /**< max_th scaled in fixed-point format */ + uint32_t pa_const; /**< Precomputed constant value used for pa calculation (scaled in fixed-point format) */ + uint8_t maxp_inv; /**< maxp_inv */ + uint8_t wq_log2; /**< wq_log2 */ +}; + +/** + * RED run-time data + */ +struct rte_red { + uint32_t avg; /**< Average queue size (avg), scaled in fixed-point format */ + uint32_t count; /**< Number of packets since last marked packet (count) */ + uint64_t q_time; /**< Start of the queue idle time (q_time) */ +}; + +/** + * @brief Initialises run-time data + * + * @param red [in,out] data pointer to RED runtime data + * + * @return Operation status + * @retval 0 success + * @retval !0 error + */ +int +rte_red_rt_data_init(struct rte_red *red); + +/** + * @brief Configures a single RED configuration parameter structure. + * + * @param red_cfg [in,out] config pointer to a RED configuration parameter structure + * @param wq_log2 [in] log2 of the filter weight, valid range is: + * RTE_RED_WQ_LOG2_MIN <= wq_log2 <= RTE_RED_WQ_LOG2_MAX + * @param min_th [in] queue minimum threshold in number of packets + * @param max_th [in] queue maximum threshold in number of packets + * @param maxp_inv [in] inverse maximum mark probability + * + * @return Operation status + * @retval 0 success + * @retval !0 error + */ +int +rte_red_config_init(struct rte_red_config *red_cfg, + const uint16_t wq_log2, + const uint16_t min_th, + const uint16_t max_th, + const uint16_t maxp_inv); + +/** + * @brief Generate random number for RED + * + * Implementation based on: + * http://software.intel.com/en-us/articles/fast-random-number-generator-on-the-intel-pentiumr-4-processor/ + * + * 10 bit shift has been found through empirical tests (was 16). + * + * @return Random number between 0 and (2^22 - 1) + */ +static inline uint32_t +rte_fast_rand(void) +{ + rte_red_rand_seed = (214013 * rte_red_rand_seed) + 2531011; + return rte_red_rand_seed >> 10; +} + +/** + * @brief calculate factor to scale average queue size when queue + * becomes empty + * + * @param wq_log2 [in] where EWMA filter weight wq = 1/(2 ^ wq_log2) + * @param m [in] exponent in the computed value (1 - wq) ^ m + * + * @return computed value + * @retval ((1 - wq) ^ m) scaled in fixed-point format + */ +static inline uint16_t +__rte_red_calc_qempty_factor(uint8_t wq_log2, uint16_t m) +{ + uint32_t n = 0; + uint32_t f = 0; + + /** + * Basic math tells us that: + * a^b = 2^(b * log2(a) ) + * + * in our case: + * a = (1-Wq) + * b = m + * Wq = 1/ (2^log2n) + * + * So we are computing this equation: + * factor = 2 ^ ( m * log2(1-Wq)) + * + * First we are computing: + * n = m * log2(1-Wq) + * + * To avoid dealing with signed numbers log2 values are positive + * but they should be negative because (1-Wq) is always < 1. 
+ * Contents of log2 table values are also scaled for precision. + */ + + n = m * rte_red_log2_1_minus_Wq[wq_log2 - RTE_RED_WQ_LOG2_MIN]; + + /** + * The tricky part is computing 2^n, for this I split n into + * integer part and fraction part. + * f - is fraction part of n + * n - is integer part of original n + * + * Now using basic math we compute 2^n: + * 2^(f+n) = 2^f * 2^n + * 2^f - we use lookup table + * 2^n - can be replaced with bit shift right operations + */ + + f = (n >> 6) & 0xf; + n >>= 10; + + if (n < RTE_RED_SCALING) + return (uint16_t) ((rte_red_pow2_frac_inv[f] + (1 << (n - 1))) >> n); + + return 0; +} + +/** + * @brief Updates queue average in condition when queue is empty + * + * Note: packet is never dropped in this particular case. + * + * @param red_cfg [in] config pointer to a RED configuration parameter structure + * @param red [in,out] data pointer to RED runtime data + * @param time [in] current time stamp + * + * @return Operation status + * @retval 0 enqueue the packet + * @retval 1 drop the packet based on max threshold criterion + * @retval 2 drop the packet based on mark probability criterion + */ +static inline int +rte_red_enqueue_empty(const struct rte_red_config *red_cfg, + struct rte_red *red, + const uint64_t time) +{ + uint64_t time_diff = 0, m = 0; + + RTE_ASSERT(red_cfg != NULL); + RTE_ASSERT(red != NULL); + + red->count ++; + + /** + * We compute avg but we don't compare avg against + * min_th or max_th, nor calculate drop probability + */ + time_diff = time - red->q_time; + + /** + * m is the number of packets that might have arrived while the queue was empty. + * In this case we have time stamps provided by scheduler in byte units (bytes + * transmitted on network port). Such time stamp translates into time units as + * port speed is fixed but such approach simplifies the code. + */ + m = time_diff / RTE_RED_S; + + /** + * Check that m will fit into 16-bit unsigned integer + */ + if (m >= RTE_RED_2POW16) { + red->avg = 0; + } else { + red->avg = (red->avg >> RTE_RED_SCALING) * __rte_red_calc_qempty_factor(red_cfg->wq_log2, (uint16_t) m); + } + + return 0; +} + +/** + * Drop probability (Sally Floyd and Van Jacobson): + * + * pb = (1 / maxp_inv) * (avg - min_th) / (max_th - min_th) + * pa = pb / (2 - count * pb) + * + * + * (1 / maxp_inv) * (avg - min_th) + * --------------------------------- + * max_th - min_th + * pa = ----------------------------------------------- + * count * (1 / maxp_inv) * (avg - min_th) + * 2 - ----------------------------------------- + * max_th - min_th + * + * + * avg - min_th + * pa = ----------------------------------------------------------- + * 2 * (max_th - min_th) * maxp_inv - count * (avg - min_th) + * + * + * We define pa_const as: pa_const = 2 * (max_th - min_th) * maxp_inv. 
Then: + * + * + * avg - min_th + * pa = ----------------------------------- + * pa_const - count * (avg - min_th) + */ + +/** + * @brief make a decision to drop or enqueue a packet based on mark probability + * criteria + * + * @param red_cfg [in] config pointer to structure defining RED parameters + * @param red [in,out] data pointer to RED runtime data + * + * @return operation status + * @retval 0 enqueue the packet + * @retval 1 drop the packet + */ +static inline int +__rte_red_drop(const struct rte_red_config *red_cfg, struct rte_red *red) +{ + uint32_t pa_num = 0; /* numerator of drop-probability */ + uint32_t pa_den = 0; /* denominator of drop-probability */ + uint32_t pa_num_count = 0; + + pa_num = (red->avg - red_cfg->min_th) >> (red_cfg->wq_log2); + + pa_num_count = red->count * pa_num; + + if (red_cfg->pa_const <= pa_num_count) + return 1; + + pa_den = red_cfg->pa_const - pa_num_count; + + /* If drop, generate and save random number to be used next time */ + if (unlikely((rte_red_rand_val % pa_den) < pa_num)) { + rte_red_rand_val = rte_fast_rand(); + + return 1; + } + + /* No drop */ + return 0; +} + +/** + * @brief Decides if new packet should be enqeued or dropped in queue non-empty case + * + * @param red_cfg [in] config pointer to a RED configuration parameter structure + * @param red [in,out] data pointer to RED runtime data + * @param q [in] current queue size (measured in packets) + * + * @return Operation status + * @retval 0 enqueue the packet + * @retval 1 drop the packet based on max threshold criterion + * @retval 2 drop the packet based on mark probability criterion + */ +static inline int +rte_red_enqueue_nonempty(const struct rte_red_config *red_cfg, + struct rte_red *red, + const unsigned q) +{ + RTE_ASSERT(red_cfg != NULL); + RTE_ASSERT(red != NULL); + + /** + * EWMA filter (Sally Floyd and Van Jacobson): + * avg = (1 - wq) * avg + wq * q + * avg = avg + q * wq - avg * wq + * + * We select: wq = 2^(-n). Let scaled version of avg be: avg_s = avg * 2^(N+n). We get: + * avg_s = avg_s + q * 2^N - avg_s * 2^(-n) + * + * By using shift left/right operations, we get: + * avg_s = avg_s + (q << N) - (avg_s >> n) + * avg_s += (q << N) - (avg_s >> n) + */ + + /* avg update */ + red->avg += (q << RTE_RED_SCALING) - (red->avg >> red_cfg->wq_log2); + + /* avg < min_th: do not mark the packet */ + if (red->avg < red_cfg->min_th) { + red->count ++; + return 0; + } + + /* min_th <= avg < max_th: mark the packet with pa probability */ + if (red->avg < red_cfg->max_th) { + if (!__rte_red_drop(red_cfg, red)) { + red->count ++; + return 0; + } + + red->count = 0; + return 2; + } + + /* max_th <= avg: always mark the packet */ + red->count = 0; + return 1; +} + +/** + * @brief Decides if new packet should be enqeued or dropped + * Updates run time data based on new queue size value. + * Based on new queue average and RED configuration parameters + * gives verdict whether to enqueue or drop the packet. 
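As a worked example of the rearranged formula above (threshold values chosen for illustration, not defaults of this library): with min_th = 32, max_th = 128 and maxp_inv = 10, pa_const = 2 * 96 * 10 = 1920; an average queue size of 80 packets and count = 20 gives pa = 48 / (1920 - 20 * 48) = 48/960 = 0.05, i.e. roughly one packet in twenty is marked. The sketch below repeats the same integer test that __rte_red_drop() applies, with the fixed-point scaling left out for readability:

#include <stdint.h>
#include <stdio.h>

/* Worked RED mark probability, unscaled. Assumes min_th <= avg < max_th,
 * which is the only case in which __rte_red_drop() is reached.
 */
static void
red_pa_demo(uint32_t avg, uint32_t count, uint32_t rand_val)
{
	const uint32_t min_th = 32, max_th = 128, maxp_inv = 10;
	const uint32_t pa_const = 2 * (max_th - min_th) * maxp_inv; /* 1920 */
	uint32_t pa_num, pa_den;

	pa_num = avg - min_th;              /* avg = 80  -> 48  */
	if (count * pa_num >= pa_const) {   /* same guard as __rte_red_drop() */
		printf("always drop\n");
		return;
	}
	pa_den = pa_const - count * pa_num; /* count = 20 -> 960 */

	/* Bernoulli trial as in __rte_red_drop(): drop when the random value,
	 * reduced modulo pa_den, falls below pa_num (48/960 = 0.05).
	 */
	printf("pa = %u/%u -> %s\n", pa_num, pa_den,
	       (rand_val % pa_den) < pa_num ? "drop" : "enqueue");
}

int main(void)
{
	red_pa_demo(80, 20, 123456u);
	return 0;
}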
+ * + * @param red_cfg [in] config pointer to a RED configuration parameter structure + * @param red [in,out] data pointer to RED runtime data + * @param q [in] updated queue size in packets + * @param time [in] current time stamp + * + * @return Operation status + * @retval 0 enqueue the packet + * @retval 1 drop the packet based on max threshold criteria + * @retval 2 drop the packet based on mark probability criteria + */ +static inline int +rte_red_enqueue(const struct rte_red_config *red_cfg, + struct rte_red *red, + const unsigned q, + const uint64_t time) +{ + RTE_ASSERT(red_cfg != NULL); + RTE_ASSERT(red != NULL); + + if (q != 0) { + return rte_red_enqueue_nonempty(red_cfg, red, q); + } else { + return rte_red_enqueue_empty(red_cfg, red, time); + } +} + +/** + * @brief Callback to records time that queue became empty + * + * @param red [in,out] data pointer to RED runtime data + * @param time [in] current time stamp + */ +static inline void +rte_red_mark_queue_empty(struct rte_red *red, const uint64_t time) +{ + red->q_time = time; +} + +#ifdef __cplusplus +} +#endif + +#endif /* __RTE_RED_H_INCLUDED__ */ diff --git a/src/spdk/dpdk/lib/librte_sched/rte_sched.c b/src/spdk/dpdk/lib/librte_sched/rte_sched.c new file mode 100644 index 00000000..9269e5c7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/rte_sched.c @@ -0,0 +1,2242 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_sched.h" +#include "rte_sched_common.h" +#include "rte_approx.h" + +#ifdef __INTEL_COMPILER +#pragma warning(disable:2259) /* conversion may lose significant bits */ +#endif + +#ifdef RTE_SCHED_VECTOR +#include + +#ifdef RTE_ARCH_X86 +#define SCHED_VECTOR_SSE4 +#elif defined(RTE_MACHINE_CPUFLAG_NEON) +#define SCHED_VECTOR_NEON +#endif + +#endif + +#define RTE_SCHED_TB_RATE_CONFIG_ERR (1e-7) +#define RTE_SCHED_WRR_SHIFT 3 +#define RTE_SCHED_GRINDER_PCACHE_SIZE (64 / RTE_SCHED_QUEUES_PER_PIPE) +#define RTE_SCHED_PIPE_INVALID UINT32_MAX +#define RTE_SCHED_BMP_POS_INVALID UINT32_MAX + +/* Scaling for cycles_per_byte calculation + * Chosen so that minimum rate is 480 bit/sec + */ +#define RTE_SCHED_TIME_SHIFT 8 + +struct rte_sched_subport { + /* Token bucket (TB) */ + uint64_t tb_time; /* time of last update */ + uint32_t tb_period; + uint32_t tb_credits_per_period; + uint32_t tb_size; + uint32_t tb_credits; + + /* Traffic classes (TCs) */ + uint64_t tc_time; /* time of next update */ + uint32_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + uint32_t tc_period; + + /* TC oversubscription */ + uint32_t tc_ov_wm; + uint32_t tc_ov_wm_min; + uint32_t tc_ov_wm_max; + uint8_t tc_ov_period_id; + uint8_t tc_ov; + uint32_t tc_ov_n; + double tc_ov_rate; + + /* Statistics */ + struct rte_sched_subport_stats stats; +}; + +struct rte_sched_pipe_profile { + /* Token bucket (TB) */ + uint32_t tb_period; + uint32_t tb_credits_per_period; + uint32_t tb_size; + + /* Pipe traffic classes */ + uint32_t tc_period; + uint32_t tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + uint8_t tc_ov_weight; + + /* Pipe queues */ + uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_PIPE]; +}; + +struct rte_sched_pipe { + /* Token bucket (TB) */ + uint64_t tb_time; /* time of last update */ + uint32_t tb_credits; + + /* Pipe profile and flags */ + uint32_t profile; + + /* Traffic classes (TCs) */ 
+ uint64_t tc_time; /* time of next update */ + uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + + /* Weighted Round Robin (WRR) */ + uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; + + /* TC oversubscription */ + uint32_t tc_ov_credits; + uint8_t tc_ov_period_id; + uint8_t reserved[3]; +} __rte_cache_aligned; + +struct rte_sched_queue { + uint16_t qw; + uint16_t qr; +}; + +struct rte_sched_queue_extra { + struct rte_sched_queue_stats stats; +#ifdef RTE_SCHED_RED + struct rte_red red; +#endif +}; + +enum grinder_state { + e_GRINDER_PREFETCH_PIPE = 0, + e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS, + e_GRINDER_PREFETCH_MBUF, + e_GRINDER_READ_MBUF +}; + +/* + * Path through the scheduler hierarchy used by the scheduler enqueue + * operation to identify the destination queue for the current + * packet. Stored in the field pkt.hash.sched of struct rte_mbuf of + * each packet, typically written by the classification stage and read + * by scheduler enqueue. + */ +struct rte_sched_port_hierarchy { + uint16_t queue:2; /**< Queue ID (0 .. 3) */ + uint16_t traffic_class:2; /**< Traffic class ID (0 .. 3)*/ + uint32_t color:2; /**< Color */ + uint16_t unused:10; + uint16_t subport; /**< Subport ID */ + uint32_t pipe; /**< Pipe ID */ +}; + +struct rte_sched_grinder { + /* Pipe cache */ + uint16_t pcache_qmask[RTE_SCHED_GRINDER_PCACHE_SIZE]; + uint32_t pcache_qindex[RTE_SCHED_GRINDER_PCACHE_SIZE]; + uint32_t pcache_w; + uint32_t pcache_r; + + /* Current pipe */ + enum grinder_state state; + uint32_t productive; + uint32_t pindex; + struct rte_sched_subport *subport; + struct rte_sched_pipe *pipe; + struct rte_sched_pipe_profile *pipe_params; + + /* TC cache */ + uint8_t tccache_qmask[4]; + uint32_t tccache_qindex[4]; + uint32_t tccache_w; + uint32_t tccache_r; + + /* Current TC */ + uint32_t tc_index; + struct rte_sched_queue *queue[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + struct rte_mbuf **qbase[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + uint32_t qindex[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + uint16_t qsize; + uint32_t qmask; + uint32_t qpos; + struct rte_mbuf *pkt; + + /* WRR */ + uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; + uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; + uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; +}; + +struct rte_sched_port { + /* User parameters */ + uint32_t n_subports_per_port; + uint32_t n_pipes_per_subport; + uint32_t rate; + uint32_t mtu; + uint32_t frame_overhead; + uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + uint32_t n_pipe_profiles; + uint32_t pipe_tc3_rate_max; +#ifdef RTE_SCHED_RED + struct rte_red_config red_config[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][e_RTE_METER_COLORS]; +#endif + + /* Timing */ + uint64_t time_cpu_cycles; /* Current CPU time measured in CPU cyles */ + uint64_t time_cpu_bytes; /* Current CPU time measured in bytes */ + uint64_t time; /* Current NIC TX time measured in bytes */ + struct rte_reciprocal inv_cycles_per_byte; /* CPU cycles per byte */ + + /* Scheduling loop detection */ + uint32_t pipe_loop; + uint32_t pipe_exhaustion; + + /* Bitmap */ + struct rte_bitmap *bmp; + uint32_t grinder_base_bmp_pos[RTE_SCHED_PORT_N_GRINDERS] __rte_aligned_16; + + /* Grinders */ + struct rte_sched_grinder grinder[RTE_SCHED_PORT_N_GRINDERS]; + uint32_t busy_grinders; + struct rte_mbuf **pkts_out; + uint32_t n_pkts_out; + + /* Queue base calculation */ + uint32_t qsize_add[RTE_SCHED_QUEUES_PER_PIPE]; + uint32_t qsize_sum; + + /* Large data structures */ + struct rte_sched_subport *subport; + struct rte_sched_pipe *pipe; + struct 
rte_sched_queue *queue; + struct rte_sched_queue_extra *queue_extra; + struct rte_sched_pipe_profile *pipe_profiles; + uint8_t *bmp_array; + struct rte_mbuf **queue_array; + uint8_t memory[0] __rte_cache_aligned; +} __rte_cache_aligned; + +enum rte_sched_port_array { + e_RTE_SCHED_PORT_ARRAY_SUBPORT = 0, + e_RTE_SCHED_PORT_ARRAY_PIPE, + e_RTE_SCHED_PORT_ARRAY_QUEUE, + e_RTE_SCHED_PORT_ARRAY_QUEUE_EXTRA, + e_RTE_SCHED_PORT_ARRAY_PIPE_PROFILES, + e_RTE_SCHED_PORT_ARRAY_BMP_ARRAY, + e_RTE_SCHED_PORT_ARRAY_QUEUE_ARRAY, + e_RTE_SCHED_PORT_ARRAY_TOTAL, +}; + +#ifdef RTE_SCHED_COLLECT_STATS + +static inline uint32_t +rte_sched_port_queues_per_subport(struct rte_sched_port *port) +{ + return RTE_SCHED_QUEUES_PER_PIPE * port->n_pipes_per_subport; +} + +#endif + +static inline uint32_t +rte_sched_port_queues_per_port(struct rte_sched_port *port) +{ + return RTE_SCHED_QUEUES_PER_PIPE * port->n_pipes_per_subport * port->n_subports_per_port; +} + +static inline struct rte_mbuf ** +rte_sched_port_qbase(struct rte_sched_port *port, uint32_t qindex) +{ + uint32_t pindex = qindex >> 4; + uint32_t qpos = qindex & 0xF; + + return (port->queue_array + pindex * + port->qsize_sum + port->qsize_add[qpos]); +} + +static inline uint16_t +rte_sched_port_qsize(struct rte_sched_port *port, uint32_t qindex) +{ + uint32_t tc = (qindex >> 2) & 0x3; + + return port->qsize[tc]; +} + +static int +pipe_profile_check(struct rte_sched_pipe_params *params, + uint32_t rate) +{ + uint32_t i; + + /* Pipe parameters */ + if (params == NULL) + return -10; + + /* TB rate: non-zero, not greater than port rate */ + if (params->tb_rate == 0 || + params->tb_rate > rate) + return -11; + + /* TB size: non-zero */ + if (params->tb_size == 0) + return -12; + + /* TC rate: non-zero, less than pipe rate */ + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + if (params->tc_rate[i] == 0 || + params->tc_rate[i] > params->tb_rate) + return -13; + } + + /* TC period: non-zero */ + if (params->tc_period == 0) + return -14; + +#ifdef RTE_SCHED_SUBPORT_TC_OV + /* TC3 oversubscription weight: non-zero */ + if (params->tc_ov_weight == 0) + return -15; +#endif + + /* Queue WRR weights: non-zero */ + for (i = 0; i < RTE_SCHED_QUEUES_PER_PIPE; i++) { + if (params->wrr_weights[i] == 0) + return -16; + } + + return 0; +} + +static int +rte_sched_port_check_params(struct rte_sched_port_params *params) +{ + uint32_t i; + + if (params == NULL) + return -1; + + /* socket */ + if ((params->socket < 0) || (params->socket >= RTE_MAX_NUMA_NODES)) + return -3; + + /* rate */ + if (params->rate == 0) + return -4; + + /* mtu */ + if (params->mtu == 0) + return -5; + + /* n_subports_per_port: non-zero, limited to 16 bits, power of 2 */ + if (params->n_subports_per_port == 0 || + params->n_subports_per_port > 1u << 16 || + !rte_is_power_of_2(params->n_subports_per_port)) + return -6; + + /* n_pipes_per_subport: non-zero, power of 2 */ + if (params->n_pipes_per_subport == 0 || + !rte_is_power_of_2(params->n_pipes_per_subport)) + return -7; + + /* qsize: non-zero, power of 2, + * no bigger than 32K (due to 16-bit read/write pointers) + */ + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + uint16_t qsize = params->qsize[i]; + + if (qsize == 0 || !rte_is_power_of_2(qsize)) + return -8; + } + + /* pipe_profiles and n_pipe_profiles */ + if (params->pipe_profiles == NULL || + params->n_pipe_profiles == 0 || + params->n_pipe_profiles > RTE_SCHED_PIPE_PROFILES_PER_PORT) + return -9; + + for (i = 0; i < params->n_pipe_profiles; i++) { + struct 
rte_sched_pipe_params *p = params->pipe_profiles + i; + int status; + + status = pipe_profile_check(p, params->rate); + if (status != 0) + return status; + } + + return 0; +} + +static uint32_t +rte_sched_port_get_array_base(struct rte_sched_port_params *params, enum rte_sched_port_array array) +{ + uint32_t n_subports_per_port = params->n_subports_per_port; + uint32_t n_pipes_per_subport = params->n_pipes_per_subport; + uint32_t n_pipes_per_port = n_pipes_per_subport * n_subports_per_port; + uint32_t n_queues_per_port = RTE_SCHED_QUEUES_PER_PIPE * n_pipes_per_subport * n_subports_per_port; + + uint32_t size_subport = n_subports_per_port * sizeof(struct rte_sched_subport); + uint32_t size_pipe = n_pipes_per_port * sizeof(struct rte_sched_pipe); + uint32_t size_queue = n_queues_per_port * sizeof(struct rte_sched_queue); + uint32_t size_queue_extra + = n_queues_per_port * sizeof(struct rte_sched_queue_extra); + uint32_t size_pipe_profiles + = RTE_SCHED_PIPE_PROFILES_PER_PORT * sizeof(struct rte_sched_pipe_profile); + uint32_t size_bmp_array = rte_bitmap_get_memory_footprint(n_queues_per_port); + uint32_t size_per_pipe_queue_array, size_queue_array; + + uint32_t base, i; + + size_per_pipe_queue_array = 0; + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + size_per_pipe_queue_array += RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + * params->qsize[i] * sizeof(struct rte_mbuf *); + } + size_queue_array = n_pipes_per_port * size_per_pipe_queue_array; + + base = 0; + + if (array == e_RTE_SCHED_PORT_ARRAY_SUBPORT) + return base; + base += RTE_CACHE_LINE_ROUNDUP(size_subport); + + if (array == e_RTE_SCHED_PORT_ARRAY_PIPE) + return base; + base += RTE_CACHE_LINE_ROUNDUP(size_pipe); + + if (array == e_RTE_SCHED_PORT_ARRAY_QUEUE) + return base; + base += RTE_CACHE_LINE_ROUNDUP(size_queue); + + if (array == e_RTE_SCHED_PORT_ARRAY_QUEUE_EXTRA) + return base; + base += RTE_CACHE_LINE_ROUNDUP(size_queue_extra); + + if (array == e_RTE_SCHED_PORT_ARRAY_PIPE_PROFILES) + return base; + base += RTE_CACHE_LINE_ROUNDUP(size_pipe_profiles); + + if (array == e_RTE_SCHED_PORT_ARRAY_BMP_ARRAY) + return base; + base += RTE_CACHE_LINE_ROUNDUP(size_bmp_array); + + if (array == e_RTE_SCHED_PORT_ARRAY_QUEUE_ARRAY) + return base; + base += RTE_CACHE_LINE_ROUNDUP(size_queue_array); + + return base; +} + +uint32_t +rte_sched_port_get_memory_footprint(struct rte_sched_port_params *params) +{ + uint32_t size0, size1; + int status; + + status = rte_sched_port_check_params(params); + if (status != 0) { + RTE_LOG(NOTICE, SCHED, + "Port scheduler params check failed (%d)\n", status); + + return 0; + } + + size0 = sizeof(struct rte_sched_port); + size1 = rte_sched_port_get_array_base(params, e_RTE_SCHED_PORT_ARRAY_TOTAL); + + return size0 + size1; +} + +static void +rte_sched_port_config_qsize(struct rte_sched_port *port) +{ + /* TC 0 */ + port->qsize_add[0] = 0; + port->qsize_add[1] = port->qsize_add[0] + port->qsize[0]; + port->qsize_add[2] = port->qsize_add[1] + port->qsize[0]; + port->qsize_add[3] = port->qsize_add[2] + port->qsize[0]; + + /* TC 1 */ + port->qsize_add[4] = port->qsize_add[3] + port->qsize[0]; + port->qsize_add[5] = port->qsize_add[4] + port->qsize[1]; + port->qsize_add[6] = port->qsize_add[5] + port->qsize[1]; + port->qsize_add[7] = port->qsize_add[6] + port->qsize[1]; + + /* TC 2 */ + port->qsize_add[8] = port->qsize_add[7] + port->qsize[1]; + port->qsize_add[9] = port->qsize_add[8] + port->qsize[2]; + port->qsize_add[10] = port->qsize_add[9] + port->qsize[2]; + port->qsize_add[11] = port->qsize_add[10] + 
port->qsize[2]; + + /* TC 3 */ + port->qsize_add[12] = port->qsize_add[11] + port->qsize[2]; + port->qsize_add[13] = port->qsize_add[12] + port->qsize[3]; + port->qsize_add[14] = port->qsize_add[13] + port->qsize[3]; + port->qsize_add[15] = port->qsize_add[14] + port->qsize[3]; + + port->qsize_sum = port->qsize_add[15] + port->qsize[3]; +} + +static void +rte_sched_port_log_pipe_profile(struct rte_sched_port *port, uint32_t i) +{ + struct rte_sched_pipe_profile *p = port->pipe_profiles + i; + + RTE_LOG(DEBUG, SCHED, "Low level config for pipe profile %u:\n" + " Token bucket: period = %u, credits per period = %u, size = %u\n" + " Traffic classes: period = %u, credits per period = [%u, %u, %u, %u]\n" + " Traffic class 3 oversubscription: weight = %hhu\n" + " WRR cost: [%hhu, %hhu, %hhu, %hhu], [%hhu, %hhu, %hhu, %hhu], [%hhu, %hhu, %hhu, %hhu], [%hhu, %hhu, %hhu, %hhu]\n", + i, + + /* Token bucket */ + p->tb_period, + p->tb_credits_per_period, + p->tb_size, + + /* Traffic classes */ + p->tc_period, + p->tc_credits_per_period[0], + p->tc_credits_per_period[1], + p->tc_credits_per_period[2], + p->tc_credits_per_period[3], + + /* Traffic class 3 oversubscription */ + p->tc_ov_weight, + + /* WRR */ + p->wrr_cost[ 0], p->wrr_cost[ 1], p->wrr_cost[ 2], p->wrr_cost[ 3], + p->wrr_cost[ 4], p->wrr_cost[ 5], p->wrr_cost[ 6], p->wrr_cost[ 7], + p->wrr_cost[ 8], p->wrr_cost[ 9], p->wrr_cost[10], p->wrr_cost[11], + p->wrr_cost[12], p->wrr_cost[13], p->wrr_cost[14], p->wrr_cost[15]); +} + +static inline uint64_t +rte_sched_time_ms_to_bytes(uint32_t time_ms, uint32_t rate) +{ + uint64_t time = time_ms; + + time = (time * rate) / 1000; + + return time; +} + +static void +rte_sched_pipe_profile_convert(struct rte_sched_pipe_params *src, + struct rte_sched_pipe_profile *dst, + uint32_t rate) +{ + uint32_t i; + + /* Token Bucket */ + if (src->tb_rate == rate) { + dst->tb_credits_per_period = 1; + dst->tb_period = 1; + } else { + double tb_rate = (double) src->tb_rate + / (double) rate; + double d = RTE_SCHED_TB_RATE_CONFIG_ERR; + + rte_approx(tb_rate, d, + &dst->tb_credits_per_period, &dst->tb_period); + } + + dst->tb_size = src->tb_size; + + /* Traffic Classes */ + dst->tc_period = rte_sched_time_ms_to_bytes(src->tc_period, + rate); + + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) + dst->tc_credits_per_period[i] + = rte_sched_time_ms_to_bytes(src->tc_period, + src->tc_rate[i]); + +#ifdef RTE_SCHED_SUBPORT_TC_OV + dst->tc_ov_weight = src->tc_ov_weight; +#endif + + /* WRR */ + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + uint32_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; + uint32_t lcd, lcd1, lcd2; + uint32_t qindex; + + qindex = i * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS; + + wrr_cost[0] = src->wrr_weights[qindex]; + wrr_cost[1] = src->wrr_weights[qindex + 1]; + wrr_cost[2] = src->wrr_weights[qindex + 2]; + wrr_cost[3] = src->wrr_weights[qindex + 3]; + + lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]); + lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]); + lcd = rte_get_lcd(lcd1, lcd2); + + wrr_cost[0] = lcd / wrr_cost[0]; + wrr_cost[1] = lcd / wrr_cost[1]; + wrr_cost[2] = lcd / wrr_cost[2]; + wrr_cost[3] = lcd / wrr_cost[3]; + + dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0]; + dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1]; + dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2]; + dst->wrr_cost[qindex + 3] = (uint8_t) wrr_cost[3]; + } +} + +static void +rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, + struct rte_sched_port_params *params) +{ + uint32_t i; + + for (i 
= 0; i < port->n_pipe_profiles; i++) { + struct rte_sched_pipe_params *src = params->pipe_profiles + i; + struct rte_sched_pipe_profile *dst = port->pipe_profiles + i; + + rte_sched_pipe_profile_convert(src, dst, params->rate); + rte_sched_port_log_pipe_profile(port, i); + } + + port->pipe_tc3_rate_max = 0; + for (i = 0; i < port->n_pipe_profiles; i++) { + struct rte_sched_pipe_params *src = params->pipe_profiles + i; + uint32_t pipe_tc3_rate = src->tc_rate[3]; + + if (port->pipe_tc3_rate_max < pipe_tc3_rate) + port->pipe_tc3_rate_max = pipe_tc3_rate; + } +} + +struct rte_sched_port * +rte_sched_port_config(struct rte_sched_port_params *params) +{ + struct rte_sched_port *port = NULL; + uint32_t mem_size, bmp_mem_size, n_queues_per_port, i, cycles_per_byte; + + /* Check user parameters. Determine the amount of memory to allocate */ + mem_size = rte_sched_port_get_memory_footprint(params); + if (mem_size == 0) + return NULL; + + /* Allocate memory to store the data structures */ + port = rte_zmalloc("qos_params", mem_size, RTE_CACHE_LINE_SIZE); + if (port == NULL) + return NULL; + + /* compile time checks */ + RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS == 0); + RTE_BUILD_BUG_ON(RTE_SCHED_PORT_N_GRINDERS & (RTE_SCHED_PORT_N_GRINDERS - 1)); + + /* User parameters */ + port->n_subports_per_port = params->n_subports_per_port; + port->n_pipes_per_subport = params->n_pipes_per_subport; + port->rate = params->rate; + port->mtu = params->mtu + params->frame_overhead; + port->frame_overhead = params->frame_overhead; + memcpy(port->qsize, params->qsize, sizeof(params->qsize)); + port->n_pipe_profiles = params->n_pipe_profiles; + +#ifdef RTE_SCHED_RED + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + uint32_t j; + + for (j = 0; j < e_RTE_METER_COLORS; j++) { + /* if min/max are both zero, then RED is disabled */ + if ((params->red_params[i][j].min_th | + params->red_params[i][j].max_th) == 0) { + continue; + } + + if (rte_red_config_init(&port->red_config[i][j], + params->red_params[i][j].wq_log2, + params->red_params[i][j].min_th, + params->red_params[i][j].max_th, + params->red_params[i][j].maxp_inv) != 0) { + return NULL; + } + } + } +#endif + + /* Timing */ + port->time_cpu_cycles = rte_get_tsc_cycles(); + port->time_cpu_bytes = 0; + port->time = 0; + + cycles_per_byte = (rte_get_tsc_hz() << RTE_SCHED_TIME_SHIFT) + / params->rate; + port->inv_cycles_per_byte = rte_reciprocal_value(cycles_per_byte); + + /* Scheduling loop detection */ + port->pipe_loop = RTE_SCHED_PIPE_INVALID; + port->pipe_exhaustion = 0; + + /* Grinders */ + port->busy_grinders = 0; + port->pkts_out = NULL; + port->n_pkts_out = 0; + + /* Queue base calculation */ + rte_sched_port_config_qsize(port); + + /* Large data structures */ + port->subport = (struct rte_sched_subport *) + (port->memory + rte_sched_port_get_array_base(params, + e_RTE_SCHED_PORT_ARRAY_SUBPORT)); + port->pipe = (struct rte_sched_pipe *) + (port->memory + rte_sched_port_get_array_base(params, + e_RTE_SCHED_PORT_ARRAY_PIPE)); + port->queue = (struct rte_sched_queue *) + (port->memory + rte_sched_port_get_array_base(params, + e_RTE_SCHED_PORT_ARRAY_QUEUE)); + port->queue_extra = (struct rte_sched_queue_extra *) + (port->memory + rte_sched_port_get_array_base(params, + e_RTE_SCHED_PORT_ARRAY_QUEUE_EXTRA)); + port->pipe_profiles = (struct rte_sched_pipe_profile *) + (port->memory + rte_sched_port_get_array_base(params, + e_RTE_SCHED_PORT_ARRAY_PIPE_PROFILES)); + port->bmp_array = port->memory + + rte_sched_port_get_array_base(params, 
e_RTE_SCHED_PORT_ARRAY_BMP_ARRAY); + port->queue_array = (struct rte_mbuf **) + (port->memory + rte_sched_port_get_array_base(params, + e_RTE_SCHED_PORT_ARRAY_QUEUE_ARRAY)); + + /* Pipe profile table */ + rte_sched_port_config_pipe_profile_table(port, params); + + /* Bitmap */ + n_queues_per_port = rte_sched_port_queues_per_port(port); + bmp_mem_size = rte_bitmap_get_memory_footprint(n_queues_per_port); + port->bmp = rte_bitmap_init(n_queues_per_port, port->bmp_array, + bmp_mem_size); + if (port->bmp == NULL) { + RTE_LOG(ERR, SCHED, "Bitmap init error\n"); + return NULL; + } + + for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++) + port->grinder_base_bmp_pos[i] = RTE_SCHED_PIPE_INVALID; + + + return port; +} + +void +rte_sched_port_free(struct rte_sched_port *port) +{ + uint32_t qindex; + uint32_t n_queues_per_port; + + /* Check user parameters */ + if (port == NULL) + return; + + n_queues_per_port = rte_sched_port_queues_per_port(port); + + /* Free enqueued mbufs */ + for (qindex = 0; qindex < n_queues_per_port; qindex++) { + struct rte_mbuf **mbufs = rte_sched_port_qbase(port, qindex); + uint16_t qsize = rte_sched_port_qsize(port, qindex); + struct rte_sched_queue *queue = port->queue + qindex; + uint16_t qr = queue->qr & (qsize - 1); + uint16_t qw = queue->qw & (qsize - 1); + + for (; qr != qw; qr = (qr + 1) & (qsize - 1)) + rte_pktmbuf_free(mbufs[qr]); + } + + rte_bitmap_free(port->bmp); + rte_free(port); +} + +static void +rte_sched_port_log_subport_config(struct rte_sched_port *port, uint32_t i) +{ + struct rte_sched_subport *s = port->subport + i; + + RTE_LOG(DEBUG, SCHED, "Low level config for subport %u:\n" + " Token bucket: period = %u, credits per period = %u, size = %u\n" + " Traffic classes: period = %u, credits per period = [%u, %u, %u, %u]\n" + " Traffic class 3 oversubscription: wm min = %u, wm max = %u\n", + i, + + /* Token bucket */ + s->tb_period, + s->tb_credits_per_period, + s->tb_size, + + /* Traffic classes */ + s->tc_period, + s->tc_credits_per_period[0], + s->tc_credits_per_period[1], + s->tc_credits_per_period[2], + s->tc_credits_per_period[3], + + /* Traffic class 3 oversubscription */ + s->tc_ov_wm_min, + s->tc_ov_wm_max); +} + +int +rte_sched_subport_config(struct rte_sched_port *port, + uint32_t subport_id, + struct rte_sched_subport_params *params) +{ + struct rte_sched_subport *s; + uint32_t i; + + /* Check user parameters */ + if (port == NULL || + subport_id >= port->n_subports_per_port || + params == NULL) + return -1; + + if (params->tb_rate == 0 || params->tb_rate > port->rate) + return -2; + + if (params->tb_size == 0) + return -3; + + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + if (params->tc_rate[i] == 0 || + params->tc_rate[i] > params->tb_rate) + return -4; + } + + if (params->tc_period == 0) + return -5; + + s = port->subport + subport_id; + + /* Token Bucket (TB) */ + if (params->tb_rate == port->rate) { + s->tb_credits_per_period = 1; + s->tb_period = 1; + } else { + double tb_rate = ((double) params->tb_rate) / ((double) port->rate); + double d = RTE_SCHED_TB_RATE_CONFIG_ERR; + + rte_approx(tb_rate, d, &s->tb_credits_per_period, &s->tb_period); + } + + s->tb_size = params->tb_size; + s->tb_time = port->time; + s->tb_credits = s->tb_size / 2; + + /* Traffic Classes (TCs) */ + s->tc_period = rte_sched_time_ms_to_bytes(params->tc_period, port->rate); + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { + s->tc_credits_per_period[i] + = rte_sched_time_ms_to_bytes(params->tc_period, + params->tc_rate[i]); + } + s->tc_time = 
port->time + s->tc_period; + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) + s->tc_credits[i] = s->tc_credits_per_period[i]; + +#ifdef RTE_SCHED_SUBPORT_TC_OV + /* TC oversubscription */ + s->tc_ov_wm_min = port->mtu; + s->tc_ov_wm_max = rte_sched_time_ms_to_bytes(params->tc_period, + port->pipe_tc3_rate_max); + s->tc_ov_wm = s->tc_ov_wm_max; + s->tc_ov_period_id = 0; + s->tc_ov = 0; + s->tc_ov_n = 0; + s->tc_ov_rate = 0; +#endif + + rte_sched_port_log_subport_config(port, subport_id); + + return 0; +} + +int +rte_sched_pipe_config(struct rte_sched_port *port, + uint32_t subport_id, + uint32_t pipe_id, + int32_t pipe_profile) +{ + struct rte_sched_subport *s; + struct rte_sched_pipe *p; + struct rte_sched_pipe_profile *params; + uint32_t deactivate, profile, i; + + /* Check user parameters */ + profile = (uint32_t) pipe_profile; + deactivate = (pipe_profile < 0); + + if (port == NULL || + subport_id >= port->n_subports_per_port || + pipe_id >= port->n_pipes_per_subport || + (!deactivate && profile >= port->n_pipe_profiles)) + return -1; + + + /* Check that subport configuration is valid */ + s = port->subport + subport_id; + if (s->tb_period == 0) + return -2; + + p = port->pipe + (subport_id * port->n_pipes_per_subport + pipe_id); + + /* Handle the case when pipe already has a valid configuration */ + if (p->tb_time) { + params = port->pipe_profiles + p->profile; + +#ifdef RTE_SCHED_SUBPORT_TC_OV + double subport_tc3_rate = (double) s->tc_credits_per_period[3] + / (double) s->tc_period; + double pipe_tc3_rate = (double) params->tc_credits_per_period[3] + / (double) params->tc_period; + uint32_t tc3_ov = s->tc_ov; + + /* Unplug pipe from its subport */ + s->tc_ov_n -= params->tc_ov_weight; + s->tc_ov_rate -= pipe_tc3_rate; + s->tc_ov = s->tc_ov_rate > subport_tc3_rate; + + if (s->tc_ov != tc3_ov) { + RTE_LOG(DEBUG, SCHED, + "Subport %u TC3 oversubscription is OFF (%.4lf >= %.4lf)\n", + subport_id, subport_tc3_rate, s->tc_ov_rate); + } +#endif + + /* Reset the pipe */ + memset(p, 0, sizeof(struct rte_sched_pipe)); + } + + if (deactivate) + return 0; + + /* Apply the new pipe configuration */ + p->profile = profile; + params = port->pipe_profiles + p->profile; + + /* Token Bucket (TB) */ + p->tb_time = port->time; + p->tb_credits = params->tb_size / 2; + + /* Traffic Classes (TCs) */ + p->tc_time = port->time + params->tc_period; + for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) + p->tc_credits[i] = params->tc_credits_per_period[i]; + +#ifdef RTE_SCHED_SUBPORT_TC_OV + { + /* Subport TC3 oversubscription */ + double subport_tc3_rate = (double) s->tc_credits_per_period[3] + / (double) s->tc_period; + double pipe_tc3_rate = (double) params->tc_credits_per_period[3] + / (double) params->tc_period; + uint32_t tc3_ov = s->tc_ov; + + s->tc_ov_n += params->tc_ov_weight; + s->tc_ov_rate += pipe_tc3_rate; + s->tc_ov = s->tc_ov_rate > subport_tc3_rate; + + if (s->tc_ov != tc3_ov) { + RTE_LOG(DEBUG, SCHED, + "Subport %u TC3 oversubscription is ON (%.4lf < %.4lf)\n", + subport_id, subport_tc3_rate, s->tc_ov_rate); + } + p->tc_ov_period_id = s->tc_ov_period_id; + p->tc_ov_credits = s->tc_ov_wm; + } +#endif + + return 0; +} + +int __rte_experimental +rte_sched_port_pipe_profile_add(struct rte_sched_port *port, + struct rte_sched_pipe_params *params, + uint32_t *pipe_profile_id) +{ + struct rte_sched_pipe_profile *pp; + uint32_t i; + int status; + + /* Port */ + if (port == NULL) + return -1; + + /* Pipe profiles not exceeds the max limit */ + if (port->n_pipe_profiles >= 
RTE_SCHED_PIPE_PROFILES_PER_PORT) + return -2; + + /* Pipe params */ + status = pipe_profile_check(params, port->rate); + if (status != 0) + return status; + + pp = &port->pipe_profiles[port->n_pipe_profiles]; + rte_sched_pipe_profile_convert(params, pp, port->rate); + + /* Pipe profile not exists */ + for (i = 0; i < port->n_pipe_profiles; i++) + if (memcmp(port->pipe_profiles + i, pp, sizeof(*pp)) == 0) + return -3; + + /* Pipe profile commit */ + *pipe_profile_id = port->n_pipe_profiles; + port->n_pipe_profiles++; + + if (port->pipe_tc3_rate_max < params->tc_rate[3]) + port->pipe_tc3_rate_max = params->tc_rate[3]; + + rte_sched_port_log_pipe_profile(port, *pipe_profile_id); + + return 0; +} + +void +rte_sched_port_pkt_write(struct rte_mbuf *pkt, + uint32_t subport, uint32_t pipe, uint32_t traffic_class, + uint32_t queue, enum rte_meter_color color) +{ + struct rte_sched_port_hierarchy *sched + = (struct rte_sched_port_hierarchy *) &pkt->hash.sched; + + RTE_BUILD_BUG_ON(sizeof(*sched) > sizeof(pkt->hash.sched)); + + sched->color = (uint32_t) color; + sched->subport = subport; + sched->pipe = pipe; + sched->traffic_class = traffic_class; + sched->queue = queue; +} + +void +rte_sched_port_pkt_read_tree_path(const struct rte_mbuf *pkt, + uint32_t *subport, uint32_t *pipe, + uint32_t *traffic_class, uint32_t *queue) +{ + const struct rte_sched_port_hierarchy *sched + = (const struct rte_sched_port_hierarchy *) &pkt->hash.sched; + + *subport = sched->subport; + *pipe = sched->pipe; + *traffic_class = sched->traffic_class; + *queue = sched->queue; +} + +enum rte_meter_color +rte_sched_port_pkt_read_color(const struct rte_mbuf *pkt) +{ + const struct rte_sched_port_hierarchy *sched + = (const struct rte_sched_port_hierarchy *) &pkt->hash.sched; + + return (enum rte_meter_color) sched->color; +} + +int +rte_sched_subport_read_stats(struct rte_sched_port *port, + uint32_t subport_id, + struct rte_sched_subport_stats *stats, + uint32_t *tc_ov) +{ + struct rte_sched_subport *s; + + /* Check user parameters */ + if (port == NULL || subport_id >= port->n_subports_per_port || + stats == NULL || tc_ov == NULL) + return -1; + + s = port->subport + subport_id; + + /* Copy subport stats and clear */ + memcpy(stats, &s->stats, sizeof(struct rte_sched_subport_stats)); + memset(&s->stats, 0, sizeof(struct rte_sched_subport_stats)); + + /* Subport TC oversubscription status */ + *tc_ov = s->tc_ov; + + return 0; +} + +int +rte_sched_queue_read_stats(struct rte_sched_port *port, + uint32_t queue_id, + struct rte_sched_queue_stats *stats, + uint16_t *qlen) +{ + struct rte_sched_queue *q; + struct rte_sched_queue_extra *qe; + + /* Check user parameters */ + if ((port == NULL) || + (queue_id >= rte_sched_port_queues_per_port(port)) || + (stats == NULL) || + (qlen == NULL)) { + return -1; + } + q = port->queue + queue_id; + qe = port->queue_extra + queue_id; + + /* Copy queue stats and clear */ + memcpy(stats, &qe->stats, sizeof(struct rte_sched_queue_stats)); + memset(&qe->stats, 0, sizeof(struct rte_sched_queue_stats)); + + /* Queue length */ + *qlen = q->qw - q->qr; + + return 0; +} + +static inline uint32_t +rte_sched_port_qindex(struct rte_sched_port *port, uint32_t subport, uint32_t pipe, uint32_t traffic_class, uint32_t queue) +{ + uint32_t result; + + result = subport * port->n_pipes_per_subport + pipe; + result = result * RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE + traffic_class; + result = result * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue; + + return result; +} + +#ifdef RTE_SCHED_DEBUG + +static inline int 
+rte_sched_port_queue_is_empty(struct rte_sched_port *port, uint32_t qindex) +{ + struct rte_sched_queue *queue = port->queue + qindex; + + return queue->qr == queue->qw; +} + +#endif /* RTE_SCHED_DEBUG */ + +#ifdef RTE_SCHED_COLLECT_STATS + +static inline void +rte_sched_port_update_subport_stats(struct rte_sched_port *port, uint32_t qindex, struct rte_mbuf *pkt) +{ + struct rte_sched_subport *s = port->subport + (qindex / rte_sched_port_queues_per_subport(port)); + uint32_t tc_index = (qindex >> 2) & 0x3; + uint32_t pkt_len = pkt->pkt_len; + + s->stats.n_pkts_tc[tc_index] += 1; + s->stats.n_bytes_tc[tc_index] += pkt_len; +} + +#ifdef RTE_SCHED_RED +static inline void +rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port, + uint32_t qindex, + struct rte_mbuf *pkt, uint32_t red) +#else +static inline void +rte_sched_port_update_subport_stats_on_drop(struct rte_sched_port *port, + uint32_t qindex, + struct rte_mbuf *pkt, __rte_unused uint32_t red) +#endif +{ + struct rte_sched_subport *s = port->subport + (qindex / rte_sched_port_queues_per_subport(port)); + uint32_t tc_index = (qindex >> 2) & 0x3; + uint32_t pkt_len = pkt->pkt_len; + + s->stats.n_pkts_tc_dropped[tc_index] += 1; + s->stats.n_bytes_tc_dropped[tc_index] += pkt_len; +#ifdef RTE_SCHED_RED + s->stats.n_pkts_red_dropped[tc_index] += red; +#endif +} + +static inline void +rte_sched_port_update_queue_stats(struct rte_sched_port *port, uint32_t qindex, struct rte_mbuf *pkt) +{ + struct rte_sched_queue_extra *qe = port->queue_extra + qindex; + uint32_t pkt_len = pkt->pkt_len; + + qe->stats.n_pkts += 1; + qe->stats.n_bytes += pkt_len; +} + +#ifdef RTE_SCHED_RED +static inline void +rte_sched_port_update_queue_stats_on_drop(struct rte_sched_port *port, + uint32_t qindex, + struct rte_mbuf *pkt, uint32_t red) +#else +static inline void +rte_sched_port_update_queue_stats_on_drop(struct rte_sched_port *port, + uint32_t qindex, + struct rte_mbuf *pkt, __rte_unused uint32_t red) +#endif +{ + struct rte_sched_queue_extra *qe = port->queue_extra + qindex; + uint32_t pkt_len = pkt->pkt_len; + + qe->stats.n_pkts_dropped += 1; + qe->stats.n_bytes_dropped += pkt_len; +#ifdef RTE_SCHED_RED + qe->stats.n_pkts_red_dropped += red; +#endif +} + +#endif /* RTE_SCHED_COLLECT_STATS */ + +#ifdef RTE_SCHED_RED + +static inline int +rte_sched_port_red_drop(struct rte_sched_port *port, struct rte_mbuf *pkt, uint32_t qindex, uint16_t qlen) +{ + struct rte_sched_queue_extra *qe; + struct rte_red_config *red_cfg; + struct rte_red *red; + uint32_t tc_index; + enum rte_meter_color color; + + tc_index = (qindex >> 2) & 0x3; + color = rte_sched_port_pkt_read_color(pkt); + red_cfg = &port->red_config[tc_index][color]; + + if ((red_cfg->min_th | red_cfg->max_th) == 0) + return 0; + + qe = port->queue_extra + qindex; + red = &qe->red; + + return rte_red_enqueue(red_cfg, red, qlen, port->time); +} + +static inline void +rte_sched_port_set_queue_empty_timestamp(struct rte_sched_port *port, uint32_t qindex) +{ + struct rte_sched_queue_extra *qe = port->queue_extra + qindex; + struct rte_red *red = &qe->red; + + rte_red_mark_queue_empty(red, port->time); +} + +#else + +#define rte_sched_port_red_drop(port, pkt, qindex, qlen) 0 + +#define rte_sched_port_set_queue_empty_timestamp(port, qindex) + +#endif /* RTE_SCHED_RED */ + +#ifdef RTE_SCHED_DEBUG + +static inline void +debug_check_queue_slab(struct rte_sched_port *port, uint32_t bmp_pos, + uint64_t bmp_slab) +{ + uint64_t mask; + uint32_t i, panic; + + if (bmp_slab == 0) + rte_panic("Empty slab at 
position %u\n", bmp_pos); + + panic = 0; + for (i = 0, mask = 1; i < 64; i++, mask <<= 1) { + if (mask & bmp_slab) { + if (rte_sched_port_queue_is_empty(port, bmp_pos + i)) { + printf("Queue %u (slab offset %u) is empty\n", bmp_pos + i, i); + panic = 1; + } + } + } + + if (panic) + rte_panic("Empty queues in slab 0x%" PRIx64 "starting at position %u\n", + bmp_slab, bmp_pos); +} + +#endif /* RTE_SCHED_DEBUG */ + +static inline uint32_t +rte_sched_port_enqueue_qptrs_prefetch0(struct rte_sched_port *port, + struct rte_mbuf *pkt) +{ + struct rte_sched_queue *q; +#ifdef RTE_SCHED_COLLECT_STATS + struct rte_sched_queue_extra *qe; +#endif + uint32_t subport, pipe, traffic_class, queue, qindex; + + rte_sched_port_pkt_read_tree_path(pkt, &subport, &pipe, &traffic_class, &queue); + + qindex = rte_sched_port_qindex(port, subport, pipe, traffic_class, queue); + q = port->queue + qindex; + rte_prefetch0(q); +#ifdef RTE_SCHED_COLLECT_STATS + qe = port->queue_extra + qindex; + rte_prefetch0(qe); +#endif + + return qindex; +} + +static inline void +rte_sched_port_enqueue_qwa_prefetch0(struct rte_sched_port *port, + uint32_t qindex, struct rte_mbuf **qbase) +{ + struct rte_sched_queue *q; + struct rte_mbuf **q_qw; + uint16_t qsize; + + q = port->queue + qindex; + qsize = rte_sched_port_qsize(port, qindex); + q_qw = qbase + (q->qw & (qsize - 1)); + + rte_prefetch0(q_qw); + rte_bitmap_prefetch0(port->bmp, qindex); +} + +static inline int +rte_sched_port_enqueue_qwa(struct rte_sched_port *port, uint32_t qindex, + struct rte_mbuf **qbase, struct rte_mbuf *pkt) +{ + struct rte_sched_queue *q; + uint16_t qsize; + uint16_t qlen; + + q = port->queue + qindex; + qsize = rte_sched_port_qsize(port, qindex); + qlen = q->qw - q->qr; + + /* Drop the packet (and update drop stats) when queue is full */ + if (unlikely(rte_sched_port_red_drop(port, pkt, qindex, qlen) || + (qlen >= qsize))) { + rte_pktmbuf_free(pkt); +#ifdef RTE_SCHED_COLLECT_STATS + rte_sched_port_update_subport_stats_on_drop(port, qindex, pkt, + qlen < qsize); + rte_sched_port_update_queue_stats_on_drop(port, qindex, pkt, + qlen < qsize); +#endif + return 0; + } + + /* Enqueue packet */ + qbase[q->qw & (qsize - 1)] = pkt; + q->qw++; + + /* Activate queue in the port bitmap */ + rte_bitmap_set(port->bmp, qindex); + + /* Statistics */ +#ifdef RTE_SCHED_COLLECT_STATS + rte_sched_port_update_subport_stats(port, qindex, pkt); + rte_sched_port_update_queue_stats(port, qindex, pkt); +#endif + + return 1; +} + + +/* + * The enqueue function implements a 4-level pipeline with each stage + * processing two different packets. The purpose of using a pipeline + * is to hide the latency of prefetching the data structures. 
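A minimal caller-side sketch of how classification feeds this enqueue path (subport/pipe/traffic-class/queue values are placeholders that would normally come from a real classifier; the e_RTE_METER_GREEN constant is assumed to come from rte_meter.h):

#include <stdint.h>

#include <rte_mbuf.h>
#include <rte_meter.h>

#include "rte_sched.h"

/* Hypothetical helper: tag a burst with its hierarchy path, then enqueue. */
static int
sched_enqueue_burst(struct rte_sched_port *port,
		    struct rte_mbuf **pkts, uint32_t n_pkts)
{
	uint32_t i;

	for (i = 0; i < n_pkts; i++)
		/* Write subport 0, pipe 0, TC 0, queue 0 into pkt->hash.sched */
		rte_sched_port_pkt_write(pkts[i], 0, 0, 0, 0,
					 e_RTE_METER_GREEN);

	/* Returns the number of packets accepted; packets dropped because of
	 * a full queue or RED are freed inside the enqueue path.
	 */
	return rte_sched_port_enqueue(port, pkts, n_pkts);
}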
The + * naming convention is presented in the diagram below: + * + * p00 _______ p10 _______ p20 _______ p30 _______ + * ----->| |----->| |----->| |----->| |-----> + * | 0 | | 1 | | 2 | | 3 | + * ----->|_______|----->|_______|----->|_______|----->|_______|-----> + * p01 p11 p21 p31 + * + */ +int +rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts, + uint32_t n_pkts) +{ + struct rte_mbuf *pkt00, *pkt01, *pkt10, *pkt11, *pkt20, *pkt21, + *pkt30, *pkt31, *pkt_last; + struct rte_mbuf **q00_base, **q01_base, **q10_base, **q11_base, + **q20_base, **q21_base, **q30_base, **q31_base, **q_last_base; + uint32_t q00, q01, q10, q11, q20, q21, q30, q31, q_last; + uint32_t r00, r01, r10, r11, r20, r21, r30, r31, r_last; + uint32_t result, i; + + result = 0; + + /* + * Less then 6 input packets available, which is not enough to + * feed the pipeline + */ + if (unlikely(n_pkts < 6)) { + struct rte_mbuf **q_base[5]; + uint32_t q[5]; + + /* Prefetch the mbuf structure of each packet */ + for (i = 0; i < n_pkts; i++) + rte_prefetch0(pkts[i]); + + /* Prefetch the queue structure for each queue */ + for (i = 0; i < n_pkts; i++) + q[i] = rte_sched_port_enqueue_qptrs_prefetch0(port, + pkts[i]); + + /* Prefetch the write pointer location of each queue */ + for (i = 0; i < n_pkts; i++) { + q_base[i] = rte_sched_port_qbase(port, q[i]); + rte_sched_port_enqueue_qwa_prefetch0(port, q[i], + q_base[i]); + } + + /* Write each packet to its queue */ + for (i = 0; i < n_pkts; i++) + result += rte_sched_port_enqueue_qwa(port, q[i], + q_base[i], pkts[i]); + + return result; + } + + /* Feed the first 3 stages of the pipeline (6 packets needed) */ + pkt20 = pkts[0]; + pkt21 = pkts[1]; + rte_prefetch0(pkt20); + rte_prefetch0(pkt21); + + pkt10 = pkts[2]; + pkt11 = pkts[3]; + rte_prefetch0(pkt10); + rte_prefetch0(pkt11); + + q20 = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt20); + q21 = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt21); + + pkt00 = pkts[4]; + pkt01 = pkts[5]; + rte_prefetch0(pkt00); + rte_prefetch0(pkt01); + + q10 = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt10); + q11 = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt11); + + q20_base = rte_sched_port_qbase(port, q20); + q21_base = rte_sched_port_qbase(port, q21); + rte_sched_port_enqueue_qwa_prefetch0(port, q20, q20_base); + rte_sched_port_enqueue_qwa_prefetch0(port, q21, q21_base); + + /* Run the pipeline */ + for (i = 6; i < (n_pkts & (~1)); i += 2) { + /* Propagate stage inputs */ + pkt30 = pkt20; + pkt31 = pkt21; + pkt20 = pkt10; + pkt21 = pkt11; + pkt10 = pkt00; + pkt11 = pkt01; + q30 = q20; + q31 = q21; + q20 = q10; + q21 = q11; + q30_base = q20_base; + q31_base = q21_base; + + /* Stage 0: Get packets in */ + pkt00 = pkts[i]; + pkt01 = pkts[i + 1]; + rte_prefetch0(pkt00); + rte_prefetch0(pkt01); + + /* Stage 1: Prefetch queue structure storing queue pointers */ + q10 = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt10); + q11 = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt11); + + /* Stage 2: Prefetch queue write location */ + q20_base = rte_sched_port_qbase(port, q20); + q21_base = rte_sched_port_qbase(port, q21); + rte_sched_port_enqueue_qwa_prefetch0(port, q20, q20_base); + rte_sched_port_enqueue_qwa_prefetch0(port, q21, q21_base); + + /* Stage 3: Write packet to queue and activate queue */ + r30 = rte_sched_port_enqueue_qwa(port, q30, q30_base, pkt30); + r31 = rte_sched_port_enqueue_qwa(port, q31, q31_base, pkt31); + result += r30 + r31; + } + + /* + * Drain the pipeline (exactly 6 packets). 
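+ * The six packets still in flight (pkt00/01, pkt10/11 and pkt20/21) are
+ * pushed through the remaining stages below, interleaved with the
+ * prefetches required for the trailing packet.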
+ * Handle the last packet in the case + * of an odd number of input packets. + */ + pkt_last = pkts[n_pkts - 1]; + rte_prefetch0(pkt_last); + + q00 = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt00); + q01 = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt01); + + q10_base = rte_sched_port_qbase(port, q10); + q11_base = rte_sched_port_qbase(port, q11); + rte_sched_port_enqueue_qwa_prefetch0(port, q10, q10_base); + rte_sched_port_enqueue_qwa_prefetch0(port, q11, q11_base); + + r20 = rte_sched_port_enqueue_qwa(port, q20, q20_base, pkt20); + r21 = rte_sched_port_enqueue_qwa(port, q21, q21_base, pkt21); + result += r20 + r21; + + q_last = rte_sched_port_enqueue_qptrs_prefetch0(port, pkt_last); + + q00_base = rte_sched_port_qbase(port, q00); + q01_base = rte_sched_port_qbase(port, q01); + rte_sched_port_enqueue_qwa_prefetch0(port, q00, q00_base); + rte_sched_port_enqueue_qwa_prefetch0(port, q01, q01_base); + + r10 = rte_sched_port_enqueue_qwa(port, q10, q10_base, pkt10); + r11 = rte_sched_port_enqueue_qwa(port, q11, q11_base, pkt11); + result += r10 + r11; + + q_last_base = rte_sched_port_qbase(port, q_last); + rte_sched_port_enqueue_qwa_prefetch0(port, q_last, q_last_base); + + r00 = rte_sched_port_enqueue_qwa(port, q00, q00_base, pkt00); + r01 = rte_sched_port_enqueue_qwa(port, q01, q01_base, pkt01); + result += r00 + r01; + + if (n_pkts & 1) { + r_last = rte_sched_port_enqueue_qwa(port, q_last, q_last_base, pkt_last); + result += r_last; + } + + return result; +} + +#ifndef RTE_SCHED_SUBPORT_TC_OV + +static inline void +grinder_credits_update(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_sched_subport *subport = grinder->subport; + struct rte_sched_pipe *pipe = grinder->pipe; + struct rte_sched_pipe_profile *params = grinder->pipe_params; + uint64_t n_periods; + + /* Subport TB */ + n_periods = (port->time - subport->tb_time) / subport->tb_period; + subport->tb_credits += n_periods * subport->tb_credits_per_period; + subport->tb_credits = rte_sched_min_val_2_u32(subport->tb_credits, subport->tb_size); + subport->tb_time += n_periods * subport->tb_period; + + /* Pipe TB */ + n_periods = (port->time - pipe->tb_time) / params->tb_period; + pipe->tb_credits += n_periods * params->tb_credits_per_period; + pipe->tb_credits = rte_sched_min_val_2_u32(pipe->tb_credits, params->tb_size); + pipe->tb_time += n_periods * params->tb_period; + + /* Subport TCs */ + if (unlikely(port->time >= subport->tc_time)) { + subport->tc_credits[0] = subport->tc_credits_per_period[0]; + subport->tc_credits[1] = subport->tc_credits_per_period[1]; + subport->tc_credits[2] = subport->tc_credits_per_period[2]; + subport->tc_credits[3] = subport->tc_credits_per_period[3]; + subport->tc_time = port->time + subport->tc_period; + } + + /* Pipe TCs */ + if (unlikely(port->time >= pipe->tc_time)) { + pipe->tc_credits[0] = params->tc_credits_per_period[0]; + pipe->tc_credits[1] = params->tc_credits_per_period[1]; + pipe->tc_credits[2] = params->tc_credits_per_period[2]; + pipe->tc_credits[3] = params->tc_credits_per_period[3]; + pipe->tc_time = port->time + params->tc_period; + } +} + +#else + +static inline uint32_t +grinder_tc_ov_credits_update(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_sched_subport *subport = grinder->subport; + uint32_t tc_ov_consumption[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + uint32_t tc_ov_consumption_max; + uint32_t tc_ov_wm = subport->tc_ov_wm; + + if 
(subport->tc_ov == 0) + return subport->tc_ov_wm_max; + + tc_ov_consumption[0] = subport->tc_credits_per_period[0] - subport->tc_credits[0]; + tc_ov_consumption[1] = subport->tc_credits_per_period[1] - subport->tc_credits[1]; + tc_ov_consumption[2] = subport->tc_credits_per_period[2] - subport->tc_credits[2]; + tc_ov_consumption[3] = subport->tc_credits_per_period[3] - subport->tc_credits[3]; + + tc_ov_consumption_max = subport->tc_credits_per_period[3] - + (tc_ov_consumption[0] + tc_ov_consumption[1] + tc_ov_consumption[2]); + + if (tc_ov_consumption[3] > (tc_ov_consumption_max - port->mtu)) { + tc_ov_wm -= tc_ov_wm >> 7; + if (tc_ov_wm < subport->tc_ov_wm_min) + tc_ov_wm = subport->tc_ov_wm_min; + + return tc_ov_wm; + } + + tc_ov_wm += (tc_ov_wm >> 7) + 1; + if (tc_ov_wm > subport->tc_ov_wm_max) + tc_ov_wm = subport->tc_ov_wm_max; + + return tc_ov_wm; +} + +static inline void +grinder_credits_update(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_sched_subport *subport = grinder->subport; + struct rte_sched_pipe *pipe = grinder->pipe; + struct rte_sched_pipe_profile *params = grinder->pipe_params; + uint64_t n_periods; + + /* Subport TB */ + n_periods = (port->time - subport->tb_time) / subport->tb_period; + subport->tb_credits += n_periods * subport->tb_credits_per_period; + subport->tb_credits = rte_sched_min_val_2_u32(subport->tb_credits, subport->tb_size); + subport->tb_time += n_periods * subport->tb_period; + + /* Pipe TB */ + n_periods = (port->time - pipe->tb_time) / params->tb_period; + pipe->tb_credits += n_periods * params->tb_credits_per_period; + pipe->tb_credits = rte_sched_min_val_2_u32(pipe->tb_credits, params->tb_size); + pipe->tb_time += n_periods * params->tb_period; + + /* Subport TCs */ + if (unlikely(port->time >= subport->tc_time)) { + subport->tc_ov_wm = grinder_tc_ov_credits_update(port, pos); + + subport->tc_credits[0] = subport->tc_credits_per_period[0]; + subport->tc_credits[1] = subport->tc_credits_per_period[1]; + subport->tc_credits[2] = subport->tc_credits_per_period[2]; + subport->tc_credits[3] = subport->tc_credits_per_period[3]; + + subport->tc_time = port->time + subport->tc_period; + subport->tc_ov_period_id++; + } + + /* Pipe TCs */ + if (unlikely(port->time >= pipe->tc_time)) { + pipe->tc_credits[0] = params->tc_credits_per_period[0]; + pipe->tc_credits[1] = params->tc_credits_per_period[1]; + pipe->tc_credits[2] = params->tc_credits_per_period[2]; + pipe->tc_credits[3] = params->tc_credits_per_period[3]; + pipe->tc_time = port->time + params->tc_period; + } + + /* Pipe TCs - Oversubscription */ + if (unlikely(pipe->tc_ov_period_id != subport->tc_ov_period_id)) { + pipe->tc_ov_credits = subport->tc_ov_wm * params->tc_ov_weight; + + pipe->tc_ov_period_id = subport->tc_ov_period_id; + } +} + +#endif /* RTE_SCHED_TS_CREDITS_UPDATE, RTE_SCHED_SUBPORT_TC_OV */ + + +#ifndef RTE_SCHED_SUBPORT_TC_OV + +static inline int +grinder_credits_check(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_sched_subport *subport = grinder->subport; + struct rte_sched_pipe *pipe = grinder->pipe; + struct rte_mbuf *pkt = grinder->pkt; + uint32_t tc_index = grinder->tc_index; + uint32_t pkt_len = pkt->pkt_len + port->frame_overhead; + uint32_t subport_tb_credits = subport->tb_credits; + uint32_t subport_tc_credits = subport->tc_credits[tc_index]; + uint32_t pipe_tb_credits = pipe->tb_credits; + uint32_t pipe_tc_credits = 
pipe->tc_credits[tc_index]; + int enough_credits; + + /* Check queue credits */ + enough_credits = (pkt_len <= subport_tb_credits) && + (pkt_len <= subport_tc_credits) && + (pkt_len <= pipe_tb_credits) && + (pkt_len <= pipe_tc_credits); + + if (!enough_credits) + return 0; + + /* Update port credits */ + subport->tb_credits -= pkt_len; + subport->tc_credits[tc_index] -= pkt_len; + pipe->tb_credits -= pkt_len; + pipe->tc_credits[tc_index] -= pkt_len; + + return 1; +} + +#else + +static inline int +grinder_credits_check(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_sched_subport *subport = grinder->subport; + struct rte_sched_pipe *pipe = grinder->pipe; + struct rte_mbuf *pkt = grinder->pkt; + uint32_t tc_index = grinder->tc_index; + uint32_t pkt_len = pkt->pkt_len + port->frame_overhead; + uint32_t subport_tb_credits = subport->tb_credits; + uint32_t subport_tc_credits = subport->tc_credits[tc_index]; + uint32_t pipe_tb_credits = pipe->tb_credits; + uint32_t pipe_tc_credits = pipe->tc_credits[tc_index]; + uint32_t pipe_tc_ov_mask1[] = {UINT32_MAX, UINT32_MAX, UINT32_MAX, pipe->tc_ov_credits}; + uint32_t pipe_tc_ov_mask2[] = {0, 0, 0, UINT32_MAX}; + uint32_t pipe_tc_ov_credits = pipe_tc_ov_mask1[tc_index]; + int enough_credits; + + /* Check pipe and subport credits */ + enough_credits = (pkt_len <= subport_tb_credits) && + (pkt_len <= subport_tc_credits) && + (pkt_len <= pipe_tb_credits) && + (pkt_len <= pipe_tc_credits) && + (pkt_len <= pipe_tc_ov_credits); + + if (!enough_credits) + return 0; + + /* Update pipe and subport credits */ + subport->tb_credits -= pkt_len; + subport->tc_credits[tc_index] -= pkt_len; + pipe->tb_credits -= pkt_len; + pipe->tc_credits[tc_index] -= pkt_len; + pipe->tc_ov_credits -= pipe_tc_ov_mask2[tc_index] & pkt_len; + + return 1; +} + +#endif /* RTE_SCHED_SUBPORT_TC_OV */ + + +static inline int +grinder_schedule(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_sched_queue *queue = grinder->queue[grinder->qpos]; + struct rte_mbuf *pkt = grinder->pkt; + uint32_t pkt_len = pkt->pkt_len + port->frame_overhead; + + if (!grinder_credits_check(port, pos)) + return 0; + + /* Advance port time */ + port->time += pkt_len; + + /* Send packet */ + port->pkts_out[port->n_pkts_out++] = pkt; + queue->qr++; + grinder->wrr_tokens[grinder->qpos] += pkt_len * grinder->wrr_cost[grinder->qpos]; + if (queue->qr == queue->qw) { + uint32_t qindex = grinder->qindex[grinder->qpos]; + + rte_bitmap_clear(port->bmp, qindex); + grinder->qmask &= ~(1 << grinder->qpos); + grinder->wrr_mask[grinder->qpos] = 0; + rte_sched_port_set_queue_empty_timestamp(port, qindex); + } + + /* Reset pipe loop detection */ + port->pipe_loop = RTE_SCHED_PIPE_INVALID; + grinder->productive = 1; + + return 1; +} + +#ifdef SCHED_VECTOR_SSE4 + +static inline int +grinder_pipe_exists(struct rte_sched_port *port, uint32_t base_pipe) +{ + __m128i index = _mm_set1_epi32(base_pipe); + __m128i pipes = _mm_load_si128((__m128i *)port->grinder_base_bmp_pos); + __m128i res = _mm_cmpeq_epi32(pipes, index); + + pipes = _mm_load_si128((__m128i *)(port->grinder_base_bmp_pos + 4)); + pipes = _mm_cmpeq_epi32(pipes, index); + res = _mm_or_si128(res, pipes); + + if (_mm_testz_si128(res, res)) + return 0; + + return 1; +} + +#elif defined(SCHED_VECTOR_NEON) + +static inline int +grinder_pipe_exists(struct rte_sched_port *port, uint32_t base_pipe) +{ + uint32x4_t index, pipes; + uint32_t *pos = 
(uint32_t *)port->grinder_base_bmp_pos; + + index = vmovq_n_u32(base_pipe); + pipes = vld1q_u32(pos); + if (!vminvq_u32(veorq_u32(pipes, index))) + return 1; + + pipes = vld1q_u32(pos + 4); + if (!vminvq_u32(veorq_u32(pipes, index))) + return 1; + + return 0; +} + +#else + +static inline int +grinder_pipe_exists(struct rte_sched_port *port, uint32_t base_pipe) +{ + uint32_t i; + + for (i = 0; i < RTE_SCHED_PORT_N_GRINDERS; i++) { + if (port->grinder_base_bmp_pos[i] == base_pipe) + return 1; + } + + return 0; +} + +#endif /* RTE_SCHED_OPTIMIZATIONS */ + +static inline void +grinder_pcache_populate(struct rte_sched_port *port, uint32_t pos, uint32_t bmp_pos, uint64_t bmp_slab) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + uint16_t w[4]; + + grinder->pcache_w = 0; + grinder->pcache_r = 0; + + w[0] = (uint16_t) bmp_slab; + w[1] = (uint16_t) (bmp_slab >> 16); + w[2] = (uint16_t) (bmp_slab >> 32); + w[3] = (uint16_t) (bmp_slab >> 48); + + grinder->pcache_qmask[grinder->pcache_w] = w[0]; + grinder->pcache_qindex[grinder->pcache_w] = bmp_pos; + grinder->pcache_w += (w[0] != 0); + + grinder->pcache_qmask[grinder->pcache_w] = w[1]; + grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 16; + grinder->pcache_w += (w[1] != 0); + + grinder->pcache_qmask[grinder->pcache_w] = w[2]; + grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 32; + grinder->pcache_w += (w[2] != 0); + + grinder->pcache_qmask[grinder->pcache_w] = w[3]; + grinder->pcache_qindex[grinder->pcache_w] = bmp_pos + 48; + grinder->pcache_w += (w[3] != 0); +} + +static inline void +grinder_tccache_populate(struct rte_sched_port *port, uint32_t pos, uint32_t qindex, uint16_t qmask) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + uint8_t b[4]; + + grinder->tccache_w = 0; + grinder->tccache_r = 0; + + b[0] = (uint8_t) (qmask & 0xF); + b[1] = (uint8_t) ((qmask >> 4) & 0xF); + b[2] = (uint8_t) ((qmask >> 8) & 0xF); + b[3] = (uint8_t) ((qmask >> 12) & 0xF); + + grinder->tccache_qmask[grinder->tccache_w] = b[0]; + grinder->tccache_qindex[grinder->tccache_w] = qindex; + grinder->tccache_w += (b[0] != 0); + + grinder->tccache_qmask[grinder->tccache_w] = b[1]; + grinder->tccache_qindex[grinder->tccache_w] = qindex + 4; + grinder->tccache_w += (b[1] != 0); + + grinder->tccache_qmask[grinder->tccache_w] = b[2]; + grinder->tccache_qindex[grinder->tccache_w] = qindex + 8; + grinder->tccache_w += (b[2] != 0); + + grinder->tccache_qmask[grinder->tccache_w] = b[3]; + grinder->tccache_qindex[grinder->tccache_w] = qindex + 12; + grinder->tccache_w += (b[3] != 0); +} + +static inline int +grinder_next_tc(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_mbuf **qbase; + uint32_t qindex; + uint16_t qsize; + + if (grinder->tccache_r == grinder->tccache_w) + return 0; + + qindex = grinder->tccache_qindex[grinder->tccache_r]; + qbase = rte_sched_port_qbase(port, qindex); + qsize = rte_sched_port_qsize(port, qindex); + + grinder->tc_index = (qindex >> 2) & 0x3; + grinder->qmask = grinder->tccache_qmask[grinder->tccache_r]; + grinder->qsize = qsize; + + grinder->qindex[0] = qindex; + grinder->qindex[1] = qindex + 1; + grinder->qindex[2] = qindex + 2; + grinder->qindex[3] = qindex + 3; + + grinder->queue[0] = port->queue + qindex; + grinder->queue[1] = port->queue + qindex + 1; + grinder->queue[2] = port->queue + qindex + 2; + grinder->queue[3] = port->queue + qindex + 3; + + grinder->qbase[0] = qbase; + grinder->qbase[1] = qbase + qsize; + grinder->qbase[2] = 
qbase + 2 * qsize; + grinder->qbase[3] = qbase + 3 * qsize; + + grinder->tccache_r++; + return 1; +} + +static inline int +grinder_next_pipe(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + uint32_t pipe_qindex; + uint16_t pipe_qmask; + + if (grinder->pcache_r < grinder->pcache_w) { + pipe_qmask = grinder->pcache_qmask[grinder->pcache_r]; + pipe_qindex = grinder->pcache_qindex[grinder->pcache_r]; + grinder->pcache_r++; + } else { + uint64_t bmp_slab = 0; + uint32_t bmp_pos = 0; + + /* Get another non-empty pipe group */ + if (unlikely(rte_bitmap_scan(port->bmp, &bmp_pos, &bmp_slab) <= 0)) + return 0; + +#ifdef RTE_SCHED_DEBUG + debug_check_queue_slab(port, bmp_pos, bmp_slab); +#endif + + /* Return if pipe group already in one of the other grinders */ + port->grinder_base_bmp_pos[pos] = RTE_SCHED_BMP_POS_INVALID; + if (unlikely(grinder_pipe_exists(port, bmp_pos))) + return 0; + + port->grinder_base_bmp_pos[pos] = bmp_pos; + + /* Install new pipe group into grinder's pipe cache */ + grinder_pcache_populate(port, pos, bmp_pos, bmp_slab); + + pipe_qmask = grinder->pcache_qmask[0]; + pipe_qindex = grinder->pcache_qindex[0]; + grinder->pcache_r = 1; + } + + /* Install new pipe in the grinder */ + grinder->pindex = pipe_qindex >> 4; + grinder->subport = port->subport + (grinder->pindex / port->n_pipes_per_subport); + grinder->pipe = port->pipe + grinder->pindex; + grinder->pipe_params = NULL; /* to be set after the pipe structure is prefetched */ + grinder->productive = 0; + + grinder_tccache_populate(port, pos, pipe_qindex, pipe_qmask); + grinder_next_tc(port, pos); + + /* Check for pipe exhaustion */ + if (grinder->pindex == port->pipe_loop) { + port->pipe_exhaustion = 1; + port->pipe_loop = RTE_SCHED_PIPE_INVALID; + } + + return 1; +} + + +static inline void +grinder_wrr_load(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_sched_pipe *pipe = grinder->pipe; + struct rte_sched_pipe_profile *pipe_params = grinder->pipe_params; + uint32_t tc_index = grinder->tc_index; + uint32_t qmask = grinder->qmask; + uint32_t qindex; + + qindex = tc_index * 4; + + grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) << RTE_SCHED_WRR_SHIFT; + grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1]) << RTE_SCHED_WRR_SHIFT; + grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2]) << RTE_SCHED_WRR_SHIFT; + grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3]) << RTE_SCHED_WRR_SHIFT; + + grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF; + grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF; + grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF; + grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF; + + grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex]; + grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1]; + grinder->wrr_cost[2] = pipe_params->wrr_cost[qindex + 2]; + grinder->wrr_cost[3] = pipe_params->wrr_cost[qindex + 3]; +} + +static inline void +grinder_wrr_store(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + struct rte_sched_pipe *pipe = grinder->pipe; + uint32_t tc_index = grinder->tc_index; + uint32_t qindex; + + qindex = tc_index * 4; + + pipe->wrr_tokens[qindex] = (grinder->wrr_tokens[0] & grinder->wrr_mask[0]) + >> RTE_SCHED_WRR_SHIFT; + pipe->wrr_tokens[qindex + 1] = (grinder->wrr_tokens[1] & grinder->wrr_mask[1]) + >> RTE_SCHED_WRR_SHIFT; + pipe->wrr_tokens[qindex + 
2] = (grinder->wrr_tokens[2] & grinder->wrr_mask[2]) + >> RTE_SCHED_WRR_SHIFT; + pipe->wrr_tokens[qindex + 3] = (grinder->wrr_tokens[3] & grinder->wrr_mask[3]) + >> RTE_SCHED_WRR_SHIFT; +} + +static inline void +grinder_wrr(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + uint16_t wrr_tokens_min; + + grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0]; + grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1]; + grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2]; + grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3]; + + grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens); + wrr_tokens_min = grinder->wrr_tokens[grinder->qpos]; + + grinder->wrr_tokens[0] -= wrr_tokens_min; + grinder->wrr_tokens[1] -= wrr_tokens_min; + grinder->wrr_tokens[2] -= wrr_tokens_min; + grinder->wrr_tokens[3] -= wrr_tokens_min; +} + + +#define grinder_evict(port, pos) + +static inline void +grinder_prefetch_pipe(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + + rte_prefetch0(grinder->pipe); + rte_prefetch0(grinder->queue[0]); +} + +static inline void +grinder_prefetch_tc_queue_arrays(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + uint16_t qsize, qr[4]; + + qsize = grinder->qsize; + qr[0] = grinder->queue[0]->qr & (qsize - 1); + qr[1] = grinder->queue[1]->qr & (qsize - 1); + qr[2] = grinder->queue[2]->qr & (qsize - 1); + qr[3] = grinder->queue[3]->qr & (qsize - 1); + + rte_prefetch0(grinder->qbase[0] + qr[0]); + rte_prefetch0(grinder->qbase[1] + qr[1]); + + grinder_wrr_load(port, pos); + grinder_wrr(port, pos); + + rte_prefetch0(grinder->qbase[2] + qr[2]); + rte_prefetch0(grinder->qbase[3] + qr[3]); +} + +static inline void +grinder_prefetch_mbuf(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + uint32_t qpos = grinder->qpos; + struct rte_mbuf **qbase = grinder->qbase[qpos]; + uint16_t qsize = grinder->qsize; + uint16_t qr = grinder->queue[qpos]->qr & (qsize - 1); + + grinder->pkt = qbase[qr]; + rte_prefetch0(grinder->pkt); + + if (unlikely((qr & 0x7) == 7)) { + uint16_t qr_next = (grinder->queue[qpos]->qr + 1) & (qsize - 1); + + rte_prefetch0(qbase + qr_next); + } +} + +static inline uint32_t +grinder_handle(struct rte_sched_port *port, uint32_t pos) +{ + struct rte_sched_grinder *grinder = port->grinder + pos; + + switch (grinder->state) { + case e_GRINDER_PREFETCH_PIPE: + { + if (grinder_next_pipe(port, pos)) { + grinder_prefetch_pipe(port, pos); + port->busy_grinders++; + + grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS; + return 0; + } + + return 0; + } + + case e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS: + { + struct rte_sched_pipe *pipe = grinder->pipe; + + grinder->pipe_params = port->pipe_profiles + pipe->profile; + grinder_prefetch_tc_queue_arrays(port, pos); + grinder_credits_update(port, pos); + + grinder->state = e_GRINDER_PREFETCH_MBUF; + return 0; + } + + case e_GRINDER_PREFETCH_MBUF: + { + grinder_prefetch_mbuf(port, pos); + + grinder->state = e_GRINDER_READ_MBUF; + return 0; + } + + case e_GRINDER_READ_MBUF: + { + uint32_t result = 0; + + result = grinder_schedule(port, pos); + + /* Look for next packet within the same TC */ + if (result && grinder->qmask) { + grinder_wrr(port, pos); + grinder_prefetch_mbuf(port, pos); + + return 1; + } + grinder_wrr_store(port, pos); + + /* Look for another active TC within same pipe */ + if (grinder_next_tc(port, pos)) { + 
grinder_prefetch_tc_queue_arrays(port, pos); + + grinder->state = e_GRINDER_PREFETCH_MBUF; + return result; + } + + if (grinder->productive == 0 && + port->pipe_loop == RTE_SCHED_PIPE_INVALID) + port->pipe_loop = grinder->pindex; + + grinder_evict(port, pos); + + /* Look for another active pipe */ + if (grinder_next_pipe(port, pos)) { + grinder_prefetch_pipe(port, pos); + + grinder->state = e_GRINDER_PREFETCH_TC_QUEUE_ARRAYS; + return result; + } + + /* No active pipe found */ + port->busy_grinders--; + + grinder->state = e_GRINDER_PREFETCH_PIPE; + return result; + } + + default: + rte_panic("Algorithmic error (invalid state)\n"); + return 0; + } +} + +static inline void +rte_sched_port_time_resync(struct rte_sched_port *port) +{ + uint64_t cycles = rte_get_tsc_cycles(); + uint64_t cycles_diff = cycles - port->time_cpu_cycles; + uint64_t bytes_diff; + + /* Compute elapsed time in bytes */ + bytes_diff = rte_reciprocal_divide(cycles_diff << RTE_SCHED_TIME_SHIFT, + port->inv_cycles_per_byte); + + /* Advance port time */ + port->time_cpu_cycles = cycles; + port->time_cpu_bytes += bytes_diff; + if (port->time < port->time_cpu_bytes) + port->time = port->time_cpu_bytes; + + /* Reset pipe loop detection */ + port->pipe_loop = RTE_SCHED_PIPE_INVALID; +} + +static inline int +rte_sched_port_exceptions(struct rte_sched_port *port, int second_pass) +{ + int exceptions; + + /* Check if any exception flag is set */ + exceptions = (second_pass && port->busy_grinders == 0) || + (port->pipe_exhaustion == 1); + + /* Clear exception flags */ + port->pipe_exhaustion = 0; + + return exceptions; +} + +int +rte_sched_port_dequeue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts) +{ + uint32_t i, count; + + port->pkts_out = pkts; + port->n_pkts_out = 0; + + rte_sched_port_time_resync(port); + + /* Take each queue in the grinder one step further */ + for (i = 0, count = 0; ; i++) { + count += grinder_handle(port, i & (RTE_SCHED_PORT_N_GRINDERS - 1)); + if ((count == n_pkts) || + rte_sched_port_exceptions(port, i >= RTE_SCHED_PORT_N_GRINDERS)) { + break; + } + } + + return count; +} diff --git a/src/spdk/dpdk/lib/librte_sched/rte_sched.h b/src/spdk/dpdk/lib/librte_sched/rte_sched.h new file mode 100644 index 00000000..84fa896d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/rte_sched.h @@ -0,0 +1,447 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_SCHED_H__ +#define __INCLUDE_RTE_SCHED_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Hierarchical Scheduler + * + * The hierarchical scheduler prioritizes the transmission of packets + * from different users and traffic classes according to the Service + * Level Agreements (SLAs) defined for the current network node. + * + * The scheduler supports thousands of packet queues grouped under a + * 5-level hierarchy: + * 1. Port: + * - Typical usage: output Ethernet port; + * - Multiple ports are scheduled in round robin order with + * equal priority; + * 2. 
Subport: + * - Typical usage: group of users; + * - Traffic shaping using the token bucket algorithm + * (one bucket per subport); + * - Upper limit enforced per traffic class at subport level; + * - Lower priority traffic classes able to reuse subport + * bandwidth currently unused by higher priority traffic + * classes of the same subport; + * - When any subport traffic class is oversubscribed + * (configuration time event), the usage of subport member + * pipes with high demand for that traffic class is + * truncated to a dynamically adjusted value with no + * impact to low demand pipes; + * 3. Pipe: + * - Typical usage: individual user/subscriber; + * - Traffic shaping using the token bucket algorithm + * (one bucket per pipe); + * 4. Traffic class: + * - Traffic classes of the same pipe handled in strict + * priority order; + * - Upper limit enforced per traffic class at the pipe level; + * - Lower priority traffic classes able to reuse pipe + * bandwidth currently unused by higher priority traffic + * classes of the same pipe; + * 5. Queue: + * - Typical usage: queue hosting packets from one or + * multiple connections of the same traffic class belonging to + * the same user; + * - Weighted Round Robin (WRR) is used to service the + * queues within the same pipe traffic class. + * + */ + +#include +#include +#include +#include + +/** Random Early Detection (RED) */ +#ifdef RTE_SCHED_RED +#include "rte_red.h" +#endif + +/** Number of traffic classes per pipe (as well as subport). + * Cannot be changed. + */ +#define RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE 4 + +/** Number of queues per pipe traffic class. Cannot be changed. */ +#define RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS 4 + +/** Number of queues per pipe. */ +#define RTE_SCHED_QUEUES_PER_PIPE \ + (RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE * \ + RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) + +/** Maximum number of pipe profiles that can be defined per port. + * Compile-time configurable. + */ +#ifndef RTE_SCHED_PIPE_PROFILES_PER_PORT +#define RTE_SCHED_PIPE_PROFILES_PER_PORT 256 +#endif + +/* + * Ethernet framing overhead. Overhead fields per Ethernet frame: + * 1. Preamble: 7 bytes; + * 2. Start of Frame Delimiter (SFD): 1 byte; + * 3. Frame Check Sequence (FCS): 4 bytes; + * 4. Inter Frame Gap (IFG): 12 bytes. + * + * The FCS is considered overhead only if not included in the packet + * length (field pkt_len of struct rte_mbuf). + */ +#ifndef RTE_SCHED_FRAME_OVERHEAD_DEFAULT +#define RTE_SCHED_FRAME_OVERHEAD_DEFAULT 24 +#endif + +/* + * Subport configuration parameters. The period and credits_per_period + * parameters are measured in bytes, with one byte meaning the time + * duration associated with the transmission of one byte on the + * physical medium of the output port, with pipe or pipe traffic class + * rate (measured as percentage of output port rate) determined as + * credits_per_period divided by period. One credit represents one + * byte.
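+ * For instance, with tc_period = 10 ms and tc_rate[0] = 125000000 bytes/s
+ * (roughly 1 Gbit/s), traffic class 0 of the subport is refilled with
+ * 1250000 credits (bytes) at the start of each enforcement period; the
+ * figures are illustrative only.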
+ */ +struct rte_sched_subport_params { + /* Subport token bucket */ + uint32_t tb_rate; /**< Rate (measured in bytes per second) */ + uint32_t tb_size; /**< Size (measured in credits) */ + + /* Subport traffic classes */ + uint32_t tc_rate[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + /**< Traffic class rates (measured in bytes per second) */ + uint32_t tc_period; + /**< Enforcement period for rates (measured in milliseconds) */ +}; + +/** Subport statistics */ +struct rte_sched_subport_stats { + /* Packets */ + uint32_t n_pkts_tc[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + /**< Number of packets successfully written */ + uint32_t n_pkts_tc_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + /**< Number of packets dropped */ + + /* Bytes */ + uint32_t n_bytes_tc[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + /**< Number of bytes successfully written for each traffic class */ + uint32_t n_bytes_tc_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + /**< Number of bytes dropped for each traffic class */ + +#ifdef RTE_SCHED_RED + uint32_t n_pkts_red_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + /**< Number of packets dropped by red */ +#endif +}; + +/* + * Pipe configuration parameters. The period and credits_per_period + * parameters are measured in bytes, with one byte meaning the time + * duration associated with the transmission of one byte on the + * physical medium of the output port, with pipe or pipe traffic class + * rate (measured as percentage of output port rate) determined as + * credits_per_period divided by period. One credit represents one + * byte. + */ +struct rte_sched_pipe_params { + /* Pipe token bucket */ + uint32_t tb_rate; /**< Rate (measured in bytes per second) */ + uint32_t tb_size; /**< Size (measured in credits) */ + + /* Pipe traffic classes */ + uint32_t tc_rate[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + /**< Traffic class rates (measured in bytes per second) */ + uint32_t tc_period; + /**< Enforcement period (measured in milliseconds) */ +#ifdef RTE_SCHED_SUBPORT_TC_OV + uint8_t tc_ov_weight; /**< Weight Traffic class 3 oversubscription */ +#endif + + /* Pipe queues */ + uint8_t wrr_weights[RTE_SCHED_QUEUES_PER_PIPE]; /**< WRR weights */ +}; + +/** Queue statistics */ +struct rte_sched_queue_stats { + /* Packets */ + uint32_t n_pkts; /**< Packets successfully written */ + uint32_t n_pkts_dropped; /**< Packets dropped */ +#ifdef RTE_SCHED_RED + uint32_t n_pkts_red_dropped; /**< Packets dropped by RED */ +#endif + + /* Bytes */ + uint32_t n_bytes; /**< Bytes successfully written */ + uint32_t n_bytes_dropped; /**< Bytes dropped */ +}; + +/** Port configuration parameters. */ +struct rte_sched_port_params { + const char *name; /**< String to be associated */ + int socket; /**< CPU socket ID */ + uint32_t rate; /**< Output port rate + * (measured in bytes per second) */ + uint32_t mtu; /**< Maximum Ethernet frame size + * (measured in bytes). + * Should not include the framing overhead. */ + uint32_t frame_overhead; /**< Framing overhead per packet + * (measured in bytes) */ + uint32_t n_subports_per_port; /**< Number of subports */ + uint32_t n_pipes_per_subport; /**< Number of pipes per subport */ + uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; + /**< Packet queue size for each traffic class. + * All queues within the same pipe traffic class have the same + * size. Queues from different pipes serving the same traffic + * class have the same size. */ + struct rte_sched_pipe_params *pipe_profiles; + /**< Pipe profile table. 
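+ * A minimal illustrative entry for this table (all numbers are
+ * placeholders, not tuning recommendations):
+ *
+ *	static struct rte_sched_pipe_params pipe_profiles[] = {
+ *		{
+ *			.tb_rate = 305175, .tb_size = 1000000,
+ *			.tc_rate = {305175, 305175, 305175, 305175},
+ *			.tc_period = 40,
+ *			.wrr_weights = {1, 1, 1, 1, 1, 1, 1, 1,
+ *					1, 1, 1, 1, 1, 1, 1, 1},
+ *		},
+ *	};
+ *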
+ * Every pipe is configured using one of the profiles from this table. */ + uint32_t n_pipe_profiles; /**< Profiles in the pipe profile table */ +#ifdef RTE_SCHED_RED + struct rte_red_params red_params[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][e_RTE_METER_COLORS]; /**< RED parameters */ +#endif +}; + +/* + * Configuration + * + ***/ + +/** + * Hierarchical scheduler port configuration + * + * @param params + * Port scheduler configuration parameter structure + * @return + * Handle to port scheduler instance upon success or NULL otherwise. + */ +struct rte_sched_port * +rte_sched_port_config(struct rte_sched_port_params *params); + +/** + * Hierarchical scheduler port free + * + * @param port + * Handle to port scheduler instance + */ +void +rte_sched_port_free(struct rte_sched_port *port); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. + * + * Hierarchical scheduler pipe profile add + * + * @param port + * Handle to port scheduler instance + * @param params + * Pipe profile parameters + * @param pipe_profile_id + * Set to valid profile id when profile is added successfully. + * @return + * 0 upon success, error code otherwise + */ +int __rte_experimental +rte_sched_port_pipe_profile_add(struct rte_sched_port *port, + struct rte_sched_pipe_params *params, + uint32_t *pipe_profile_id); + +/** + * Hierarchical scheduler subport configuration + * + * @param port + * Handle to port scheduler instance + * @param subport_id + * Subport ID + * @param params + * Subport configuration parameters + * @return + * 0 upon success, error code otherwise + */ +int +rte_sched_subport_config(struct rte_sched_port *port, + uint32_t subport_id, + struct rte_sched_subport_params *params); + +/** + * Hierarchical scheduler pipe configuration + * + * @param port + * Handle to port scheduler instance + * @param subport_id + * Subport ID + * @param pipe_id + * Pipe ID within subport + * @param pipe_profile + * ID of port-level pre-configured pipe profile + * @return + * 0 upon success, error code otherwise + */ +int +rte_sched_pipe_config(struct rte_sched_port *port, + uint32_t subport_id, + uint32_t pipe_id, + int32_t pipe_profile); + +/** + * Hierarchical scheduler memory footprint size per port + * + * @param params + * Port scheduler configuration parameter structure + * @return + * Memory footprint size in bytes upon success, 0 otherwise + */ +uint32_t +rte_sched_port_get_memory_footprint(struct rte_sched_port_params *params); + +/* + * Statistics + * + ***/ + +/** + * Hierarchical scheduler subport statistics read + * + * @param port + * Handle to port scheduler instance + * @param subport_id + * Subport ID + * @param stats + * Pointer to pre-allocated subport statistics structure where the statistics + * counters should be stored + * @param tc_ov + * Pointer to pre-allocated 4-entry array where the oversubscription status for + * each of the 4 subport traffic classes should be stored. 
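+ *
+ * A minimal polling sketch (the port handle and subport 0 are assumed to
+ * be configured already; error handling omitted):
+ *
+ *	struct rte_sched_subport_stats st;
+ *	uint32_t tc_ov[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
+ *
+ *	if (rte_sched_subport_read_stats(port, 0, &st, tc_ov) == 0)
+ *		printf("tc0: %u pkts, %u dropped\n",
+ *		       st.n_pkts_tc[0], st.n_pkts_tc_dropped[0]);
+ *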
+ * @return + * 0 upon success, error code otherwise + */ +int +rte_sched_subport_read_stats(struct rte_sched_port *port, + uint32_t subport_id, + struct rte_sched_subport_stats *stats, + uint32_t *tc_ov); + +/** + * Hierarchical scheduler queue statistics read + * + * @param port + * Handle to port scheduler instance + * @param queue_id + * Queue ID within port scheduler + * @param stats + * Pointer to pre-allocated subport statistics structure where the statistics + * counters should be stored + * @param qlen + * Pointer to pre-allocated variable where the current queue length + * should be stored. + * @return + * 0 upon success, error code otherwise + */ +int +rte_sched_queue_read_stats(struct rte_sched_port *port, + uint32_t queue_id, + struct rte_sched_queue_stats *stats, + uint16_t *qlen); + +/** + * Scheduler hierarchy path write to packet descriptor. Typically + * called by the packet classification stage. + * + * @param pkt + * Packet descriptor handle + * @param subport + * Subport ID + * @param pipe + * Pipe ID within subport + * @param traffic_class + * Traffic class ID within pipe (0 .. 3) + * @param queue + * Queue ID within pipe traffic class (0 .. 3) + * @param color + * Packet color set + */ +void +rte_sched_port_pkt_write(struct rte_mbuf *pkt, + uint32_t subport, uint32_t pipe, uint32_t traffic_class, + uint32_t queue, enum rte_meter_color color); + +/** + * Scheduler hierarchy path read from packet descriptor (struct + * rte_mbuf). Typically called as part of the hierarchical scheduler + * enqueue operation. The subport, pipe, traffic class and queue + * parameters need to be pre-allocated by the caller. + * + * @param pkt + * Packet descriptor handle + * @param subport + * Subport ID + * @param pipe + * Pipe ID within subport + * @param traffic_class + * Traffic class ID within pipe (0 .. 3) + * @param queue + * Queue ID within pipe traffic class (0 .. 3) + * + */ +void +rte_sched_port_pkt_read_tree_path(const struct rte_mbuf *pkt, + uint32_t *subport, uint32_t *pipe, + uint32_t *traffic_class, uint32_t *queue); + +enum rte_meter_color +rte_sched_port_pkt_read_color(const struct rte_mbuf *pkt); + +/** + * Hierarchical scheduler port enqueue. Writes up to n_pkts to port + * scheduler and returns the number of packets actually written. For + * each packet, the port scheduler queue to write the packet to is + * identified by reading the hierarchy path from the packet + * descriptor; if the queue is full or congested and the packet is not + * written to the queue, then the packet is automatically dropped + * without any action required from the caller. + * + * @param port + * Handle to port scheduler instance + * @param pkts + * Array storing the packet descriptor handles + * @param n_pkts + * Number of packets to enqueue from the pkts array into the port scheduler + * @return + * Number of packets successfully enqueued + */ +int +rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts); + +/** + * Hierarchical scheduler port dequeue. Reads up to n_pkts from the + * port scheduler and stores them in the pkts array and returns the + * number of packets actually read. The pkts array needs to be + * pre-allocated by the caller with at least n_pkts entries. 
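+ * A minimal transmit-thread sketch (the scheduler handle "port" and the
+ * Ethernet port/queue identifiers are illustrative; error handling
+ * omitted):
+ *
+ *	struct rte_mbuf *tx_burst[32];
+ *	int n = rte_sched_port_dequeue(port, tx_burst, 32);
+ *
+ *	if (n > 0)
+ *		rte_eth_tx_burst(tx_port_id, 0, tx_burst, (uint16_t)n);
+ *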
+ * + * @param port + * Handle to port scheduler instance + * @param pkts + * Pre-allocated packet descriptor array where the packets dequeued + * from the port + * scheduler should be stored + * @param n_pkts + * Number of packets to dequeue from the port scheduler + * @return + * Number of packets successfully dequeued and placed in the pkts array + */ +int +rte_sched_port_dequeue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts); + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_SCHED_H__ */ diff --git a/src/spdk/dpdk/lib/librte_sched/rte_sched_common.h b/src/spdk/dpdk/lib/librte_sched/rte_sched_common.h new file mode 100644 index 00000000..8c191a9b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/rte_sched_common.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_SCHED_COMMON_H__ +#define __INCLUDE_RTE_SCHED_COMMON_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#define __rte_aligned_16 __attribute__((__aligned__(16))) + +static inline uint32_t +rte_sched_min_val_2_u32(uint32_t x, uint32_t y) +{ + return (x < y)? x : y; +} + +#if 0 +static inline uint32_t +rte_min_pos_4_u16(uint16_t *x) +{ + uint32_t pos0, pos1; + + pos0 = (x[0] <= x[1])? 0 : 1; + pos1 = (x[2] <= x[3])? 2 : 3; + + return (x[pos0] <= x[pos1])? pos0 : pos1; +} + +#else + +/* simplified version to remove branches with CMOV instruction */ +static inline uint32_t +rte_min_pos_4_u16(uint16_t *x) +{ + uint32_t pos0 = 0; + uint32_t pos1 = 2; + + if (x[1] <= x[0]) pos0 = 1; + if (x[3] <= x[2]) pos1 = 3; + if (x[pos1] <= x[pos0]) pos0 = pos1; + + return pos0; +} + +#endif + +/* + * Compute the Greatest Common Divisor (GCD) of two numbers. + * This implementation uses Euclid's algorithm: + * gcd(a, 0) = a + * gcd(a, b) = gcd(b, a mod b) + * + */ +static inline uint32_t +rte_get_gcd(uint32_t a, uint32_t b) +{ + uint32_t c; + + if (a == 0) + return b; + if (b == 0) + return a; + + if (a < b) { + c = a; + a = b; + b = c; + } + + while (b != 0) { + c = a % b; + a = b; + b = c; + } + + return a; +} + +/* + * Compute the Lowest Common Denominator (LCD) of two numbers. 
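+ * For example, rte_get_gcd(12, 18) = 6 and rte_get_lcd(12, 18) = 36.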
+ * This implementation computes GCD first: + * LCD(a, b) = (a * b) / GCD(a, b) + * + */ +static inline uint32_t +rte_get_lcd(uint32_t a, uint32_t b) +{ + return (a * b) / rte_get_gcd(a, b); +} + +#ifdef __cplusplus +} +#endif + +#endif /* __INCLUDE_RTE_SCHED_COMMON_H__ */ diff --git a/src/spdk/dpdk/lib/librte_sched/rte_sched_version.map b/src/spdk/dpdk/lib/librte_sched/rte_sched_version.map new file mode 100644 index 00000000..72958879 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_sched/rte_sched_version.map @@ -0,0 +1,37 @@ +DPDK_2.0 { + global: + + rte_approx; + rte_red_config_init; + rte_red_log2_1_minus_Wq; + rte_red_pow2_frac_inv; + rte_red_rand_seed; + rte_red_rand_val; + rte_red_rt_data_init; + rte_sched_pipe_config; + rte_sched_port_config; + rte_sched_port_dequeue; + rte_sched_port_enqueue; + rte_sched_port_free; + rte_sched_port_get_memory_footprint; + rte_sched_queue_read_stats; + rte_sched_subport_config; + rte_sched_subport_read_stats; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_sched_port_pkt_write; + rte_sched_port_pkt_read_tree_path; + rte_sched_port_pkt_read_color; + +} DPDK_2.0; + +EXPERIMENTAL { + global: + + rte_sched_port_pipe_profile_add; +}; diff --git a/src/spdk/dpdk/lib/librte_security/Makefile b/src/spdk/dpdk/lib/librte_security/Makefile new file mode 100644 index 00000000..8daebea4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_security/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_security.a + +# library version +LIBABIVER := 1 + +# build flags +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +LDLIBS += -lrte_eal -lrte_mempool + +# library source files +SRCS-y += rte_security.c + +# export include files +SYMLINK-y-include += rte_security.h +SYMLINK-y-include += rte_security_driver.h + +# versioning export map +EXPORT_MAP := rte_security_version.map + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_security/meson.build b/src/spdk/dpdk/lib/librte_security/meson.build new file mode 100644 index 00000000..4c85894c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_security/meson.build @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +allow_experimental_apis = true +sources = files('rte_security.c') +headers = files('rte_security.h', 'rte_security_driver.h') +deps += ['mempool', 'cryptodev'] diff --git a/src/spdk/dpdk/lib/librte_security/rte_security.c b/src/spdk/dpdk/lib/librte_security/rte_security.c new file mode 100644 index 00000000..1954960a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_security/rte_security.c @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP. + * Copyright(c) 2017 Intel Corporation. 
+ */ + +#include +#include +#include "rte_compat.h" +#include "rte_security.h" +#include "rte_security_driver.h" + +struct rte_security_session * +__rte_experimental rte_security_session_create(struct rte_security_ctx *instance, + struct rte_security_session_conf *conf, + struct rte_mempool *mp) +{ + struct rte_security_session *sess = NULL; + + if (conf == NULL) + return NULL; + + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_create, NULL); + + if (rte_mempool_get(mp, (void **)&sess)) + return NULL; + + if (instance->ops->session_create(instance->device, conf, sess, mp)) { + rte_mempool_put(mp, (void *)sess); + return NULL; + } + instance->sess_cnt++; + + return sess; +} + +int __rte_experimental +rte_security_session_update(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_security_session_conf *conf) +{ + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_update, -ENOTSUP); + return instance->ops->session_update(instance->device, sess, conf); +} + +unsigned int __rte_experimental +rte_security_session_get_size(struct rte_security_ctx *instance) +{ + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_get_size, 0); + return instance->ops->session_get_size(instance->device); +} + +int __rte_experimental +rte_security_session_stats_get(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_security_stats *stats) +{ + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_stats_get, -ENOTSUP); + return instance->ops->session_stats_get(instance->device, sess, stats); +} + +int __rte_experimental +rte_security_session_destroy(struct rte_security_ctx *instance, + struct rte_security_session *sess) +{ + int ret; + + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_destroy, -ENOTSUP); + + if (instance->sess_cnt) + instance->sess_cnt--; + + ret = instance->ops->session_destroy(instance->device, sess); + if (!ret) + rte_mempool_put(rte_mempool_from_obj(sess), (void *)sess); + + return ret; +} + +int __rte_experimental +rte_security_set_pkt_metadata(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_mbuf *m, void *params) +{ + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->set_pkt_metadata, -ENOTSUP); + return instance->ops->set_pkt_metadata(instance->device, + sess, m, params); +} + +void * __rte_experimental +rte_security_get_userdata(struct rte_security_ctx *instance, uint64_t md) +{ + void *userdata = NULL; + + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->get_userdata, NULL); + if (instance->ops->get_userdata(instance->device, md, &userdata)) + return NULL; + + return userdata; +} + +const struct rte_security_capability * __rte_experimental +rte_security_capabilities_get(struct rte_security_ctx *instance) +{ + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->capabilities_get, NULL); + return instance->ops->capabilities_get(instance->device); +} + +const struct rte_security_capability * __rte_experimental +rte_security_capability_get(struct rte_security_ctx *instance, + struct rte_security_capability_idx *idx) +{ + const struct rte_security_capability *capabilities; + const struct rte_security_capability *capability; + uint16_t i = 0; + + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->capabilities_get, NULL); + capabilities = instance->ops->capabilities_get(instance->device); + + if (capabilities == NULL) + return NULL; + + while ((capability = &capabilities[i++])->action + != RTE_SECURITY_ACTION_TYPE_NONE) { + if (capability->action == idx->action && + capability->protocol == idx->protocol) { + if (idx->protocol == 
RTE_SECURITY_PROTOCOL_IPSEC) { + if (capability->ipsec.proto == + idx->ipsec.proto && + capability->ipsec.mode == + idx->ipsec.mode && + capability->ipsec.direction == + idx->ipsec.direction) + return capability; + } + } + } + + return NULL; +} diff --git a/src/spdk/dpdk/lib/librte_security/rte_security.h b/src/spdk/dpdk/lib/librte_security/rte_security.h new file mode 100644 index 00000000..b0d1b97e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_security/rte_security.h @@ -0,0 +1,543 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP. + * Copyright(c) 2017 Intel Corporation. + */ + +#ifndef _RTE_SECURITY_H_ +#define _RTE_SECURITY_H_ + +/** + * @file rte_security.h + * @b EXPERIMENTAL: this API may change without prior notice + * + * RTE Security Common Definitions + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/** IPSec protocol mode */ +enum rte_security_ipsec_sa_mode { + RTE_SECURITY_IPSEC_SA_MODE_TRANSPORT = 1, + /**< IPSec Transport mode */ + RTE_SECURITY_IPSEC_SA_MODE_TUNNEL, + /**< IPSec Tunnel mode */ +}; + +/** IPSec Protocol */ +enum rte_security_ipsec_sa_protocol { + RTE_SECURITY_IPSEC_SA_PROTO_AH = 1, + /**< AH protocol */ + RTE_SECURITY_IPSEC_SA_PROTO_ESP, + /**< ESP protocol */ +}; + +/** IPSEC tunnel type */ +enum rte_security_ipsec_tunnel_type { + RTE_SECURITY_IPSEC_TUNNEL_IPV4 = 1, + /**< Outer header is IPv4 */ + RTE_SECURITY_IPSEC_TUNNEL_IPV6, + /**< Outer header is IPv6 */ +}; + +/** + * Security context for crypto/eth devices + * + * Security instance for each driver to register security operations. + * The application can get the security context from the crypto/eth device id + * using the APIs rte_cryptodev_get_sec_ctx()/rte_eth_dev_get_sec_ctx() + * This structure is used to identify the device(crypto/eth) for which the + * security operations need to be performed. + */ +struct rte_security_ctx { + void *device; + /**< Crypto/ethernet device attached */ + const struct rte_security_ops *ops; + /**< Pointer to security ops for the device */ + uint16_t sess_cnt; + /**< Number of sessions attached to this context */ +}; + +/** + * IPSEC tunnel parameters + * + * These parameters are used to build outbound tunnel headers. + */ +struct rte_security_ipsec_tunnel_param { + enum rte_security_ipsec_tunnel_type type; + /**< Tunnel type: IPv4 or IPv6 */ + RTE_STD_C11 + union { + struct { + struct in_addr src_ip; + /**< IPv4 source address */ + struct in_addr dst_ip; + /**< IPv4 destination address */ + uint8_t dscp; + /**< IPv4 Differentiated Services Code Point */ + uint8_t df; + /**< IPv4 Don't Fragment bit */ + uint8_t ttl; + /**< IPv4 Time To Live */ + } ipv4; + /**< IPv4 header parameters */ + struct { + struct in6_addr src_addr; + /**< IPv6 source address */ + struct in6_addr dst_addr; + /**< IPv6 destination address */ + uint8_t dscp; + /**< IPv6 Differentiated Services Code Point */ + uint32_t flabel; + /**< IPv6 flow label */ + uint8_t hlimit; + /**< IPv6 hop limit */ + } ipv6; + /**< IPv6 header parameters */ + }; +}; + +/** + * IPsec Security Association option flags + */ +struct rte_security_ipsec_sa_options { + /**< Extended Sequence Numbers (ESN) + * + * * 1: Use extended (64 bit) sequence numbers + * * 0: Use normal sequence numbers + */ + uint32_t esn : 1; + + /**< UDP encapsulation + * + * * 1: Do UDP encapsulation/decapsulation so that IPSEC packets can + * traverse through NAT boxes. 
+ * * 0: No UDP encapsulation + */ + uint32_t udp_encap : 1; + + /**< Copy DSCP bits + * + * * 1: Copy IPv4 or IPv6 DSCP bits from inner IP header to + * the outer IP header in encapsulation, and vice versa in + * decapsulation. + * * 0: Do not change DSCP field. + */ + uint32_t copy_dscp : 1; + + /**< Copy IPv6 Flow Label + * + * * 1: Copy IPv6 flow label from inner IPv6 header to the + * outer IPv6 header. + * * 0: Outer header is not modified. + */ + uint32_t copy_flabel : 1; + + /**< Copy IPv4 Don't Fragment bit + * + * * 1: Copy the DF bit from the inner IPv4 header to the outer + * IPv4 header. + * * 0: Outer header is not modified. + */ + uint32_t copy_df : 1; + + /**< Decrement inner packet Time To Live (TTL) field + * + * * 1: In tunnel mode, decrement inner packet IPv4 TTL or + * IPv6 Hop Limit after tunnel decapsulation, or before tunnel + * encapsulation. + * * 0: Inner packet is not modified. + */ + uint32_t dec_ttl : 1; +}; + +/** IPSec security association direction */ +enum rte_security_ipsec_sa_direction { + RTE_SECURITY_IPSEC_SA_DIR_EGRESS, + /**< Encrypt and generate digest */ + RTE_SECURITY_IPSEC_SA_DIR_INGRESS, + /**< Verify digest and decrypt */ +}; + +/** + * IPsec security association configuration data. + * + * This structure contains data required to create an IPsec SA security session. + */ +struct rte_security_ipsec_xform { + uint32_t spi; + /**< SA security parameter index */ + uint32_t salt; + /**< SA salt */ + struct rte_security_ipsec_sa_options options; + /**< various SA options */ + enum rte_security_ipsec_sa_direction direction; + /**< IPSec SA Direction - Egress/Ingress */ + enum rte_security_ipsec_sa_protocol proto; + /**< IPsec SA Protocol - AH/ESP */ + enum rte_security_ipsec_sa_mode mode; + /**< IPsec SA Mode - transport/tunnel */ + struct rte_security_ipsec_tunnel_param tunnel; + /**< Tunnel parameters, NULL for transport mode */ + uint64_t esn_soft_limit; + /**< ESN for which the overflow event need to be raised */ +}; + +/** + * MACsec security session configuration + */ +struct rte_security_macsec_xform { + /** To be Filled */ + int dummy; +}; + +/** + * Security session action type. 
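+ * The action type is selected per session through the action_type field
+ * of struct rte_security_session_conf below. An illustrative
+ * inline-protocol IPsec egress configuration (SPI value, crypto transform
+ * and the omitted tunnel addresses are placeholders, not a complete,
+ * validated configuration):
+ *
+ *	struct rte_security_session_conf conf = {
+ *		.action_type = RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL,
+ *		.protocol = RTE_SECURITY_PROTOCOL_IPSEC,
+ *		.ipsec = {
+ *			.spi = 0x1000,
+ *			.proto = RTE_SECURITY_IPSEC_SA_PROTO_ESP,
+ *			.mode = RTE_SECURITY_IPSEC_SA_MODE_TUNNEL,
+ *			.direction = RTE_SECURITY_IPSEC_SA_DIR_EGRESS,
+ *		},
+ *		.crypto_xform = &aead_xform,
+ *	};
+ *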
+ */ +enum rte_security_session_action_type { + RTE_SECURITY_ACTION_TYPE_NONE, + /**< No security actions */ + RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO, + /**< Crypto processing for security protocol is processed inline + * during transmission + */ + RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL, + /**< All security protocol processing is performed inline during + * transmission + */ + RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL + /**< All security protocol processing including crypto is performed + * on a lookaside accelerator + */ +}; + +/** Security session protocol definition */ +enum rte_security_session_protocol { + RTE_SECURITY_PROTOCOL_IPSEC = 1, + /**< IPsec Protocol */ + RTE_SECURITY_PROTOCOL_MACSEC, + /**< MACSec Protocol */ +}; + +/** + * Security session configuration + */ +struct rte_security_session_conf { + enum rte_security_session_action_type action_type; + /**< Type of action to be performed on the session */ + enum rte_security_session_protocol protocol; + /**< Security protocol to be configured */ + RTE_STD_C11 + union { + struct rte_security_ipsec_xform ipsec; + struct rte_security_macsec_xform macsec; + }; + /**< Configuration parameters for security session */ + struct rte_crypto_sym_xform *crypto_xform; + /**< Security Session Crypto Transformations */ + void *userdata; + /**< Application specific userdata to be saved with session */ +}; + +struct rte_security_session { + void *sess_private_data; + /**< Private session material */ +}; + +/** + * Create security session as specified by the session configuration + * + * @param instance security instance + * @param conf session configuration parameters + * @param mp mempool to allocate session objects from + * @return + * - On success, pointer to session + * - On failure, NULL + */ +struct rte_security_session * __rte_experimental +rte_security_session_create(struct rte_security_ctx *instance, + struct rte_security_session_conf *conf, + struct rte_mempool *mp); + +/** + * Update security session as specified by the session configuration + * + * @param instance security instance + * @param sess session to update parameters + * @param conf update configuration parameters + * @return + * - On success returns 0 + * - On failure return errno + */ +int __rte_experimental +rte_security_session_update(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_security_session_conf *conf); + +/** + * Get the size of the security session data for a device. + * + * @param instance security instance. + * + * @return + * - Size of the private data, if successful + * - 0 if device is invalid or does not support the operation. + */ +unsigned int __rte_experimental +rte_security_session_get_size(struct rte_security_ctx *instance); + +/** + * Free security session header and the session private data and + * return it to its original mempool. + * + * @param instance security instance + * @param sess security session to freed + * + * @return + * - 0 if successful. + * - -EINVAL if session is NULL. + * - -EBUSY if not all device private data has been freed. + */ +int __rte_experimental +rte_security_session_destroy(struct rte_security_ctx *instance, + struct rte_security_session *sess); + +/** + * Updates the buffer with device-specific defined metadata + * + * @param instance security instance + * @param sess security session + * @param mb packet mbuf to set metadata on. + * @param params device-specific defined parameters + * required for metadata + * + * @return + * - On success, zero. 
+ * - On failure, a negative value. + */ +int __rte_experimental +rte_security_set_pkt_metadata(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_mbuf *mb, void *params); + +/** + * Get userdata associated with the security session. Device specific metadata + * provided would be used to uniquely identify the security session being + * referred to. This userdata would be registered while creating the session, + * and application can use this to identify the SA etc. + * + * Device specific metadata would be set in mbuf for inline processed inbound + * packets. In addition, the same metadata would be set for IPsec events + * reported by rte_eth_event framework. + * + * @param instance security instance + * @param md device-specific metadata + * + * @return + * - On success, userdata + * - On failure, NULL + */ +void * __rte_experimental +rte_security_get_userdata(struct rte_security_ctx *instance, uint64_t md); + +/** + * Attach a session to a symmetric crypto operation + * + * @param sym_op crypto operation + * @param sess security session + */ +static inline int __rte_experimental +__rte_security_attach_session(struct rte_crypto_sym_op *sym_op, + struct rte_security_session *sess) +{ + sym_op->sec_session = sess; + + return 0; +} + +static inline void * __rte_experimental +get_sec_session_private_data(const struct rte_security_session *sess) +{ + return sess->sess_private_data; +} + +static inline void __rte_experimental +set_sec_session_private_data(struct rte_security_session *sess, + void *private_data) +{ + sess->sess_private_data = private_data; +} + +/** + * Attach a session to a crypto operation. + * This API is needed only in case of RTE_SECURITY_SESS_CRYPTO_PROTO_OFFLOAD + * For other rte_security_session_action_type, ol_flags in rte_mbuf may be + * defined to perform security operations. 
+ * + * @param op crypto operation + * @param sess security session + */ +static inline int __rte_experimental +rte_security_attach_session(struct rte_crypto_op *op, + struct rte_security_session *sess) +{ + if (unlikely(op->type != RTE_CRYPTO_OP_TYPE_SYMMETRIC)) + return -EINVAL; + + op->sess_type = RTE_CRYPTO_OP_SECURITY_SESSION; + + return __rte_security_attach_session(op->sym, sess); +} + +struct rte_security_macsec_stats { + uint64_t reserved; +}; + +struct rte_security_ipsec_stats { + uint64_t reserved; + +}; + +struct rte_security_stats { + enum rte_security_session_protocol protocol; + /**< Security protocol to be configured */ + + RTE_STD_C11 + union { + struct rte_security_macsec_stats macsec; + struct rte_security_ipsec_stats ipsec; + }; +}; + +/** + * Get security session statistics + * + * @param instance security instance + * @param sess security session + * @param stats statistics + * @return + * - On success return 0 + * - On failure errno + */ +int __rte_experimental +rte_security_session_stats_get(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_security_stats *stats); + +/** + * Security capability definition + */ +struct rte_security_capability { + enum rte_security_session_action_type action; + /**< Security action type*/ + enum rte_security_session_protocol protocol; + /**< Security protocol */ + RTE_STD_C11 + union { + struct { + enum rte_security_ipsec_sa_protocol proto; + /**< IPsec SA protocol */ + enum rte_security_ipsec_sa_mode mode; + /**< IPsec SA mode */ + enum rte_security_ipsec_sa_direction direction; + /**< IPsec SA direction */ + struct rte_security_ipsec_sa_options options; + /**< IPsec SA supported options */ + } ipsec; + /**< IPsec capability */ + struct { + /* To be Filled */ + int dummy; + } macsec; + /**< MACsec capability */ + }; + + const struct rte_cryptodev_capabilities *crypto_capabilities; + /**< Corresponding crypto capabilities for security capability */ + + uint32_t ol_flags; + /**< Device offload flags */ +}; + +#define RTE_SECURITY_TX_OLOAD_NEED_MDATA 0x00000001 +/**< HW needs metadata update, see rte_security_set_pkt_metadata(). + */ + +#define RTE_SECURITY_TX_HW_TRAILER_OFFLOAD 0x00000002 +/**< HW constructs trailer of packets + * Transmitted packets will have the trailer added to them + * by hardawre. The next protocol field will be based on + * the mbuf->inner_esp_next_proto field. + */ +#define RTE_SECURITY_RX_HW_TRAILER_OFFLOAD 0x00010000 +/**< HW removes trailer of packets + * Received packets have no trailer, the next protocol field + * is supplied in the mbuf->inner_esp_next_proto field. + * Inner packet is not modified. + */ + +/** + * Security capability index used to query a security instance for a specific + * security capability + */ +struct rte_security_capability_idx { + enum rte_security_session_action_type action; + enum rte_security_session_protocol protocol; + + RTE_STD_C11 + union { + struct { + enum rte_security_ipsec_sa_protocol proto; + enum rte_security_ipsec_sa_mode mode; + enum rte_security_ipsec_sa_direction direction; + } ipsec; + }; +}; + +/** + * Returns array of security instance capabilities + * + * @param instance Security instance. + * + * @return + * - Returns array of security capabilities. + * - Return NULL if no capabilities available. 
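As an illustration of the session API declared above (this sketch is not part of the upstream patch), an application using the lookaside-protocol action type fills a session configuration, creates the session on a device's security instance and attaches it to a symmetric crypto operation. The security context, session mempool and crypto transform chain are assumed to be set up elsewhere; the SPI and option values are placeholders.

#include <rte_crypto.h>
#include <rte_mempool.h>
#include <rte_security.h>

/* Illustrative helper: create an egress ESP transport-mode session and
 * attach it to an already-built symmetric crypto operation. */
static int
ipsec_session_setup(struct rte_security_ctx *ctx, struct rte_mempool *sess_mp,
	struct rte_crypto_sym_xform *crypto_xform, struct rte_crypto_op *cop)
{
	struct rte_security_session_conf conf = {
		.action_type = RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL,
		.protocol = RTE_SECURITY_PROTOCOL_IPSEC,
		.ipsec = {
			.spi = 0x1000,	/* example SPI */
			.direction = RTE_SECURITY_IPSEC_SA_DIR_EGRESS,
			.proto = RTE_SECURITY_IPSEC_SA_PROTO_ESP,
			.mode = RTE_SECURITY_IPSEC_SA_MODE_TRANSPORT,
			.options = { .copy_dscp = 1 },
		},
		.crypto_xform = crypto_xform,
		.userdata = NULL,
	};
	struct rte_security_session *sess;

	sess = rte_security_session_create(ctx, &conf, sess_mp);
	if (sess == NULL)
		return -1;

	/* 'cop' must have been allocated as a symmetric crypto operation. */
	return rte_security_attach_session(cop, sess);
}

For the inline action types the attach step is not used; the session is instead associated with traffic through rte_security_set_pkt_metadata(), as described above.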
+ */ +const struct rte_security_capability * __rte_experimental +rte_security_capabilities_get(struct rte_security_ctx *instance); + +/** + * Query if a specific capability is available on security instance + * + * @param instance security instance. + * @param idx security capability index to match against + * + * @return + * - Returns pointer to security capability on match of capability + * index criteria. + * - Return NULL if the capability not matched on security instance. + */ +const struct rte_security_capability * __rte_experimental +rte_security_capability_get(struct rte_security_ctx *instance, + struct rte_security_capability_idx *idx); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_SECURITY_H_ */ diff --git a/src/spdk/dpdk/lib/librte_security/rte_security_driver.h b/src/spdk/dpdk/lib/librte_security/rte_security_driver.h new file mode 100644 index 00000000..42f42ffe --- /dev/null +++ b/src/spdk/dpdk/lib/librte_security/rte_security_driver.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP. + * Copyright(c) 2017 Intel Corporation. + */ + +#ifndef _RTE_SECURITY_DRIVER_H_ +#define _RTE_SECURITY_DRIVER_H_ + +/** + * @file rte_security_driver.h + * @b EXPERIMENTAL: this API may change without prior notice + * + * RTE Security Common Definitions + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "rte_security.h" + +/** + * Configure a security session on a device. + * + * @param device Crypto/eth device pointer + * @param conf Security session configuration + * @param sess Pointer to Security private session structure + * @param mp Mempool where the private session is allocated + * + * @return + * - Returns 0 if private session structure have been created successfully. + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if crypto device does not support the crypto transform. + * - Returns -ENOMEM if the private session could not be allocated. + */ +typedef int (*security_session_create_t)(void *device, + struct rte_security_session_conf *conf, + struct rte_security_session *sess, + struct rte_mempool *mp); + +/** + * Free driver private session data. + * + * @param dev Crypto/eth device pointer + * @param sess Security session structure + */ +typedef int (*security_session_destroy_t)(void *device, + struct rte_security_session *sess); + +/** + * Update driver private session data. + * + * @param device Crypto/eth device pointer + * @param sess Pointer to Security private session structure + * @param conf Security session configuration + * + * @return + * - Returns 0 if private session structure have been updated successfully. + * - Returns -EINVAL if input parameters are invalid. + * - Returns -ENOTSUP if crypto device does not support the crypto transform. + */ +typedef int (*security_session_update_t)(void *device, + struct rte_security_session *sess, + struct rte_security_session_conf *conf); + +/** + * Get the size of a security session + * + * @param device Crypto/eth device pointer + * + * @return + * - On success returns the size of the session structure for device + * - On failure returns 0 + */ +typedef unsigned int (*security_session_get_size)(void *device); + +/** + * Get stats from the PMD. + * + * @param device Crypto/eth device pointer + * @param sess Pointer to Security private session structure + * @param stats Security stats of the driver + * + * @return + * - Returns 0 if private session structure have been updated successfully. + * - Returns -EINVAL if session parameters are invalid. 
+ */ +typedef int (*security_session_stats_get_t)(void *device, + struct rte_security_session *sess, + struct rte_security_stats *stats); + +/** + * Update the mbuf with provided metadata. + * + * @param sess Security session structure + * @param mb Packet buffer + * @param mt Metadata + * + * @return + * - Returns 0 if metadata updated successfully. + * - Returns -ve value for errors. + */ +typedef int (*security_set_pkt_metadata_t)(void *device, + struct rte_security_session *sess, struct rte_mbuf *m, + void *params); + +/** + * Get application specific userdata associated with the security session. + * Device specific metadata provided would be used to uniquely identify + * the security session being referred to. + * + * @param device Crypto/eth device pointer + * @param md Metadata + * @param userdata Pointer to receive userdata + * + * @return + * - Returns 0 if userdata is retrieved successfully. + * - Returns -ve value for errors. + */ +typedef int (*security_get_userdata_t)(void *device, + uint64_t md, void **userdata); + +/** + * Get security capabilities of the device. + * + * @param device crypto/eth device pointer + * + * @return + * - Returns rte_security_capability pointer on success. + * - Returns NULL on error. + */ +typedef const struct rte_security_capability *(*security_capabilities_get_t)( + void *device); + +/** Security operations function pointer table */ +struct rte_security_ops { + security_session_create_t session_create; + /**< Configure a security session. */ + security_session_update_t session_update; + /**< Update a security session. */ + security_session_get_size session_get_size; + /**< Return size of security session. */ + security_session_stats_get_t session_stats_get; + /**< Get security session statistics. */ + security_session_destroy_t session_destroy; + /**< Clear a security sessions private data. */ + security_set_pkt_metadata_t set_pkt_metadata; + /**< Update mbuf metadata. */ + security_get_userdata_t get_userdata; + /**< Get userdata associated with session which processed the packet. */ + security_capabilities_get_t capabilities_get; + /**< Get security capabilities. 
*/ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_SECURITY_DRIVER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_security/rte_security_version.map b/src/spdk/dpdk/lib/librte_security/rte_security_version.map new file mode 100644 index 00000000..5a1c8ae3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_security/rte_security_version.map @@ -0,0 +1,16 @@ +EXPERIMENTAL { + global: + + rte_security_attach_session; + rte_security_capabilities_get; + rte_security_capability_get; + rte_security_get_userdata; + rte_security_session_create; + rte_security_session_destroy; + rte_security_session_get_size; + rte_security_session_stats_get; + rte_security_session_update; + rte_security_set_pkt_metadata; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_table/Makefile b/src/spdk/dpdk/lib/librte_table/Makefile new file mode 100644 index 00000000..276d476a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/Makefile @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2016 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_table.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_port +LDLIBS += -lrte_lpm -lrte_hash +ifeq ($(CONFIG_RTE_LIBRTE_ACL),y) +LDLIBS += -lrte_acl +endif + +EXPORT_MAP := rte_table_version.map + +LIBABIVER := 3 + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_lpm.c +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_lpm_ipv6.c +ifeq ($(CONFIG_RTE_LIBRTE_ACL),y) +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_acl.c +endif +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_hash_cuckoo.c +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_hash_key8.c +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_hash_key16.c +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_hash_key32.c +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_hash_ext.c +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_hash_lru.c +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_array.c +SRCS-$(CONFIG_RTE_LIBRTE_TABLE) += rte_table_stub.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table.h +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_lpm.h +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_lpm_ipv6.h +ifeq ($(CONFIG_RTE_LIBRTE_ACL),y) +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_acl.h +endif +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash.h +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_hash_cuckoo.h +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_lru.h +ifeq ($(CONFIG_RTE_ARCH_X86),y) +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_lru_x86.h +endif +ifeq ($(CONFIG_RTE_ARCH_ARM64),y) +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_lru_arm64.h +endif +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_array.h +SYMLINK-$(CONFIG_RTE_LIBRTE_TABLE)-include += rte_table_stub.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_table/meson.build b/src/spdk/dpdk/lib/librte_table/meson.build new file mode 100644 index 00000000..8b2f8413 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/meson.build @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +version = 3 +sources = files('rte_table_acl.c', + 'rte_table_lpm.c', + 'rte_table_lpm_ipv6.c', + 'rte_table_hash_cuckoo.c', + 'rte_table_hash_key8.c', + 'rte_table_hash_key16.c', + 'rte_table_hash_key32.c', + 'rte_table_hash_ext.c', + 'rte_table_hash_lru.c', + 'rte_table_array.c', + 'rte_table_stub.c') 
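On the driver side, the rte_security_ops table declared in rte_security_driver.h above is what a PMD exposes through its security instance. A minimal sketch with hypothetical callback names (not modelled on any real PMD) might look like the following; callbacks the driver does not implement are simply left NULL.

#include <errno.h>

#include <rte_mempool.h>
#include <rte_security_driver.h>

/* Hypothetical PMD callbacks; a real driver would program the device here. */
static int
dummy_session_create(void *device, struct rte_security_session_conf *conf,
	struct rte_security_session *sess, struct rte_mempool *mp)
{
	void *priv;

	(void)device;
	(void)conf;
	if (rte_mempool_get(mp, &priv) != 0)
		return -ENOMEM;
	set_sec_session_private_data(sess, priv);
	return 0;
}

static int
dummy_session_destroy(void *device, struct rte_security_session *sess)
{
	void *priv = get_sec_session_private_data(sess);

	(void)device;
	if (priv != NULL)
		rte_mempool_put(rte_mempool_from_obj(priv), priv);
	return 0;
}

/* Table referenced from the device's struct rte_security_ctx. */
static const struct rte_security_ops dummy_sec_ops = {
	.session_create = dummy_session_create,
	.session_destroy = dummy_session_destroy,
};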
+headers = files('rte_table.h', + 'rte_table_acl.h', + 'rte_table_lpm.h', + 'rte_table_lpm_ipv6.h', + 'rte_table_hash.h', + 'rte_table_hash_cuckoo.h', + 'rte_lru.h', + 'rte_table_array.h', + 'rte_table_stub.h') +deps += ['mbuf', 'port', 'lpm', 'hash', 'acl'] + +if arch_subdir == 'x86' + headers += files('rte_lru_x86.h') +endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_lru.h b/src/spdk/dpdk/lib/librte_table/rte_lru.h new file mode 100644 index 00000000..88229d86 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_lru.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_LRU_H__ +#define __INCLUDE_RTE_LRU_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#ifdef RTE_ARCH_X86_64 +#include "rte_lru_x86.h" +#elif defined(RTE_ARCH_ARM64) +#include "rte_lru_arm64.h" +#else +#undef RTE_TABLE_HASH_LRU_STRATEGY +#define RTE_TABLE_HASH_LRU_STRATEGY 1 +#endif + +#if RTE_TABLE_HASH_LRU_STRATEGY == 0 + +#define lru_init(bucket) \ +do \ + bucket = bucket; \ +while (0) + +#define lru_pos(bucket) (bucket->lru_list & 0xFFFFLLU) + +#define lru_update(bucket, mru_val) \ +do { \ + bucket = bucket; \ + mru_val = mru_val; \ +} while (0) + +#elif RTE_TABLE_HASH_LRU_STRATEGY == 1 + +#define lru_init(bucket) \ +do \ + bucket->lru_list = 0x0000000100020003LLU; \ +while (0) + +#define lru_pos(bucket) (bucket->lru_list & 0xFFFFLLU) + +#define lru_update(bucket, mru_val) \ +do { \ + uint64_t x, pos, x0, x1, x2, mask; \ + \ + x = bucket->lru_list; \ + \ + pos = 4; \ + if ((x >> 48) == ((uint64_t) mru_val)) \ + pos = 3; \ + \ + if (((x >> 32) & 0xFFFFLLU) == ((uint64_t) mru_val)) \ + pos = 2; \ + \ + if (((x >> 16) & 0xFFFFLLU) == ((uint64_t) mru_val)) \ + pos = 1; \ + \ + if ((x & 0xFFFFLLU) == ((uint64_t) mru_val)) \ + pos = 0; \ + \ + \ + pos <<= 4; \ + mask = (~0LLU) << pos; \ + x0 = x & (~mask); \ + x1 = (x >> 16) & mask; \ + x2 = (x << (48 - pos)) & (0xFFFFLLU << 48); \ + x = x0 | x1 | x2; \ + \ + if (pos != 64) \ + bucket->lru_list = x; \ +} while (0) + +#elif (RTE_TABLE_HASH_LRU_STRATEGY == 2) || (RTE_TABLE_HASH_LRU_STRATEGY == 3) + +/** + * These strategies are implemented in architecture specific header files. 
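The strategy-1 macros above pack the positions of a bucket's four keys into one 64-bit word as 16-bit slots, ordered from least recently used (bits 15:0, what lru_pos() returns) to most recently used (bits 63:48); lru_update() moves the slot that was hit to the MRU position and shifts the slots above it down by one. A stand-alone function equivalent, for illustration only and not used by the library:

#include <stdint.h>

/* Scalar equivalent of the strategy-1 lru_update() macro above. */
static inline uint64_t
lru_list_touch(uint64_t lru_list, uint64_t mru_val)
{
	uint64_t slot, shift, mask, below, above, top;

	for (slot = 0; slot < 4; slot++)
		if (((lru_list >> (slot * 16)) & 0xFFFFLLU) == mru_val)
			break;

	if (slot == 4)	/* not present; real buckets never hit this case */
		return lru_list;

	shift = slot * 16;
	mask = (~0LLU) << shift;
	below = lru_list & ~mask;		/* slots below the hit, unchanged */
	above = (lru_list >> 16) & mask;	/* slots above the hit, shifted down */
	top = mru_val << 48;			/* the hit becomes the new MRU */

	return below | above | top;
}

Strategies 2 and 3 implement the same reordering with SSE4.2 and NEON instructions in the architecture-specific headers.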
+ */ + +#else + +#error "Incorrect value for RTE_TABLE_HASH_LRU_STRATEGY" + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_lru_arm64.h b/src/spdk/dpdk/lib/librte_table/rte_lru_arm64.h new file mode 100644 index 00000000..b45e9d03 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_lru_arm64.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + */ + +#ifndef __RTE_LRU_ARM64_H__ +#define __RTE_LRU_ARM64_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#ifndef RTE_TABLE_HASH_LRU_STRATEGY +#ifdef RTE_MACHINE_CPUFLAG_NEON +#define RTE_TABLE_HASH_LRU_STRATEGY 3 +#else /* if no NEON, use simple scalar version */ +#define RTE_TABLE_HASH_LRU_STRATEGY 1 +#endif +#endif + +#if RTE_TABLE_HASH_LRU_STRATEGY == 3 + +#define lru_init(bucket) \ + { bucket->lru_list = ~0LLU; } + +static inline int +f_lru_pos(uint64_t lru_list) +{ + /* Compare the vector to zero vector */ + uint16x4_t lru_vec = vld1_u16((uint16_t *)&lru_list); + uint16x4_t min_vec = vmov_n_u16(vminv_u16(lru_vec)); + uint64_t mask = vget_lane_u64(vreinterpret_u64_u16( + vceq_u16(min_vec, lru_vec)), 0); + return __builtin_clzl(mask) >> 4; +} +#define lru_pos(bucket) f_lru_pos(bucket->lru_list) + +#define lru_update(bucket, mru_val) \ +do { \ + const uint64_t orvals[] = {0xFFFFLLU, 0xFFFFLLU << 16, \ + 0xFFFFLLU << 32, 0xFFFFLLU << 48, 0LLU}; \ + const uint64_t decs[] = {0x1000100010001LLU, 0}; \ + uint64x1_t lru = vdup_n_u64(bucket->lru_list); \ + uint64x1_t vdec = vdup_n_u64(decs[mru_val>>2]); \ + bucket->lru_list = vget_lane_u64(vreinterpret_u64_u16( \ + vsub_u16(vreinterpret_u16_u64(lru), \ + vreinterpret_u16_u64(vdec))), \ + 0); \ + bucket->lru_list |= orvals[mru_val]; \ +} while (0) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_lru_x86.h b/src/spdk/dpdk/lib/librte_table/rte_lru_x86.h new file mode 100644 index 00000000..7a67ee8b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_lru_x86.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_LRU_X86_H__ +#define __INCLUDE_RTE_LRU_X86_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include + +#ifndef RTE_TABLE_HASH_LRU_STRATEGY +#define RTE_TABLE_HASH_LRU_STRATEGY 2 +#endif + +#if RTE_TABLE_HASH_LRU_STRATEGY == 2 + +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306) +#include +#else +#include +#include +#include +#endif + +#define lru_init(bucket) \ + { bucket->lru_list = 0x0000000100020003LLU; } + +#define lru_pos(bucket) (bucket->lru_list & 0xFFFFLLU) + +#define lru_update(bucket, mru_val) \ +do { \ + /* set up the masks for all possible shuffles, depends on pos */\ + static uint64_t masks[10] = { \ + /* Shuffle order; Make Zero (see _mm_shuffle_epi8 manual) */\ + 0x0100070605040302, 0x8080808080808080, \ + 0x0302070605040100, 0x8080808080808080, \ + 0x0504070603020100, 0x8080808080808080, \ + 0x0706050403020100, 0x8080808080808080, \ + 0x0706050403020100, 0x8080808080808080}; \ + /* load up one register with repeats of mru-val */ \ + uint64_t mru2 = mru_val; \ + uint64_t mru3 = mru2 | (mru2 << 16); \ + uint64_t lru = bucket->lru_list; \ + /* XOR to cause the word we're looking for to go to zero */ \ + uint64_t mru = lru ^ ((mru3 << 32) | mru3); \ + __m128i c = _mm_cvtsi64_si128(mru); \ + __m128i b = _mm_cvtsi64_si128(lru); \ + /* Find the minimum value (first zero word, if it's in there) */\ + __m128i d = 
_mm_minpos_epu16(c); \ + /* Second word is the index to found word (first word is the value) */\ + unsigned int pos = _mm_extract_epi16(d, 1); \ + /* move the recently used location to top of list */ \ + __m128i k = _mm_shuffle_epi8(b, *((__m128i *) &masks[2 * pos]));\ + /* Finally, update the original list with the reordered data */ \ + bucket->lru_list = _mm_extract_epi64(k, 0); \ + /* Phwew! */ \ +} while (0) + +#elif RTE_TABLE_HASH_LRU_STRATEGY == 3 + +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306) +#include +#else +#include +#include +#include +#endif + +#define lru_init(bucket) \ + { bucket->lru_list = ~0LLU; } + +static inline int +f_lru_pos(uint64_t lru_list) +{ + __m128i lst = _mm_set_epi64x((uint64_t)-1, lru_list); + __m128i min = _mm_minpos_epu16(lst); + return _mm_extract_epi16(min, 1); +} +#define lru_pos(bucket) f_lru_pos(bucket->lru_list) + +#define lru_update(bucket, mru_val) \ +do { \ + const uint64_t orvals[] = {0xFFFFLLU, 0xFFFFLLU << 16, \ + 0xFFFFLLU << 32, 0xFFFFLLU << 48, 0LLU}; \ + const uint64_t decs[] = {0x1000100010001LLU, 0}; \ + __m128i lru = _mm_cvtsi64_si128(bucket->lru_list); \ + __m128i vdec = _mm_cvtsi64_si128(decs[mru_val>>2]); \ + lru = _mm_subs_epu16(lru, vdec); \ + bucket->lru_list = _mm_extract_epi64(lru, 0) | orvals[mru_val]; \ +} while (0) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table.h b/src/spdk/dpdk/lib/librte_table/rte_table.h new file mode 100644 index 00000000..cccded1a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table.h @@ -0,0 +1,272 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_H__ +#define __INCLUDE_RTE_TABLE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Table + * + * This tool is part of the DPDK Packet Framework tool suite and provides + * a standard interface to implement different types of lookup tables for data + * plane processing. + * + * Virtually any search algorithm that can uniquely associate data to a lookup + * key can be fitted under this lookup table abstraction. For the flow table + * use-case, the lookup key is an n-tuple of packet fields that uniquely + * identifies a traffic flow, while data represents actions and action + * meta-data associated with the same traffic flow. + * + ***/ + +#include +#include + +struct rte_mbuf; + +/** Lookup table statistics */ +struct rte_table_stats { + uint64_t n_pkts_in; + uint64_t n_pkts_lookup_miss; +}; + +/** + * Lookup table create + * + * @param params + * Parameters for lookup table creation. The underlying data structure is + * different for each lookup table type. + * @param socket_id + * CPU socket ID (e.g. for memory allocation purpose) + * @param entry_size + * Data size of each lookup table entry (measured in bytes) + * @return + * Handle to lookup table instance + */ +typedef void* (*rte_table_op_create)(void *params, int socket_id, + uint32_t entry_size); + +/** + * Lookup table free + * + * @param table + * Handle to lookup table instance + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_table_op_free)(void *table); + +/** + * Lookup table entry add + * + * @param table + * Handle to lookup table instance + * @param key + * Lookup key + * @param entry + * Data to be associated with the current key. This parameter has to point to + * a valid memory buffer where the first entry_size bytes (table create + * parameter) are populated with the data. 
+ * @param key_found + * After successful invocation, *key_found is set to a value different than 0 + * if the current key is already present in the table and to 0 if not. This + * pointer has to be set to a valid memory location before the table entry add + * function is called. + * @param entry_ptr + * After successful invocation, *entry_ptr stores the handle to the table + * entry containing the data associated with the current key. This handle can + * be used to perform further read-write accesses to this entry. This handle + * is valid until the key is deleted from the table or the same key is + * re-added to the table, typically to associate it with different data. This + * pointer has to be set to a valid memory location before the function is + * called. + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_table_op_entry_add)( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr); + +/** + * Lookup table entry delete + * + * @param table + * Handle to lookup table instance + * @param key + * Lookup key + * @param key_found + * After successful invocation, *key_found is set to a value different than 0 + * if the current key was present in the table before the delete operation + * was performed and to 0 if not. This pointer has to be set to a valid + * memory location before the table entry delete function is called. + * @param entry + * After successful invocation, if the key is found in the table (*key found + * is different than 0 after function call is completed) and entry points to + * a valid buffer (entry is set to a value different than NULL before the + * function is called), then the first entry_size bytes (table create + * parameter) in *entry store a copy of table entry that contained the data + * associated with the current key before the key was deleted. + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_table_op_entry_delete)( + void *table, + void *key, + int *key_found, + void *entry); + +/** + * Lookup table entry add bulk + * + * @param table + * Handle to lookup table instance + * @param key + * Array containing lookup keys + * @param entries + * Array containing data to be associated with each key. Every item in the + * array has to point to a valid memory buffer where the first entry_size + * bytes (table create parameter) are populated with the data. + * @param n_keys + * Number of keys to add + * @param key_found + * After successful invocation, key_found for every item in the array is set + * to a value different than 0 if the current key is already present in the + * table and to 0 if not. This pointer has to be set to a valid memory + * location before the table entry add function is called. + * @param entries_ptr + * After successful invocation, array *entries_ptr stores the handle to the + * table entry containing the data associated with every key. This handle can + * be used to perform further read-write accesses to this entry. This handle + * is valid until the key is deleted from the table or the same key is + * re-added to the table, typically to associate it with different data. This + * pointer has to be set to a valid memory location before the function is + * called. 
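A sketch of the single-entry add flow documented above, using the array table (rte_table_array_ops, added later in this patch) as a concrete backend. The entry layout, array size and metadata offset are illustrative assumptions, and EAL is assumed to be initialized.

#include <stdint.h>
#include <rte_table_array.h>

/* Hypothetical per-entry data; its size is passed as entry_size at create. */
struct my_entry {
	uint32_t out_port;
};

static void *
array_table_setup(int socket_id)
{
	struct rte_table_array_params p = {
		.n_entries = 256,	/* must be a power of two */
		.offset = 128,		/* assumed metadata offset of the index */
	};
	struct rte_table_array_key key = { .pos = 7 };
	struct my_entry ent = { .out_port = 1 };
	void *table, *entry_ptr;
	int key_found;

	table = rte_table_array_ops.f_create(&p, socket_id,
		sizeof(struct my_entry));
	if (table == NULL)
		return NULL;

	if (rte_table_array_ops.f_add(table, &key, &ent,
			&key_found, &entry_ptr) != 0) {
		rte_table_array_ops.f_free(table);
		return NULL;
	}

	return table;
}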
+ * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_table_op_entry_add_bulk)( + void *table, + void **keys, + void **entries, + uint32_t n_keys, + int *key_found, + void **entries_ptr); + +/** + * Lookup table entry delete bulk + * + * @param table + * Handle to lookup table instance + * @param key + * Array containing lookup keys + * @param n_keys + * Number of keys to delete + * @param key_found + * After successful invocation, key_found for every item in the array is set + * to a value different than 0if the current key was present in the table + * before the delete operation was performed and to 0 if not. This pointer + * has to be set to a valid memory location before the table entry delete + * function is called. + * @param entries + * If entries pointer is NULL, this pointer is ignored for every entry found. + * Else, after successful invocation, if specific key is found in the table + * (key_found is different than 0 for this item after function call is + * completed) and item of entry array points to a valid buffer (entry is set + * to a value different than NULL before the function is called), then the + * first entry_size bytes (table create parameter) in *entry store a copy of + * table entry that contained the data associated with the current key before + * the key was deleted. + * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_table_op_entry_delete_bulk)( + void *table, + void **keys, + uint32_t n_keys, + int *key_found, + void **entries); + +/** + * Lookup table lookup + * + * @param table + * Handle to lookup table instance + * @param pkts + * Burst of input packets specified as array of up to 64 pointers to struct + * rte_mbuf + * @param pkts_mask + * 64-bit bitmask specifying which packets in the input burst are valid. When + * pkts_mask bit n is set, then element n of pkts array is pointing to a + * valid packet. Otherwise, element n of pkts array does not point to a valid + * packet, therefore it will not be accessed. + * @param lookup_hit_mask + * Once the table lookup operation is completed, this 64-bit bitmask + * specifies which of the valid packets in the input burst resulted in lookup + * hit. For each valid input packet (pkts_mask bit n is set), the following + * are true on lookup hit: lookup_hit_mask bit n is set, element n of entries + * array is valid and it points to the lookup table entry that was hit. For + * each valid input packet (pkts_mask bit n is set), the following are true + * on lookup miss: lookup_hit_mask bit n is not set and element n of entries + * array is not valid. + * @param entries + * Once the table lookup operation is completed, this array provides the + * lookup table entries that were hit, as described above. It is required + * that this array is always pre-allocated by the caller of this function + * with exactly 64 elements. The implementation is allowed to speculatively + * modify the elements of this array, so elements marked as invalid in + * lookup_hit_mask once the table lookup operation is completed might have + * been modified by this function. 
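The 64-bit pkts_mask/lookup_hit_mask convention described above is common to all table types; after f_lookup() returns, callers typically walk the set bits with count-trailing-zeros. A small illustrative helper (the per-flow action type is a placeholder, not from the patch):

#include <stdint.h>
#include <rte_mbuf.h>

/* Hypothetical per-flow action stored as the table entry payload. */
struct my_action {
	uint32_t out_port;
};

/* Process the result of an f_lookup() call on a burst of packets. */
static void
handle_lookup_result(struct rte_mbuf **pkts, uint64_t pkts_mask,
	uint64_t lookup_hit_mask, void **entries)
{
	uint64_t miss_mask = pkts_mask & ~lookup_hit_mask;

	while (lookup_hit_mask) {
		uint32_t i = __builtin_ctzll(lookup_hit_mask);
		struct my_action *a = entries[i];

		/* e.g. steer pkts[i] to a->out_port ... */
		(void)pkts;
		(void)a;
		lookup_hit_mask &= ~(1LLU << i);
	}

	while (miss_mask) {
		uint32_t i = __builtin_ctzll(miss_mask);

		rte_pktmbuf_free(pkts[i]);	/* drop packets that missed */
		miss_mask &= ~(1LLU << i);
	}
}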
+ * @return + * 0 on success, error code otherwise + */ +typedef int (*rte_table_op_lookup)( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries); + +/** + * Lookup table stats read + * + * @param table + * Handle to lookup table instance + * @param stats + * Handle to table stats struct to copy data + * @param clear + * Flag indicating that stats should be cleared after read + * + * @return + * Error code or 0 on success. + */ +typedef int (*rte_table_op_stats_read)( + void *table, + struct rte_table_stats *stats, + int clear); + +/** Lookup table interface defining the lookup table operation */ +struct rte_table_ops { + rte_table_op_create f_create; /**< Create */ + rte_table_op_free f_free; /**< Free */ + rte_table_op_entry_add f_add; /**< Entry add */ + rte_table_op_entry_delete f_delete; /**< Entry delete */ + rte_table_op_entry_add_bulk f_add_bulk; /**< Add entry bulk */ + rte_table_op_entry_delete_bulk f_delete_bulk; /**< Delete entry bulk */ + rte_table_op_lookup f_lookup; /**< Lookup */ + rte_table_op_stats_read f_stats; /**< Stats */ +}; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_acl.c b/src/spdk/dpdk/lib/librte_table/rte_table_acl.c new file mode 100644 index 00000000..14d54019 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_acl.c @@ -0,0 +1,798 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_table_acl.h" +#include + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_ACL_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_ACL_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_ACL_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_ACL_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct rte_table_acl { + struct rte_table_stats stats; + + /* Low-level ACL table */ + char name[2][RTE_ACL_NAMESIZE]; + struct rte_acl_param acl_params; /* for creating low level acl table */ + struct rte_acl_config cfg; /* Holds the field definitions (metadata) */ + struct rte_acl_ctx *ctx; + uint32_t name_id; + + /* Input parameters */ + uint32_t n_rules; + uint32_t entry_size; + + /* Internal tables */ + uint8_t *action_table; + struct rte_acl_rule **acl_rule_list; /* Array of pointers to rules */ + uint8_t *acl_rule_memory; /* Memory to store the rules */ + + /* Memory to store the action table and stack of free entries */ + uint8_t memory[0] __rte_cache_aligned; +}; + + +static void * +rte_table_acl_create( + void *params, + int socket_id, + uint32_t entry_size) +{ + struct rte_table_acl_params *p = params; + struct rte_table_acl *acl; + uint32_t action_table_size, acl_rule_list_size, acl_rule_memory_size; + uint32_t total_size; + + RTE_BUILD_BUG_ON(((sizeof(struct rte_table_acl) % RTE_CACHE_LINE_SIZE) + != 0)); + + /* Check input parameters */ + if (p == NULL) { + RTE_LOG(ERR, TABLE, "%s: Invalid value for params\n", __func__); + return NULL; + } + if (p->name == NULL) { + RTE_LOG(ERR, TABLE, "%s: Invalid value for name\n", __func__); + return NULL; + } + if (p->n_rules == 0) { + RTE_LOG(ERR, TABLE, "%s: Invalid value for n_rules\n", + __func__); + return NULL; + } + if ((p->n_rule_fields == 0) || + (p->n_rule_fields > RTE_ACL_MAX_FIELDS)) { + RTE_LOG(ERR, TABLE, "%s: Invalid value for n_rule_fields\n", + __func__); + return NULL; + } + + entry_size = 
RTE_ALIGN(entry_size, sizeof(uint64_t)); + + /* Memory allocation */ + action_table_size = RTE_CACHE_LINE_ROUNDUP(p->n_rules * entry_size); + acl_rule_list_size = + RTE_CACHE_LINE_ROUNDUP(p->n_rules * sizeof(struct rte_acl_rule *)); + acl_rule_memory_size = RTE_CACHE_LINE_ROUNDUP(p->n_rules * + RTE_ACL_RULE_SZ(p->n_rule_fields)); + total_size = sizeof(struct rte_table_acl) + action_table_size + + acl_rule_list_size + acl_rule_memory_size; + + acl = rte_zmalloc_socket("TABLE", total_size, RTE_CACHE_LINE_SIZE, + socket_id); + if (acl == NULL) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %u bytes for ACL table\n", + __func__, total_size); + return NULL; + } + + acl->action_table = &acl->memory[0]; + acl->acl_rule_list = + (struct rte_acl_rule **) &acl->memory[action_table_size]; + acl->acl_rule_memory = (uint8_t *) + &acl->memory[action_table_size + acl_rule_list_size]; + + /* Initialization of internal fields */ + snprintf(acl->name[0], RTE_ACL_NAMESIZE, "%s_a", p->name); + snprintf(acl->name[1], RTE_ACL_NAMESIZE, "%s_b", p->name); + acl->name_id = 1; + + acl->acl_params.name = acl->name[acl->name_id]; + acl->acl_params.socket_id = socket_id; + acl->acl_params.rule_size = RTE_ACL_RULE_SZ(p->n_rule_fields); + acl->acl_params.max_rule_num = p->n_rules; + + acl->cfg.num_categories = 1; + acl->cfg.num_fields = p->n_rule_fields; + memcpy(&acl->cfg.defs[0], &p->field_format[0], + p->n_rule_fields * sizeof(struct rte_acl_field_def)); + + acl->ctx = NULL; + + acl->n_rules = p->n_rules; + acl->entry_size = entry_size; + + return acl; +} + +static int +rte_table_acl_free(void *table) +{ + struct rte_table_acl *acl = table; + + /* Check input parameters */ + if (table == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + /* Free previously allocated resources */ + if (acl->ctx != NULL) + rte_acl_free(acl->ctx); + + rte_free(acl); + + return 0; +} + +RTE_ACL_RULE_DEF(rte_pipeline_acl_rule, RTE_ACL_MAX_FIELDS); + +static int +rte_table_acl_build(struct rte_table_acl *acl, struct rte_acl_ctx **acl_ctx) +{ + struct rte_acl_ctx *ctx = NULL; + uint32_t n_rules, i; + int status; + + /* Create low level ACL table */ + ctx = rte_acl_create(&acl->acl_params); + if (ctx == NULL) { + RTE_LOG(ERR, TABLE, "%s: Cannot create low level ACL table\n", + __func__); + return -1; + } + + /* Add rules to low level ACL table */ + n_rules = 0; + for (i = 1; i < acl->n_rules; i++) { + if (acl->acl_rule_list[i] != NULL) { + status = rte_acl_add_rules(ctx, acl->acl_rule_list[i], + 1); + if (status != 0) { + RTE_LOG(ERR, TABLE, + "%s: Cannot add rule to low level ACL table\n", + __func__); + rte_acl_free(ctx); + return -1; + } + + n_rules++; + } + } + + if (n_rules == 0) { + rte_acl_free(ctx); + *acl_ctx = NULL; + return 0; + } + + /* Build low level ACl table */ + status = rte_acl_build(ctx, &acl->cfg); + if (status != 0) { + RTE_LOG(ERR, TABLE, + "%s: Cannot build the low level ACL table\n", + __func__); + rte_acl_free(ctx); + return -1; + } + + *acl_ctx = ctx; + return 0; +} + +static int +rte_table_acl_entry_add( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_acl *acl = table; + struct rte_table_acl_rule_add_params *rule = + key; + struct rte_pipeline_acl_rule acl_rule; + struct rte_acl_rule *rule_location; + struct rte_acl_ctx *ctx; + uint32_t free_pos, free_pos_valid, i; + int status; + + /* Check input parameters */ + if (table == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return 
-EINVAL; + } + if (key == NULL) { + RTE_LOG(ERR, TABLE, "%s: key parameter is NULL\n", __func__); + return -EINVAL; + } + if (entry == NULL) { + RTE_LOG(ERR, TABLE, "%s: entry parameter is NULL\n", __func__); + return -EINVAL; + } + if (key_found == NULL) { + RTE_LOG(ERR, TABLE, "%s: key_found parameter is NULL\n", + __func__); + return -EINVAL; + } + if (entry_ptr == NULL) { + RTE_LOG(ERR, TABLE, "%s: entry_ptr parameter is NULL\n", + __func__); + return -EINVAL; + } + if (rule->priority > RTE_ACL_MAX_PRIORITY) { + RTE_LOG(ERR, TABLE, "%s: Priority is too high\n", __func__); + return -EINVAL; + } + + /* Setup rule data structure */ + memset(&acl_rule, 0, sizeof(acl_rule)); + acl_rule.data.category_mask = 1; + acl_rule.data.priority = RTE_ACL_MAX_PRIORITY - rule->priority; + acl_rule.data.userdata = 0; /* To be set up later */ + memcpy(&acl_rule.field[0], + &rule->field_value[0], + acl->cfg.num_fields * sizeof(struct rte_acl_field)); + + /* Look to see if the rule exists already in the table */ + free_pos = 0; + free_pos_valid = 0; + for (i = 1; i < acl->n_rules; i++) { + if (acl->acl_rule_list[i] == NULL) { + if (free_pos_valid == 0) { + free_pos = i; + free_pos_valid = 1; + } + + continue; + } + + /* Compare the key fields */ + status = memcmp(&acl->acl_rule_list[i]->field[0], + &rule->field_value[0], + acl->cfg.num_fields * sizeof(struct rte_acl_field)); + + /* Rule found: update data associated with the rule */ + if (status == 0) { + *key_found = 1; + *entry_ptr = &acl->memory[i * acl->entry_size]; + memcpy(*entry_ptr, entry, acl->entry_size); + + return 0; + } + } + + /* Return if max rules */ + if (free_pos_valid == 0) { + RTE_LOG(ERR, TABLE, "%s: Max number of rules reached\n", + __func__); + return -ENOSPC; + } + + /* Add the new rule to the rule set */ + acl_rule.data.userdata = free_pos; + rule_location = (struct rte_acl_rule *) + &acl->acl_rule_memory[free_pos * acl->acl_params.rule_size]; + memcpy(rule_location, &acl_rule, acl->acl_params.rule_size); + acl->acl_rule_list[free_pos] = rule_location; + + /* Build low level ACL table */ + acl->name_id ^= 1; + acl->acl_params.name = acl->name[acl->name_id]; + status = rte_table_acl_build(acl, &ctx); + if (status != 0) { + /* Roll back changes */ + acl->acl_rule_list[free_pos] = NULL; + acl->name_id ^= 1; + + return -EINVAL; + } + + /* Commit changes */ + if (acl->ctx != NULL) + rte_acl_free(acl->ctx); + acl->ctx = ctx; + *key_found = 0; + *entry_ptr = &acl->memory[free_pos * acl->entry_size]; + memcpy(*entry_ptr, entry, acl->entry_size); + + return 0; +} + +static int +rte_table_acl_entry_delete( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_acl *acl = table; + struct rte_table_acl_rule_delete_params *rule = + key; + struct rte_acl_rule *deleted_rule = NULL; + struct rte_acl_ctx *ctx; + uint32_t pos, pos_valid, i; + int status; + + /* Check input parameters */ + if (table == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + if (key == NULL) { + RTE_LOG(ERR, TABLE, "%s: key parameter is NULL\n", __func__); + return -EINVAL; + } + if (key_found == NULL) { + RTE_LOG(ERR, TABLE, "%s: key_found parameter is NULL\n", + __func__); + return -EINVAL; + } + + /* Look for the rule in the table */ + pos = 0; + pos_valid = 0; + for (i = 1; i < acl->n_rules; i++) { + if (acl->acl_rule_list[i] != NULL) { + /* Compare the key fields */ + status = memcmp(&acl->acl_rule_list[i]->field[0], + &rule->field_value[0], acl->cfg.num_fields * + sizeof(struct rte_acl_field)); 
+ + /* Rule found: remove from table */ + if (status == 0) { + pos = i; + pos_valid = 1; + + deleted_rule = acl->acl_rule_list[i]; + acl->acl_rule_list[i] = NULL; + } + } + } + + /* Return if rule not found */ + if (pos_valid == 0) { + *key_found = 0; + return 0; + } + + /* Build low level ACL table */ + acl->name_id ^= 1; + acl->acl_params.name = acl->name[acl->name_id]; + status = rte_table_acl_build(acl, &ctx); + if (status != 0) { + /* Roll back changes */ + acl->acl_rule_list[pos] = deleted_rule; + acl->name_id ^= 1; + + return -EINVAL; + } + + /* Commit changes */ + if (acl->ctx != NULL) + rte_acl_free(acl->ctx); + + acl->ctx = ctx; + *key_found = 1; + if (entry != NULL) + memcpy(entry, &acl->memory[pos * acl->entry_size], + acl->entry_size); + + return 0; +} + +static int +rte_table_acl_entry_add_bulk( + void *table, + void **keys, + void **entries, + uint32_t n_keys, + int *key_found, + void **entries_ptr) +{ + struct rte_table_acl *acl = table; + struct rte_acl_ctx *ctx; + uint32_t rule_pos[n_keys]; + uint32_t i; + int err = 0, build = 0; + int status; + + /* Check input parameters */ + if (table == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + if (keys == NULL) { + RTE_LOG(ERR, TABLE, "%s: keys parameter is NULL\n", __func__); + return -EINVAL; + } + if (entries == NULL) { + RTE_LOG(ERR, TABLE, "%s: entries parameter is NULL\n", __func__); + return -EINVAL; + } + if (n_keys == 0) { + RTE_LOG(ERR, TABLE, "%s: 0 rules to add\n", __func__); + return -EINVAL; + } + if (key_found == NULL) { + RTE_LOG(ERR, TABLE, "%s: key_found parameter is NULL\n", + __func__); + return -EINVAL; + } + if (entries_ptr == NULL) { + RTE_LOG(ERR, TABLE, "%s: entries_ptr parameter is NULL\n", + __func__); + return -EINVAL; + } + + /* Check input parameters in arrays */ + for (i = 0; i < n_keys; i++) { + struct rte_table_acl_rule_add_params *rule; + + if (keys[i] == NULL) { + RTE_LOG(ERR, TABLE, "%s: keys[%" PRIu32 "] parameter is NULL\n", + __func__, i); + return -EINVAL; + } + + if (entries[i] == NULL) { + RTE_LOG(ERR, TABLE, "%s: entries[%" PRIu32 "] parameter is NULL\n", + __func__, i); + return -EINVAL; + } + + rule = keys[i]; + if (rule->priority > RTE_ACL_MAX_PRIORITY) { + RTE_LOG(ERR, TABLE, "%s: Priority is too high\n", __func__); + return -EINVAL; + } + } + + memset(rule_pos, 0, n_keys * sizeof(uint32_t)); + memset(key_found, 0, n_keys * sizeof(int)); + for (i = 0; i < n_keys; i++) { + struct rte_table_acl_rule_add_params *rule = + keys[i]; + struct rte_pipeline_acl_rule acl_rule; + struct rte_acl_rule *rule_location; + uint32_t free_pos, free_pos_valid, j; + + /* Setup rule data structure */ + memset(&acl_rule, 0, sizeof(acl_rule)); + acl_rule.data.category_mask = 1; + acl_rule.data.priority = RTE_ACL_MAX_PRIORITY - rule->priority; + acl_rule.data.userdata = 0; /* To be set up later */ + memcpy(&acl_rule.field[0], + &rule->field_value[0], + acl->cfg.num_fields * sizeof(struct rte_acl_field)); + + /* Look to see if the rule exists already in the table */ + free_pos = 0; + free_pos_valid = 0; + for (j = 1; j < acl->n_rules; j++) { + if (acl->acl_rule_list[j] == NULL) { + if (free_pos_valid == 0) { + free_pos = j; + free_pos_valid = 1; + } + + continue; + } + + /* Compare the key fields */ + status = memcmp(&acl->acl_rule_list[j]->field[0], + &rule->field_value[0], + acl->cfg.num_fields * sizeof(struct rte_acl_field)); + + /* Rule found: update data associated with the rule */ + if (status == 0) { + key_found[i] = 1; + entries_ptr[i] = 
&acl->memory[j * acl->entry_size]; + memcpy(entries_ptr[i], entries[i], acl->entry_size); + + break; + } + } + + /* Key already in the table */ + if (key_found[i] != 0) + continue; + + /* Maximum number of rules reached */ + if (free_pos_valid == 0) { + err = 1; + break; + } + + /* Add the new rule to the rule set */ + acl_rule.data.userdata = free_pos; + rule_location = (struct rte_acl_rule *) + &acl->acl_rule_memory[free_pos * acl->acl_params.rule_size]; + memcpy(rule_location, &acl_rule, acl->acl_params.rule_size); + acl->acl_rule_list[free_pos] = rule_location; + rule_pos[i] = free_pos; + build = 1; + } + + if (err != 0) { + for (i = 0; i < n_keys; i++) { + if (rule_pos[i] == 0) + continue; + + acl->acl_rule_list[rule_pos[i]] = NULL; + } + + return -ENOSPC; + } + + if (build == 0) + return 0; + + /* Build low level ACL table */ + acl->name_id ^= 1; + acl->acl_params.name = acl->name[acl->name_id]; + status = rte_table_acl_build(acl, &ctx); + if (status != 0) { + /* Roll back changes */ + for (i = 0; i < n_keys; i++) { + if (rule_pos[i] == 0) + continue; + + acl->acl_rule_list[rule_pos[i]] = NULL; + } + acl->name_id ^= 1; + + return -EINVAL; + } + + /* Commit changes */ + if (acl->ctx != NULL) + rte_acl_free(acl->ctx); + acl->ctx = ctx; + + for (i = 0; i < n_keys; i++) { + if (rule_pos[i] == 0) + continue; + + key_found[i] = 0; + entries_ptr[i] = &acl->memory[rule_pos[i] * acl->entry_size]; + memcpy(entries_ptr[i], entries[i], acl->entry_size); + } + + return 0; +} + +static int +rte_table_acl_entry_delete_bulk( + void *table, + void **keys, + uint32_t n_keys, + int *key_found, + void **entries) +{ + struct rte_table_acl *acl = table; + struct rte_acl_rule *deleted_rules[n_keys]; + uint32_t rule_pos[n_keys]; + struct rte_acl_ctx *ctx; + uint32_t i; + int status; + int build = 0; + + /* Check input parameters */ + if (table == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + if (keys == NULL) { + RTE_LOG(ERR, TABLE, "%s: key parameter is NULL\n", __func__); + return -EINVAL; + } + if (n_keys == 0) { + RTE_LOG(ERR, TABLE, "%s: 0 rules to delete\n", __func__); + return -EINVAL; + } + if (key_found == NULL) { + RTE_LOG(ERR, TABLE, "%s: key_found parameter is NULL\n", + __func__); + return -EINVAL; + } + + for (i = 0; i < n_keys; i++) { + if (keys[i] == NULL) { + RTE_LOG(ERR, TABLE, "%s: keys[%" PRIu32 "] parameter is NULL\n", + __func__, i); + return -EINVAL; + } + } + + memset(deleted_rules, 0, n_keys * sizeof(struct rte_acl_rule *)); + memset(rule_pos, 0, n_keys * sizeof(uint32_t)); + for (i = 0; i < n_keys; i++) { + struct rte_table_acl_rule_delete_params *rule = + keys[i]; + uint32_t pos_valid, j; + + /* Look for the rule in the table */ + pos_valid = 0; + for (j = 1; j < acl->n_rules; j++) { + if (acl->acl_rule_list[j] == NULL) + continue; + + /* Compare the key fields */ + status = memcmp(&acl->acl_rule_list[j]->field[0], + &rule->field_value[0], + acl->cfg.num_fields * sizeof(struct rte_acl_field)); + + /* Rule found: remove from table */ + if (status == 0) { + pos_valid = 1; + + deleted_rules[i] = acl->acl_rule_list[j]; + acl->acl_rule_list[j] = NULL; + rule_pos[i] = j; + + build = 1; + } + } + + if (pos_valid == 0) { + key_found[i] = 0; + continue; + } + } + + /* Return if no changes to acl table */ + if (build == 0) { + return 0; + } + + /* Build low level ACL table */ + acl->name_id ^= 1; + acl->acl_params.name = acl->name[acl->name_id]; + status = rte_table_acl_build(acl, &ctx); + if (status != 0) { + /* Roll back changes */ + for 
(i = 0; i < n_keys; i++) { + if (rule_pos[i] == 0) + continue; + + acl->acl_rule_list[rule_pos[i]] = deleted_rules[i]; + } + + acl->name_id ^= 1; + + return -EINVAL; + } + + /* Commit changes */ + if (acl->ctx != NULL) + rte_acl_free(acl->ctx); + + acl->ctx = ctx; + for (i = 0; i < n_keys; i++) { + if (rule_pos[i] == 0) + continue; + + key_found[i] = 1; + if (entries != NULL && entries[i] != NULL) + memcpy(entries[i], &acl->memory[rule_pos[i] * acl->entry_size], + acl->entry_size); + } + + return 0; +} + +static int +rte_table_acl_lookup( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_acl *acl = (struct rte_table_acl *) table; + const uint8_t *pkts_data[RTE_PORT_IN_BURST_SIZE_MAX]; + uint32_t results[RTE_PORT_IN_BURST_SIZE_MAX]; + uint64_t pkts_out_mask; + uint32_t n_pkts, i, j; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_ACL_STATS_PKTS_IN_ADD(acl, n_pkts_in); + + /* Input conversion */ + for (i = 0, j = 0; i < (uint32_t)(RTE_PORT_IN_BURST_SIZE_MAX - + __builtin_clzll(pkts_mask)); i++) { + uint64_t pkt_mask = 1LLU << i; + + if (pkt_mask & pkts_mask) { + pkts_data[j] = rte_pktmbuf_mtod(pkts[i], uint8_t *); + j++; + } + } + n_pkts = j; + + /* Low-level ACL table lookup */ + if (acl->ctx != NULL) + rte_acl_classify(acl->ctx, pkts_data, results, n_pkts, 1); + else + n_pkts = 0; + + /* Output conversion */ + pkts_out_mask = 0; + for (i = 0; i < n_pkts; i++) { + uint32_t action_table_pos = results[i]; + uint32_t pkt_pos = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_pos; + + pkts_mask &= ~pkt_mask; + + if (action_table_pos != 0) { + pkts_out_mask |= pkt_mask; + entries[pkt_pos] = (void *) + &acl->memory[action_table_pos * + acl->entry_size]; + rte_prefetch0(entries[pkt_pos]); + } + } + + *lookup_hit_mask = pkts_out_mask; + RTE_TABLE_ACL_STATS_PKTS_LOOKUP_MISS(acl, n_pkts_in - __builtin_popcountll(pkts_out_mask)); + + return 0; +} + +static int +rte_table_acl_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_acl *acl = table; + + if (stats != NULL) + memcpy(stats, &acl->stats, sizeof(acl->stats)); + + if (clear) + memset(&acl->stats, 0, sizeof(acl->stats)); + + return 0; +} + +struct rte_table_ops rte_table_acl_ops = { + .f_create = rte_table_acl_create, + .f_free = rte_table_acl_free, + .f_add = rte_table_acl_entry_add, + .f_delete = rte_table_acl_entry_delete, + .f_add_bulk = rte_table_acl_entry_add_bulk, + .f_delete_bulk = rte_table_acl_entry_delete_bulk, + .f_lookup = rte_table_acl_lookup, + .f_stats = rte_table_acl_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_acl.h b/src/spdk/dpdk/lib/librte_table/rte_table_acl.h new file mode 100644 index 00000000..51681921 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_acl.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_ACL_H__ +#define __INCLUDE_RTE_TABLE_ACL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Table ACL + * + * This table uses the Access Control List (ACL) algorithm to uniquely + * associate data to lookup keys. + * + * Use-cases: Firewall rule database, etc. 
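As a usage illustration (not part of the patch), the sketch below builds a two-field ACL table and installs one rule through the parameter structures declared below in this header; the field layout, offsets and byte-order handling are simplified assumptions rather than a recommended rule format.

#include <stdint.h>
#include <rte_acl.h>
#include <rte_table_acl.h>

/* Hypothetical action stored per rule; its size is the entry_size below. */
struct fw_action {
	uint32_t allow;
};

/* Create a table matching on the IPv4 protocol byte and destination
 * address, assuming the ACL lookup key starts at the IPv4 header at byte
 * offset 0 of the packet data. */
static void *
fw_table_create(int socket_id)
{
	struct rte_table_acl_params p = {
		.name = "fw_acl",
		.n_rules = 1024,
		.n_rule_fields = 2,
		.field_format = {
			[0] = {	/* IPv4 protocol (1-byte first field) */
				.type = RTE_ACL_FIELD_TYPE_BITMASK,
				.size = sizeof(uint8_t),
				.field_index = 0,
				.input_index = 0,
				.offset = 9,
			},
			[1] = {	/* IPv4 destination address */
				.type = RTE_ACL_FIELD_TYPE_MASK,
				.size = sizeof(uint32_t),
				.field_index = 1,
				.input_index = 1,
				.offset = 16,
			},
		},
	};

	return rte_table_acl_ops.f_create(&p, socket_id,
		sizeof(struct fw_action));
}

/* Install one rule: allow UDP traffic to 10.0.0.0/24 (address byte-order
 * handling is glossed over here). */
static int
fw_rule_add(void *table)
{
	struct rte_table_acl_rule_add_params rule = {
		.priority = 0,	/* 0 is the highest priority */
		.field_value = {
			[0] = { .value.u8 = 17, .mask_range.u8 = 0xff },
			[1] = { .value.u32 = 0x0a000000, .mask_range.u32 = 24 },
		},
	};
	struct fw_action act = { .allow = 1 };
	void *entry_ptr;
	int key_found;

	return rte_table_acl_ops.f_add(table, &rule, &act,
		&key_found, &entry_ptr);
}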
+ * + ***/ + +#include + +#include "rte_acl.h" + +#include "rte_table.h" + +/** ACL table parameters */ +struct rte_table_acl_params { + /** Name */ + const char *name; + + /** Maximum number of ACL rules in the table */ + uint32_t n_rules; + + /** Number of fields in the ACL rule specification */ + uint32_t n_rule_fields; + + /** Format specification of the fields of the ACL rule */ + struct rte_acl_field_def field_format[RTE_ACL_MAX_FIELDS]; +}; + +/** ACL rule specification for entry add operation */ +struct rte_table_acl_rule_add_params { + /** ACL rule priority, with 0 as the highest priority */ + int32_t priority; + + /** Values for the fields of the ACL rule to be added to the table */ + struct rte_acl_field field_value[RTE_ACL_MAX_FIELDS]; +}; + +/** ACL rule specification for entry delete operation */ +struct rte_table_acl_rule_delete_params { + /** Values for the fields of the ACL rule to be deleted from table */ + struct rte_acl_field field_value[RTE_ACL_MAX_FIELDS]; +}; + +/** ACL table operations */ +extern struct rte_table_ops rte_table_acl_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_array.c b/src/spdk/dpdk/lib/librte_table/rte_table_array.c new file mode 100644 index 00000000..8264c50c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_array.c @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_table_array.h" + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_ARRAY_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_ARRAY_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_ARRAY_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_ARRAY_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct rte_table_array { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t entry_size; + uint32_t n_entries; + uint32_t offset; + + /* Internal fields */ + uint32_t entry_pos_mask; + + /* Internal table */ + uint8_t array[0] __rte_cache_aligned; +} __rte_cache_aligned; + +static void * +rte_table_array_create(void *params, int socket_id, uint32_t entry_size) +{ + struct rte_table_array_params *p = params; + struct rte_table_array *t; + uint32_t total_cl_size, total_size; + + /* Check input parameters */ + if ((p == NULL) || + (p->n_entries == 0) || + (!rte_is_power_of_2(p->n_entries))) + return NULL; + + /* Memory allocation */ + total_cl_size = (sizeof(struct rte_table_array) + + RTE_CACHE_LINE_SIZE) / RTE_CACHE_LINE_SIZE; + total_cl_size += (p->n_entries * entry_size + + RTE_CACHE_LINE_SIZE) / RTE_CACHE_LINE_SIZE; + total_size = total_cl_size * RTE_CACHE_LINE_SIZE; + t = rte_zmalloc_socket("TABLE", total_size, RTE_CACHE_LINE_SIZE, socket_id); + if (t == NULL) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %u bytes for array table\n", + __func__, total_size); + return NULL; + } + + /* Memory initialization */ + t->entry_size = entry_size; + t->n_entries = p->n_entries; + t->offset = p->offset; + t->entry_pos_mask = t->n_entries - 1; + + return t; +} + +static int +rte_table_array_free(void *table) +{ + struct rte_table_array *t = table; + + /* Check input parameters */ + if (t == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + /* Free previously allocated resources */ + rte_free(t); + + return 0; +} + +static int 
+rte_table_array_entry_add( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_array *t = table; + struct rte_table_array_key *k = key; + uint8_t *table_entry; + + /* Check input parameters */ + if (table == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + if (key == NULL) { + RTE_LOG(ERR, TABLE, "%s: key parameter is NULL\n", __func__); + return -EINVAL; + } + if (entry == NULL) { + RTE_LOG(ERR, TABLE, "%s: entry parameter is NULL\n", __func__); + return -EINVAL; + } + if (key_found == NULL) { + RTE_LOG(ERR, TABLE, "%s: key_found parameter is NULL\n", + __func__); + return -EINVAL; + } + if (entry_ptr == NULL) { + RTE_LOG(ERR, TABLE, "%s: entry_ptr parameter is NULL\n", + __func__); + return -EINVAL; + } + + table_entry = &t->array[k->pos * t->entry_size]; + memcpy(table_entry, entry, t->entry_size); + *key_found = 1; + *entry_ptr = (void *) table_entry; + + return 0; +} + +static int +rte_table_array_lookup( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_array *t = (struct rte_table_array *) table; + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_ARRAY_STATS_PKTS_IN_ADD(t, n_pkts_in); + *lookup_hit_mask = pkts_mask; + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + uint64_t n_pkts = __builtin_popcountll(pkts_mask); + uint32_t i; + + for (i = 0; i < n_pkts; i++) { + struct rte_mbuf *pkt = pkts[i]; + uint32_t entry_pos = RTE_MBUF_METADATA_UINT32(pkt, + t->offset) & t->entry_pos_mask; + + entries[i] = (void *) &t->array[entry_pos * + t->entry_size]; + } + } else { + for ( ; pkts_mask; ) { + uint32_t pkt_index = __builtin_ctzll(pkts_mask); + uint64_t pkt_mask = 1LLU << pkt_index; + struct rte_mbuf *pkt = pkts[pkt_index]; + uint32_t entry_pos = RTE_MBUF_METADATA_UINT32(pkt, + t->offset) & t->entry_pos_mask; + + entries[pkt_index] = (void *) &t->array[entry_pos * + t->entry_size]; + pkts_mask &= ~pkt_mask; + } + } + + return 0; +} + +static int +rte_table_array_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_array *array = table; + + if (stats != NULL) + memcpy(stats, &array->stats, sizeof(array->stats)); + + if (clear) + memset(&array->stats, 0, sizeof(array->stats)); + + return 0; +} + +struct rte_table_ops rte_table_array_ops = { + .f_create = rte_table_array_create, + .f_free = rte_table_array_free, + .f_add = rte_table_array_entry_add, + .f_delete = NULL, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_array_lookup, + .f_stats = rte_table_array_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_array.h b/src/spdk/dpdk/lib/librte_table/rte_table_array.h new file mode 100644 index 00000000..b16c5dfe --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_array.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_ARRAY_H__ +#define __INCLUDE_RTE_TABLE_ARRAY_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Table Array + * + * Simple array indexing. Lookup key is the array entry index. + * + ***/ + +#include + +#include "rte_table.h" + +/** Array table parameters */ +struct rte_table_array_params { + /** Number of array entries. Has to be a power of two. */ + uint32_t n_entries; + + /** Byte offset within input packet meta-data where lookup key (i.e. 
the + array entry index) is located. */ + uint32_t offset; +}; + +/** Array table key format */ +struct rte_table_array_key { + /** Array entry index */ + uint32_t pos; +}; + +/** Array table operations */ +extern struct rte_table_ops rte_table_array_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_hash.h b/src/spdk/dpdk/lib/librte_table/rte_table_hash.h new file mode 100644 index 00000000..6f55bd57 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_hash.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_HASH_H__ +#define __INCLUDE_RTE_TABLE_HASH_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Table Hash + * + * These tables use the exact match criterion to uniquely associate data to + * lookup keys. + * + * Hash table types: + * 1. Entry add strategy on bucket full: + * a. Least Recently Used (LRU): One of the existing keys in the bucket is + * deleted and the new key is added in its place. The number of keys in + * each bucket never grows bigger than 4. The logic to pick the key to + * be dropped from the bucket is LRU. The hash table lookup operation + * maintains the order in which the keys in the same bucket are hit, so + * every time a key is hit, it becomes the new Most Recently Used (MRU) + * key, i.e. the most unlikely candidate for drop. When a key is added + * to the bucket, it also becomes the new MRU key. When a key needs to + * be picked and dropped, the most likely candidate for drop, i.e. the + * current LRU key, is always picked. The LRU logic requires maintaining + * specific data structures per each bucket. Use-cases: flow cache, etc. + * b. Extendible bucket (ext): The bucket is extended with space for 4 more + * keys. This is done by allocating additional memory at table init time, + * which is used to create a pool of free keys (the size of this pool is + * configurable and always a multiple of 4). On key add operation, the + * allocation of a group of 4 keys only happens successfully within the + * limit of free keys, otherwise the key add operation fails. On key + * delete operation, a group of 4 keys is freed back to the pool of free + * keys when the key to be deleted is the only key that was used within + * its group of 4 keys at that time. On key lookup operation, if the + * current bucket is in extended state and a match is not found in the + * first group of 4 keys, the search continues beyond the first group of + * 4 keys, potentially until all keys in this bucket are examined. The + * extendible bucket logic requires maintaining specific data structures + * per table and per each bucket. Use-cases: flow table, etc. + * 2. Key size: + * a. Configurable key size + * b. 
Single key size (8-byte, 16-byte or 32-byte key size) + * + ***/ +#include + +#include "rte_table.h" + +/** Hash function */ +typedef uint64_t (*rte_table_hash_op_hash)( + void *key, + void *key_mask, + uint32_t key_size, + uint64_t seed); + +/** Hash table parameters */ +struct rte_table_hash_params { + /** Name */ + const char *name; + + /** Key size (number of bytes) */ + uint32_t key_size; + + /** Byte offset within packet meta-data where the key is located */ + uint32_t key_offset; + + /** Key mask */ + uint8_t *key_mask; + + /** Number of keys */ + uint32_t n_keys; + + /** Number of buckets */ + uint32_t n_buckets; + + /** Hash function */ + rte_table_hash_op_hash f_hash; + + /** Seed value for the hash function */ + uint64_t seed; +}; + +/** Extendible bucket hash table operations */ +extern struct rte_table_ops rte_table_hash_ext_ops; +extern struct rte_table_ops rte_table_hash_key8_ext_ops; +extern struct rte_table_ops rte_table_hash_key16_ext_ops; +extern struct rte_table_ops rte_table_hash_key32_ext_ops; + +/** LRU hash table operations */ +extern struct rte_table_ops rte_table_hash_lru_ops; + +extern struct rte_table_ops rte_table_hash_key8_lru_ops; +extern struct rte_table_ops rte_table_hash_key16_lru_ops; +extern struct rte_table_ops rte_table_hash_key32_lru_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_hash_cuckoo.c b/src/spdk/dpdk/lib/librte_table/rte_table_hash_cuckoo.c new file mode 100644 index 00000000..f0243033 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_hash_cuckoo.c @@ -0,0 +1,323 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_table_hash_cuckoo.h" + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_HASH_CUCKOO_STATS_PKTS_IN_ADD(table, val) \ + (table->stats.n_pkts_in += val) +#define RTE_TABLE_HASH_CUCKOO_STATS_PKTS_LOOKUP_MISS(table, val) \ + (table->stats.n_pkts_lookup_miss += val) + +#else + +#define RTE_TABLE_HASH_CUCKOO_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_HASH_CUCKOO_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + + +struct rte_table_hash { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t key_size; + uint32_t entry_size; + uint32_t n_keys; + rte_hash_function f_hash; + uint32_t seed; + uint32_t key_offset; + + /* cuckoo hash table object */ + struct rte_hash *h_table; + + /* Lookup table */ + uint8_t memory[0] __rte_cache_aligned; +}; + +static int +check_params_create_hash_cuckoo(struct rte_table_hash_cuckoo_params *params) +{ + if (params == NULL) { + RTE_LOG(ERR, TABLE, "NULL Input Parameters.\n"); + return -EINVAL; + } + + if (params->name == NULL) { + RTE_LOG(ERR, TABLE, "Table name is NULL.\n"); + return -EINVAL; + } + + if (params->key_size == 0) { + RTE_LOG(ERR, TABLE, "Invalid key_size.\n"); + return -EINVAL; + } + + if (params->n_keys == 0) { + RTE_LOG(ERR, TABLE, "Invalid n_keys.\n"); + return -EINVAL; + } + + if (params->f_hash == NULL) { + RTE_LOG(ERR, TABLE, "f_hash is NULL.\n"); + return -EINVAL; + } + + return 0; +} + +static void * +rte_table_hash_cuckoo_create(void *params, + int socket_id, + uint32_t entry_size) +{ + struct rte_table_hash_cuckoo_params *p = params; + struct rte_hash *h_table; + struct rte_table_hash *t; + uint32_t total_size; + + /* Check input parameters */ + if (check_params_create_hash_cuckoo(params)) + return NULL; + + /* Memory allocation */ + total_size = sizeof(struct 
rte_table_hash) + + RTE_CACHE_LINE_ROUNDUP(p->n_keys * entry_size); + + t = rte_zmalloc_socket(p->name, total_size, RTE_CACHE_LINE_SIZE, socket_id); + if (t == NULL) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %u bytes for cuckoo hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + /* Create cuckoo hash table */ + struct rte_hash_parameters hash_cuckoo_params = { + .entries = p->n_keys, + .key_len = p->key_size, + .hash_func = p->f_hash, + .hash_func_init_val = p->seed, + .socket_id = socket_id, + .name = p->name + }; + + h_table = rte_hash_find_existing(p->name); + if (h_table == NULL) { + h_table = rte_hash_create(&hash_cuckoo_params); + if (h_table == NULL) { + RTE_LOG(ERR, TABLE, + "%s: failed to create cuckoo hash table %s\n", + __func__, p->name); + rte_free(t); + return NULL; + } + } + + /* initialize the cuckoo hash parameters */ + t->key_size = p->key_size; + t->entry_size = entry_size; + t->n_keys = p->n_keys; + t->f_hash = p->f_hash; + t->seed = p->seed; + t->key_offset = p->key_offset; + t->h_table = h_table; + + RTE_LOG(INFO, TABLE, + "%s: Cuckoo hash table %s memory footprint is %u bytes\n", + __func__, p->name, total_size); + return t; +} + +static int +rte_table_hash_cuckoo_free(void *table) { + struct rte_table_hash *t = table; + + if (table == NULL) + return -EINVAL; + + rte_hash_free(t->h_table); + rte_free(t); + + return 0; +} + +static int +rte_table_hash_cuckoo_entry_add(void *table, void *key, void *entry, + int *key_found, void **entry_ptr) +{ + struct rte_table_hash *t = table; + int pos = 0; + + /* Check input parameters */ + if ((table == NULL) || + (key == NULL) || + (entry == NULL) || + (key_found == NULL) || + (entry_ptr == NULL)) + return -EINVAL; + + /* Find Existing entries */ + pos = rte_hash_lookup(t->h_table, key); + if (pos >= 0) { + uint8_t *existing_entry; + + *key_found = 1; + existing_entry = &t->memory[pos * t->entry_size]; + memcpy(existing_entry, entry, t->entry_size); + *entry_ptr = existing_entry; + + return 0; + } + + if (pos == -ENOENT) { + /* Entry not found. 
Adding new entry */ + uint8_t *new_entry; + + pos = rte_hash_add_key(t->h_table, key); + if (pos < 0) + return pos; + + new_entry = &t->memory[pos * t->entry_size]; + memcpy(new_entry, entry, t->entry_size); + + *key_found = 0; + *entry_ptr = new_entry; + return 0; + } + + return pos; +} + +static int +rte_table_hash_cuckoo_entry_delete(void *table, void *key, + int *key_found, void *entry) +{ + struct rte_table_hash *t = table; + int pos = 0; + + /* Check input parameters */ + if ((table == NULL) || + (key == NULL) || + (key_found == NULL)) + return -EINVAL; + + pos = rte_hash_del_key(t->h_table, key); + if (pos >= 0) { + *key_found = 1; + uint8_t *entry_ptr = &t->memory[pos * t->entry_size]; + + if (entry) + memcpy(entry, entry_ptr, t->entry_size); + + memset(&t->memory[pos * t->entry_size], 0, t->entry_size); + return 0; + } + + *key_found = 0; + return pos; +} + +static int +rte_table_hash_cuckoo_lookup(void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *t = table; + uint64_t pkts_mask_out = 0; + uint32_t i; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + + RTE_TABLE_HASH_CUCKOO_STATS_PKTS_IN_ADD(t, n_pkts_in); + + if ((pkts_mask & (pkts_mask + 1)) == 0) { + const uint8_t *keys[RTE_PORT_IN_BURST_SIZE_MAX]; + int32_t positions[RTE_PORT_IN_BURST_SIZE_MAX], status; + + /* Keys for bulk lookup */ + for (i = 0; i < n_pkts_in; i++) + keys[i] = RTE_MBUF_METADATA_UINT8_PTR(pkts[i], + t->key_offset); + + /* Bulk Lookup */ + status = rte_hash_lookup_bulk(t->h_table, + (const void **) keys, + n_pkts_in, + positions); + if (status == 0) { + for (i = 0; i < n_pkts_in; i++) { + if (likely(positions[i] >= 0)) { + uint64_t pkt_mask = 1LLU << i; + + entries[i] = &t->memory[positions[i] + * t->entry_size]; + pkts_mask_out |= pkt_mask; + } + } + } + } else + for (i = 0; i < (uint32_t)(RTE_PORT_IN_BURST_SIZE_MAX + - __builtin_clzll(pkts_mask)); i++) { + uint64_t pkt_mask = 1LLU << i; + + if (pkt_mask & pkts_mask) { + struct rte_mbuf *pkt = pkts[i]; + uint8_t *key = RTE_MBUF_METADATA_UINT8_PTR(pkt, + t->key_offset); + int pos; + + pos = rte_hash_lookup(t->h_table, key); + if (likely(pos >= 0)) { + entries[i] = &t->memory[pos + * t->entry_size]; + pkts_mask_out |= pkt_mask; + } + } + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_CUCKOO_STATS_PKTS_LOOKUP_MISS(t, + n_pkts_in - __builtin_popcountll(pkts_mask_out)); + + return 0; + +} + +static int +rte_table_hash_cuckoo_stats_read(void *table, struct rte_table_stats *stats, + int clear) +{ + struct rte_table_hash *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_hash_cuckoo_ops = { + .f_create = rte_table_hash_cuckoo_create, + .f_free = rte_table_hash_cuckoo_free, + .f_add = rte_table_hash_cuckoo_entry_add, + .f_delete = rte_table_hash_cuckoo_entry_delete, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_cuckoo_lookup, + .f_stats = rte_table_hash_cuckoo_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_hash_cuckoo.h b/src/spdk/dpdk/lib/librte_table/rte_table_hash_cuckoo.h new file mode 100644 index 00000000..d9d43121 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_hash_cuckoo.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_HASH_CUCKOO_H__ +#define 
__INCLUDE_RTE_TABLE_HASH_CUCKOO_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Table Hash Cuckoo + * + ***/ +#include + +#include + +#include "rte_table.h" + +/** Hash table parameters */ +struct rte_table_hash_cuckoo_params { + /** Name */ + const char *name; + + /** Key size (number of bytes) */ + uint32_t key_size; + + /** Byte offset within packet meta-data where the key is located */ + uint32_t key_offset; + + /** Key mask */ + uint8_t *key_mask; + + /** Number of keys */ + uint32_t n_keys; + + /** Number of buckets */ + uint32_t n_buckets; + + /** Hash function */ + rte_hash_function f_hash; + + /** Seed value for the hash function */ + uint32_t seed; +}; + +/** Cuckoo hash table operations */ +extern struct rte_table_ops rte_table_hash_cuckoo_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_hash_ext.c b/src/spdk/dpdk/lib/librte_table/rte_table_hash_ext.c new file mode 100644 index 00000000..802a24fe --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_hash_ext.c @@ -0,0 +1,1011 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_table_hash.h" + +#define KEYS_PER_BUCKET 4 + +struct bucket { + union { + uintptr_t next; + uint64_t lru_list; + }; + uint16_t sig[KEYS_PER_BUCKET]; + uint32_t key_pos[KEYS_PER_BUCKET]; +}; + +#define BUCKET_NEXT(bucket) \ + ((void *) ((bucket)->next & (~1LU))) + +#define BUCKET_NEXT_VALID(bucket) \ + ((bucket)->next & 1LU) + +#define BUCKET_NEXT_SET(bucket, bucket_next) \ +do \ + (bucket)->next = (((uintptr_t) ((void *) (bucket_next))) | 1LU);\ +while (0) + +#define BUCKET_NEXT_SET_NULL(bucket) \ +do \ + (bucket)->next = 0; \ +while (0) + +#define BUCKET_NEXT_COPY(bucket, bucket2) \ +do \ + (bucket)->next = (bucket2)->next; \ +while (0) + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_HASH_EXT_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_HASH_EXT_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_HASH_EXT_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_HASH_EXT_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct grinder { + struct bucket *bkt; + uint64_t sig; + uint64_t match; + uint32_t key_index; +}; + +struct rte_table_hash { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t key_size; + uint32_t entry_size; + uint32_t n_keys; + uint32_t n_buckets; + uint32_t n_buckets_ext; + rte_table_hash_op_hash f_hash; + uint64_t seed; + uint32_t key_offset; + + /* Internal */ + uint64_t bucket_mask; + uint32_t key_size_shl; + uint32_t data_size_shl; + uint32_t key_stack_tos; + uint32_t bkt_ext_stack_tos; + + /* Grinder */ + struct grinder grinders[RTE_PORT_IN_BURST_SIZE_MAX]; + + /* Tables */ + uint64_t *key_mask; + struct bucket *buckets; + struct bucket *buckets_ext; + uint8_t *key_mem; + uint8_t *data_mem; + uint32_t *key_stack; + uint32_t *bkt_ext_stack; + + /* Table memory */ + uint8_t memory[0] __rte_cache_aligned; +}; + +static int +keycmp(void *a, void *b, void *b_mask, uint32_t n_bytes) +{ + uint64_t *a64 = a, *b64 = b, *b_mask64 = b_mask; + uint32_t i; + + for (i = 0; i < n_bytes / sizeof(uint64_t); i++) + if (a64[i] != (b64[i] & b_mask64[i])) + return 1; + + return 0; +} + +static void +keycpy(void *dst, void *src, void *src_mask, uint32_t n_bytes) +{ + uint64_t *dst64 = dst, *src64 = src, *src_mask64 = src_mask; + 
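+	/* Masked key copy: dst = src & src_mask, processed one 64-bit
+	 * word at a time by the loop below. */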
uint32_t i; + + for (i = 0; i < n_bytes / sizeof(uint64_t); i++) + dst64[i] = src64[i] & src_mask64[i]; +} + +static int +check_params_create(struct rte_table_hash_params *params) +{ + /* name */ + if (params->name == NULL) { + RTE_LOG(ERR, TABLE, "%s: name invalid value\n", __func__); + return -EINVAL; + } + + /* key_size */ + if ((params->key_size < sizeof(uint64_t)) || + (!rte_is_power_of_2(params->key_size))) { + RTE_LOG(ERR, TABLE, "%s: key_size invalid value\n", __func__); + return -EINVAL; + } + + /* n_keys */ + if (params->n_keys == 0) { + RTE_LOG(ERR, TABLE, "%s: n_keys invalid value\n", __func__); + return -EINVAL; + } + + /* n_buckets */ + if ((params->n_buckets == 0) || + (!rte_is_power_of_2(params->n_buckets))) { + RTE_LOG(ERR, TABLE, "%s: n_buckets invalid value\n", __func__); + return -EINVAL; + } + + /* f_hash */ + if (params->f_hash == NULL) { + RTE_LOG(ERR, TABLE, "%s: f_hash invalid value\n", __func__); + return -EINVAL; + } + + return 0; +} + +static void * +rte_table_hash_ext_create(void *params, int socket_id, uint32_t entry_size) +{ + struct rte_table_hash_params *p = params; + struct rte_table_hash *t; + uint64_t table_meta_sz, key_mask_sz, bucket_sz, bucket_ext_sz, key_sz; + uint64_t key_stack_sz, bkt_ext_stack_sz, data_sz, total_size; + uint64_t key_mask_offset, bucket_offset, bucket_ext_offset, key_offset; + uint64_t key_stack_offset, bkt_ext_stack_offset, data_offset; + uint32_t n_buckets_ext, i; + + /* Check input parameters */ + if ((check_params_create(p) != 0) || + (!rte_is_power_of_2(entry_size)) || + ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) || + (sizeof(struct bucket) != (RTE_CACHE_LINE_SIZE / 2))) + return NULL; + + /* + * Table dimensioning + * + * Objective: Pick the number of bucket extensions (n_buckets_ext) so that + * it is guaranteed that n_keys keys can be stored in the table at any time. + * + * The worst case scenario takes place when all the n_keys keys fall into + * the same bucket. Actually, due to the KEYS_PER_BUCKET scheme, the worst + * case takes place when (n_keys - KEYS_PER_BUCKET + 1) keys fall into the + * same bucket, while the remaining (KEYS_PER_BUCKET - 1) keys each fall + * into a different bucket. This case defeats the purpose of the hash table. + * It indicates unsuitable f_hash or n_keys to n_buckets ratio. 
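+	 *
+	 * Illustrative check (example figures, not part of the original
+	 * comment): with n_keys = 1024 and KEYS_PER_BUCKET = 4, the formula
+	 * below gives 1024 / 4 + 4 - 1 = 259 extension buckets.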
+ * + * n_buckets_ext = n_keys / KEYS_PER_BUCKET + KEYS_PER_BUCKET - 1 + */ + n_buckets_ext = p->n_keys / KEYS_PER_BUCKET + KEYS_PER_BUCKET - 1; + + /* Memory allocation */ + table_meta_sz = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_table_hash)); + key_mask_sz = RTE_CACHE_LINE_ROUNDUP(p->key_size); + bucket_sz = RTE_CACHE_LINE_ROUNDUP(p->n_buckets * sizeof(struct bucket)); + bucket_ext_sz = + RTE_CACHE_LINE_ROUNDUP(n_buckets_ext * sizeof(struct bucket)); + key_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * p->key_size); + key_stack_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * sizeof(uint32_t)); + bkt_ext_stack_sz = + RTE_CACHE_LINE_ROUNDUP(n_buckets_ext * sizeof(uint32_t)); + data_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * entry_size); + total_size = table_meta_sz + key_mask_sz + bucket_sz + bucket_ext_sz + + key_sz + key_stack_sz + bkt_ext_stack_sz + data_sz; + + if (total_size > SIZE_MAX) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes" + " for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + t = rte_zmalloc_socket(p->name, + (size_t)total_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (t == NULL) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes" + " for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + RTE_LOG(INFO, TABLE, "%s (%u-byte key): Hash table %s memory " + "footprint is %" PRIu64 " bytes\n", + __func__, p->key_size, p->name, total_size); + + /* Memory initialization */ + t->key_size = p->key_size; + t->entry_size = entry_size; + t->n_keys = p->n_keys; + t->n_buckets = p->n_buckets; + t->n_buckets_ext = n_buckets_ext; + t->f_hash = p->f_hash; + t->seed = p->seed; + t->key_offset = p->key_offset; + + /* Internal */ + t->bucket_mask = t->n_buckets - 1; + t->key_size_shl = __builtin_ctzl(p->key_size); + t->data_size_shl = __builtin_ctzl(entry_size); + + /* Tables */ + key_mask_offset = 0; + bucket_offset = key_mask_offset + key_mask_sz; + bucket_ext_offset = bucket_offset + bucket_sz; + key_offset = bucket_ext_offset + bucket_ext_sz; + key_stack_offset = key_offset + key_sz; + bkt_ext_stack_offset = key_stack_offset + key_stack_sz; + data_offset = bkt_ext_stack_offset + bkt_ext_stack_sz; + + t->key_mask = (uint64_t *) &t->memory[key_mask_offset]; + t->buckets = (struct bucket *) &t->memory[bucket_offset]; + t->buckets_ext = (struct bucket *) &t->memory[bucket_ext_offset]; + t->key_mem = &t->memory[key_offset]; + t->key_stack = (uint32_t *) &t->memory[key_stack_offset]; + t->bkt_ext_stack = (uint32_t *) &t->memory[bkt_ext_stack_offset]; + t->data_mem = &t->memory[data_offset]; + + /* Key mask */ + if (p->key_mask == NULL) + memset(t->key_mask, 0xFF, p->key_size); + else + memcpy(t->key_mask, p->key_mask, p->key_size); + + /* Key stack */ + for (i = 0; i < t->n_keys; i++) + t->key_stack[i] = t->n_keys - 1 - i; + t->key_stack_tos = t->n_keys; + + /* Bucket ext stack */ + for (i = 0; i < t->n_buckets_ext; i++) + t->bkt_ext_stack[i] = t->n_buckets_ext - 1 - i; + t->bkt_ext_stack_tos = t->n_buckets_ext; + + return t; +} + +static int +rte_table_hash_ext_free(void *table) +{ + struct rte_table_hash *t = table; + + /* Check input parameters */ + if (t == NULL) + return -EINVAL; + + rte_free(t); + return 0; +} + +static int +rte_table_hash_ext_entry_add(void *table, void *key, void *entry, + int *key_found, void **entry_ptr) +{ + struct rte_table_hash *t = table; + struct bucket *bkt0, *bkt, *bkt_prev; + uint64_t sig; + uint32_t bkt_index, i; + + sig = t->f_hash(key, t->key_mask, t->key_size, t->seed); + bkt_index = sig & 
t->bucket_mask; + bkt0 = &t->buckets[bkt_index]; + sig = (sig >> 16) | 1LLU; + + /* Key is present in the bucket */ + for (bkt = bkt0; bkt != NULL; bkt = BUCKET_NEXT(bkt)) + for (i = 0; i < KEYS_PER_BUCKET; i++) { + uint64_t bkt_sig = (uint64_t) bkt->sig[i]; + uint32_t bkt_key_index = bkt->key_pos[i]; + uint8_t *bkt_key = + &t->key_mem[bkt_key_index << t->key_size_shl]; + + if ((sig == bkt_sig) && (keycmp(bkt_key, key, t->key_mask, + t->key_size) == 0)) { + uint8_t *data = &t->data_mem[bkt_key_index << + t->data_size_shl]; + + memcpy(data, entry, t->entry_size); + *key_found = 1; + *entry_ptr = (void *) data; + return 0; + } + } + + /* Key is not present in the bucket */ + for (bkt_prev = NULL, bkt = bkt0; bkt != NULL; bkt_prev = bkt, + bkt = BUCKET_NEXT(bkt)) + for (i = 0; i < KEYS_PER_BUCKET; i++) { + uint64_t bkt_sig = (uint64_t) bkt->sig[i]; + + if (bkt_sig == 0) { + uint32_t bkt_key_index; + uint8_t *bkt_key, *data; + + /* Allocate new key */ + if (t->key_stack_tos == 0) /* No free keys */ + return -ENOSPC; + + bkt_key_index = t->key_stack[ + --t->key_stack_tos]; + + /* Install new key */ + bkt_key = &t->key_mem[bkt_key_index << + t->key_size_shl]; + data = &t->data_mem[bkt_key_index << + t->data_size_shl]; + + bkt->sig[i] = (uint16_t) sig; + bkt->key_pos[i] = bkt_key_index; + keycpy(bkt_key, key, t->key_mask, t->key_size); + memcpy(data, entry, t->entry_size); + + *key_found = 0; + *entry_ptr = (void *) data; + return 0; + } + } + + /* Bucket full: extend bucket */ + if ((t->bkt_ext_stack_tos > 0) && (t->key_stack_tos > 0)) { + uint32_t bkt_key_index; + uint8_t *bkt_key, *data; + + /* Allocate new bucket ext */ + bkt_index = t->bkt_ext_stack[--t->bkt_ext_stack_tos]; + bkt = &t->buckets_ext[bkt_index]; + + /* Chain the new bucket ext */ + BUCKET_NEXT_SET(bkt_prev, bkt); + BUCKET_NEXT_SET_NULL(bkt); + + /* Allocate new key */ + bkt_key_index = t->key_stack[--t->key_stack_tos]; + bkt_key = &t->key_mem[bkt_key_index << t->key_size_shl]; + + data = &t->data_mem[bkt_key_index << t->data_size_shl]; + + /* Install new key into bucket */ + bkt->sig[0] = (uint16_t) sig; + bkt->key_pos[0] = bkt_key_index; + keycpy(bkt_key, key, t->key_mask, t->key_size); + memcpy(data, entry, t->entry_size); + + *key_found = 0; + *entry_ptr = (void *) data; + return 0; + } + + return -ENOSPC; +} + +static int +rte_table_hash_ext_entry_delete(void *table, void *key, int *key_found, +void *entry) +{ + struct rte_table_hash *t = table; + struct bucket *bkt0, *bkt, *bkt_prev; + uint64_t sig; + uint32_t bkt_index, i; + + sig = t->f_hash(key, t->key_mask, t->key_size, t->seed); + bkt_index = sig & t->bucket_mask; + bkt0 = &t->buckets[bkt_index]; + sig = (sig >> 16) | 1LLU; + + /* Key is present in the bucket */ + for (bkt_prev = NULL, bkt = bkt0; bkt != NULL; bkt_prev = bkt, + bkt = BUCKET_NEXT(bkt)) + for (i = 0; i < KEYS_PER_BUCKET; i++) { + uint64_t bkt_sig = (uint64_t) bkt->sig[i]; + uint32_t bkt_key_index = bkt->key_pos[i]; + uint8_t *bkt_key = &t->key_mem[bkt_key_index << + t->key_size_shl]; + + if ((sig == bkt_sig) && (keycmp(bkt_key, key, t->key_mask, + t->key_size) == 0)) { + uint8_t *data = &t->data_mem[bkt_key_index << + t->data_size_shl]; + + /* Uninstall key from bucket */ + bkt->sig[i] = 0; + *key_found = 1; + if (entry) + memcpy(entry, data, t->entry_size); + + /* Free key */ + t->key_stack[t->key_stack_tos++] = + bkt_key_index; + + /*Check if bucket is unused */ + if ((bkt_prev != NULL) && + (bkt->sig[0] == 0) && (bkt->sig[1] == 0) && + (bkt->sig[2] == 0) && (bkt->sig[3] == 0)) { + /* Unchain bucket 
*/ + BUCKET_NEXT_COPY(bkt_prev, bkt); + + /* Clear bucket */ + memset(bkt, 0, sizeof(struct bucket)); + + /* Free bucket back to buckets ext */ + bkt_index = bkt - t->buckets_ext; + t->bkt_ext_stack[t->bkt_ext_stack_tos++] + = bkt_index; + } + + return 0; + } + } + + /* Key is not present in the bucket */ + *key_found = 0; + return 0; +} + +static int rte_table_hash_ext_lookup_unoptimized( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *t = (struct rte_table_hash *) table; + uint64_t pkts_mask_out = 0; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + + for ( ; pkts_mask; ) { + struct bucket *bkt0, *bkt; + struct rte_mbuf *pkt; + uint8_t *key; + uint64_t pkt_mask, sig; + uint32_t pkt_index, bkt_index, i; + + pkt_index = __builtin_ctzll(pkts_mask); + pkt_mask = 1LLU << pkt_index; + pkts_mask &= ~pkt_mask; + + pkt = pkts[pkt_index]; + key = RTE_MBUF_METADATA_UINT8_PTR(pkt, t->key_offset); + sig = (uint64_t) t->f_hash(key, t->key_mask, t->key_size, t->seed); + + bkt_index = sig & t->bucket_mask; + bkt0 = &t->buckets[bkt_index]; + sig = (sig >> 16) | 1LLU; + + /* Key is present in the bucket */ + for (bkt = bkt0; bkt != NULL; bkt = BUCKET_NEXT(bkt)) + for (i = 0; i < KEYS_PER_BUCKET; i++) { + uint64_t bkt_sig = (uint64_t) bkt->sig[i]; + uint32_t bkt_key_index = bkt->key_pos[i]; + uint8_t *bkt_key = &t->key_mem[bkt_key_index << + t->key_size_shl]; + + if ((sig == bkt_sig) && (keycmp(bkt_key, key, + t->key_mask, t->key_size) == 0)) { + uint8_t *data = &t->data_mem[ + bkt_key_index << t->data_size_shl]; + + pkts_mask_out |= pkt_mask; + entries[pkt_index] = (void *) data; + break; + } + } + } + + *lookup_hit_mask = pkts_mask_out; + return 0; +} + +/*** + * + * mask = match bitmask + * match = at least one match + * match_many = more than one match + * match_pos = position of first match + * + *---------------------------------------- + * mask match match_many match_pos + *---------------------------------------- + * 0000 0 0 00 + * 0001 1 0 00 + * 0010 1 0 01 + * 0011 1 1 00 + *---------------------------------------- + * 0100 1 0 10 + * 0101 1 1 00 + * 0110 1 1 01 + * 0111 1 1 00 + *---------------------------------------- + * 1000 1 0 11 + * 1001 1 1 00 + * 1010 1 1 01 + * 1011 1 1 00 + *---------------------------------------- + * 1100 1 1 10 + * 1101 1 1 00 + * 1110 1 1 01 + * 1111 1 1 00 + *---------------------------------------- + * + * match = 1111_1111_1111_1110 + * match_many = 1111_1110_1110_1000 + * match_pos = 0001_0010_0001_0011__0001_0010_0001_0000 + * + * match = 0xFFFELLU + * match_many = 0xFEE8LLU + * match_pos = 0x12131210LLU + * + ***/ + +#define LUT_MATCH 0xFFFELLU +#define LUT_MATCH_MANY 0xFEE8LLU +#define LUT_MATCH_POS 0x12131210LLU + +#define lookup_cmp_sig(mbuf_sig, bucket, match, match_many, match_pos) \ +{ \ + uint64_t bucket_sig[4], mask[4], mask_all; \ + \ + bucket_sig[0] = bucket->sig[0]; \ + bucket_sig[1] = bucket->sig[1]; \ + bucket_sig[2] = bucket->sig[2]; \ + bucket_sig[3] = bucket->sig[3]; \ + \ + bucket_sig[0] ^= mbuf_sig; \ + bucket_sig[1] ^= mbuf_sig; \ + bucket_sig[2] ^= mbuf_sig; \ + bucket_sig[3] ^= mbuf_sig; \ + \ + mask[0] = 0; \ + mask[1] = 0; \ + mask[2] = 0; \ + mask[3] = 0; \ + \ + if (bucket_sig[0] == 0) \ + mask[0] = 1; \ + if (bucket_sig[1] == 0) \ + mask[1] = 2; \ + if (bucket_sig[2] == 0) \ + mask[2] = 4; \ + if (bucket_sig[3] == 0) \ + mask[3] = 8; \ + \ + mask_all = (mask[0] | mask[1]) | (mask[2] | mask[3]); \ + \ + match = 
(LUT_MATCH >> mask_all) & 1; \ + match_many = (LUT_MATCH_MANY >> mask_all) & 1; \ + match_pos = (LUT_MATCH_POS >> (mask_all << 1)) & 3; \ +} + +#define lookup_cmp_key(mbuf, key, match_key, f) \ +{ \ + uint64_t *pkt_key = RTE_MBUF_METADATA_UINT64_PTR(mbuf, f->key_offset);\ + uint64_t *bkt_key = (uint64_t *) key; \ + uint64_t *key_mask = f->key_mask; \ + \ + switch (f->key_size) { \ + case 8: \ + { \ + uint64_t xor = (pkt_key[0] & key_mask[0]) ^ bkt_key[0]; \ + match_key = 0; \ + if (xor == 0) \ + match_key = 1; \ + } \ + break; \ + \ + case 16: \ + { \ + uint64_t xor[2], or; \ + \ + xor[0] = (pkt_key[0] & key_mask[0]) ^ bkt_key[0]; \ + xor[1] = (pkt_key[1] & key_mask[1]) ^ bkt_key[1]; \ + or = xor[0] | xor[1]; \ + match_key = 0; \ + if (or == 0) \ + match_key = 1; \ + } \ + break; \ + \ + case 32: \ + { \ + uint64_t xor[4], or; \ + \ + xor[0] = (pkt_key[0] & key_mask[0]) ^ bkt_key[0]; \ + xor[1] = (pkt_key[1] & key_mask[1]) ^ bkt_key[1]; \ + xor[2] = (pkt_key[2] & key_mask[2]) ^ bkt_key[2]; \ + xor[3] = (pkt_key[3] & key_mask[3]) ^ bkt_key[3]; \ + or = xor[0] | xor[1] | xor[2] | xor[3]; \ + match_key = 0; \ + if (or == 0) \ + match_key = 1; \ + } \ + break; \ + \ + case 64: \ + { \ + uint64_t xor[8], or; \ + \ + xor[0] = (pkt_key[0] & key_mask[0]) ^ bkt_key[0]; \ + xor[1] = (pkt_key[1] & key_mask[1]) ^ bkt_key[1]; \ + xor[2] = (pkt_key[2] & key_mask[2]) ^ bkt_key[2]; \ + xor[3] = (pkt_key[3] & key_mask[3]) ^ bkt_key[3]; \ + xor[4] = (pkt_key[4] & key_mask[4]) ^ bkt_key[4]; \ + xor[5] = (pkt_key[5] & key_mask[5]) ^ bkt_key[5]; \ + xor[6] = (pkt_key[6] & key_mask[6]) ^ bkt_key[6]; \ + xor[7] = (pkt_key[7] & key_mask[7]) ^ bkt_key[7]; \ + or = xor[0] | xor[1] | xor[2] | xor[3] | \ + xor[4] | xor[5] | xor[6] | xor[7]; \ + match_key = 0; \ + if (or == 0) \ + match_key = 1; \ + } \ + break; \ + \ + default: \ + match_key = 0; \ + if (keycmp(bkt_key, pkt_key, key_mask, f->key_size) == 0) \ + match_key = 1; \ + } \ +} + +#define lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + struct rte_mbuf *mbuf00, *mbuf01; \ + uint32_t key_offset = t->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + mbuf00 = pkts[pkt00_index]; \ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + mbuf01 = pkts[pkt01_index]; \ + \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset));\ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset));\ +} + +#define lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask, pkt00_index, \ + pkt01_index) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + struct rte_mbuf *mbuf00, *mbuf01; \ + uint32_t key_offset = t->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + mbuf00 = pkts[pkt00_index]; \ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + if (pkts_mask == 0) \ + pkt01_index = pkt00_index; \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + mbuf01 = pkts[pkt01_index]; \ + \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset));\ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset));\ +} + +#define lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index) \ +{ \ + struct grinder *g10, *g11; \ + uint64_t sig10, sig11, bkt10_index, bkt11_index; \ + struct rte_mbuf *mbuf10, *mbuf11; \ + struct bucket *bkt10, *bkt11, *buckets = t->buckets; \ + 
uint8_t *key10, *key11; \ + uint64_t bucket_mask = t->bucket_mask; \ + rte_table_hash_op_hash f_hash = t->f_hash; \ + uint64_t seed = t->seed; \ + uint32_t key_size = t->key_size; \ + uint32_t key_offset = t->key_offset; \ + \ + mbuf10 = pkts[pkt10_index]; \ + key10 = RTE_MBUF_METADATA_UINT8_PTR(mbuf10, key_offset); \ + sig10 = (uint64_t) f_hash(key10, t->key_mask, key_size, seed); \ + bkt10_index = sig10 & bucket_mask; \ + bkt10 = &buckets[bkt10_index]; \ + \ + mbuf11 = pkts[pkt11_index]; \ + key11 = RTE_MBUF_METADATA_UINT8_PTR(mbuf11, key_offset); \ + sig11 = (uint64_t) f_hash(key11, t->key_mask, key_size, seed); \ + bkt11_index = sig11 & bucket_mask; \ + bkt11 = &buckets[bkt11_index]; \ + \ + rte_prefetch0(bkt10); \ + rte_prefetch0(bkt11); \ + \ + g10 = &g[pkt10_index]; \ + g10->sig = sig10; \ + g10->bkt = bkt10; \ + \ + g11 = &g[pkt11_index]; \ + g11->sig = sig11; \ + g11->bkt = bkt11; \ +} + +#define lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many)\ +{ \ + struct grinder *g20, *g21; \ + uint64_t sig20, sig21; \ + struct bucket *bkt20, *bkt21; \ + uint8_t *key20, *key21, *key_mem = t->key_mem; \ + uint64_t match20, match21, match_many20, match_many21; \ + uint64_t match_pos20, match_pos21; \ + uint32_t key20_index, key21_index, key_size_shl = t->key_size_shl;\ + \ + g20 = &g[pkt20_index]; \ + sig20 = g20->sig; \ + bkt20 = g20->bkt; \ + sig20 = (sig20 >> 16) | 1LLU; \ + lookup_cmp_sig(sig20, bkt20, match20, match_many20, match_pos20);\ + match20 <<= pkt20_index; \ + match_many20 |= BUCKET_NEXT_VALID(bkt20); \ + match_many20 <<= pkt20_index; \ + key20_index = bkt20->key_pos[match_pos20]; \ + key20 = &key_mem[key20_index << key_size_shl]; \ + \ + g21 = &g[pkt21_index]; \ + sig21 = g21->sig; \ + bkt21 = g21->bkt; \ + sig21 = (sig21 >> 16) | 1LLU; \ + lookup_cmp_sig(sig21, bkt21, match21, match_many21, match_pos21);\ + match21 <<= pkt21_index; \ + match_many21 |= BUCKET_NEXT_VALID(bkt21); \ + match_many21 <<= pkt21_index; \ + key21_index = bkt21->key_pos[match_pos21]; \ + key21 = &key_mem[key21_index << key_size_shl]; \ + \ + rte_prefetch0(key20); \ + rte_prefetch0(key21); \ + \ + pkts_mask_match_many |= match_many20 | match_many21; \ + \ + g20->match = match20; \ + g20->key_index = key20_index; \ + \ + g21->match = match21; \ + g21->key_index = key21_index; \ +} + +#define lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, \ + entries) \ +{ \ + struct grinder *g30, *g31; \ + struct rte_mbuf *mbuf30, *mbuf31; \ + uint8_t *key30, *key31, *key_mem = t->key_mem; \ + uint8_t *data30, *data31, *data_mem = t->data_mem; \ + uint64_t match30, match31, match_key30, match_key31, match_keys;\ + uint32_t key30_index, key31_index; \ + uint32_t key_size_shl = t->key_size_shl; \ + uint32_t data_size_shl = t->data_size_shl; \ + \ + mbuf30 = pkts[pkt30_index]; \ + g30 = &g[pkt30_index]; \ + match30 = g30->match; \ + key30_index = g30->key_index; \ + key30 = &key_mem[key30_index << key_size_shl]; \ + lookup_cmp_key(mbuf30, key30, match_key30, t); \ + match_key30 <<= pkt30_index; \ + match_key30 &= match30; \ + data30 = &data_mem[key30_index << data_size_shl]; \ + entries[pkt30_index] = data30; \ + \ + mbuf31 = pkts[pkt31_index]; \ + g31 = &g[pkt31_index]; \ + match31 = g31->match; \ + key31_index = g31->key_index; \ + key31 = &key_mem[key31_index << key_size_shl]; \ + lookup_cmp_key(mbuf31, key31, match_key31, t); \ + match_key31 <<= pkt31_index; \ + match_key31 &= match31; \ + data31 = &data_mem[key31_index << data_size_shl]; \ + entries[pkt31_index] = data31; \ + \ + 
rte_prefetch0(data30); \ + rte_prefetch0(data31); \ + \ + match_keys = match_key30 | match_key31; \ + pkts_mask_out |= match_keys; \ +} + +/*** +* The lookup function implements a 4-stage pipeline, with each stage processing +* two different packets. The purpose of pipelined implementation is to hide the +* latency of prefetching the data structures and loosen the data dependency +* between instructions. +* +* p00 _______ p10 _______ p20 _______ p30 _______ +*----->| |----->| |----->| |----->| |-----> +* | 0 | | 1 | | 2 | | 3 | +*----->|_______|----->|_______|----->|_______|----->|_______|-----> +* p01 p11 p21 p31 +* +* The naming convention is: +* pXY = packet Y of stage X, X = 0 .. 3, Y = 0 .. 1 +* +***/ +static int rte_table_hash_ext_lookup( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *t = (struct rte_table_hash *) table; + struct grinder *g = t->grinders; + uint64_t pkt00_index, pkt01_index, pkt10_index, pkt11_index; + uint64_t pkt20_index, pkt21_index, pkt30_index, pkt31_index; + uint64_t pkts_mask_out = 0, pkts_mask_match_many = 0; + int status = 0; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_HASH_EXT_STATS_PKTS_IN_ADD(t, n_pkts_in); + + /* Cannot run the pipeline with less than 7 packets */ + if (__builtin_popcountll(pkts_mask) < 7) { + status = rte_table_hash_ext_lookup_unoptimized(table, pkts, + pkts_mask, lookup_hit_mask, entries); + RTE_TABLE_HASH_EXT_STATS_PKTS_LOOKUP_MISS(t, n_pkts_in - + __builtin_popcountll(*lookup_hit_mask)); + return status; + } + + /* Pipeline stage 0 */ + lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index); + + /* Pipeline feed */ + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index); + + /* Pipeline stage 1 */ + lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index); + + /* Pipeline feed */ + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index); + + /* Pipeline stage 1 */ + lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index); + + /* Pipeline stage 2 */ + lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many); + + /* + * Pipeline run + * + */ + for ( ; pkts_mask; ) { + /* Pipeline feed */ + pkt30_index = pkt20_index; + pkt31_index = pkt21_index; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask, + pkt00_index, pkt01_index); + + /* Pipeline stage 1 */ + lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index); + + /* Pipeline stage 2 */ + lookup2_stage2(t, g, pkt20_index, pkt21_index, + pkts_mask_match_many); + + /* Pipeline stage 3 */ + lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, + pkts_mask_out, entries); + } + + /* Pipeline feed */ + pkt30_index = pkt20_index; + pkt31_index = pkt21_index; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 1 */ + lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index); + + /* Pipeline stage 2 */ + lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many); + + /* Pipeline stage 3 */ + lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, + entries); + + /* 
Pipeline feed */ + pkt30_index = pkt20_index; + pkt31_index = pkt21_index; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + + /* Pipeline stage 2 */ + lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many); + + /* Pipeline stage 3 */ + lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, + entries); + + /* Pipeline feed */ + pkt30_index = pkt20_index; + pkt31_index = pkt21_index; + + /* Pipeline stage 3 */ + lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, + entries); + + /* Slow path */ + pkts_mask_match_many &= ~pkts_mask_out; + if (pkts_mask_match_many) { + uint64_t pkts_mask_out_slow = 0; + + status = rte_table_hash_ext_lookup_unoptimized(table, pkts, + pkts_mask_match_many, &pkts_mask_out_slow, entries); + pkts_mask_out |= pkts_mask_out_slow; + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_EXT_STATS_PKTS_LOOKUP_MISS(t, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return status; +} + +static int +rte_table_hash_ext_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_hash *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_hash_ext_ops = { + .f_create = rte_table_hash_ext_create, + .f_free = rte_table_hash_ext_free, + .f_add = rte_table_hash_ext_entry_add, + .f_delete = rte_table_hash_ext_entry_delete, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_ext_lookup, + .f_stats = rte_table_hash_ext_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_hash_key16.c b/src/spdk/dpdk/lib/librte_table/rte_table_hash_key16.c new file mode 100644 index 00000000..2cca1c92 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_hash_key16.c @@ -0,0 +1,1170 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_table_hash.h" +#include "rte_lru.h" + +#define KEY_SIZE 16 + +#define KEYS_PER_BUCKET 4 + +#define RTE_BUCKET_ENTRY_VALID 0x1LLU + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_HASH_KEY16_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_HASH_KEY16_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_HASH_KEY16_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_HASH_KEY16_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct rte_bucket_4_16 { + /* Cache line 0 */ + uint64_t signature[4 + 1]; + uint64_t lru_list; + struct rte_bucket_4_16 *next; + uint64_t next_valid; + + /* Cache line 1 */ + uint64_t key[4][2]; + + /* Cache line 2 */ + uint8_t data[0]; +}; + +struct rte_table_hash { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t n_buckets; + uint32_t key_size; + uint32_t entry_size; + uint32_t bucket_size; + uint32_t key_offset; + uint64_t key_mask[2]; + rte_table_hash_op_hash f_hash; + uint64_t seed; + + /* Extendible buckets */ + uint32_t n_buckets_ext; + uint32_t stack_pos; + uint32_t *stack; + + /* Lookup table */ + uint8_t memory[0] __rte_cache_aligned; +}; + +static int +keycmp(void *a, void *b, void *b_mask) +{ + uint64_t *a64 = a, *b64 = b, *b_mask64 = b_mask; + + return (a64[0] != (b64[0] & b_mask64[0])) || + (a64[1] != (b64[1] & b_mask64[1])); +} + +static void +keycpy(void *dst, void *src, void *src_mask) +{ + uint64_t *dst64 = dst, *src64 = src, 
*src_mask64 = src_mask; + + dst64[0] = src64[0] & src_mask64[0]; + dst64[1] = src64[1] & src_mask64[1]; +} + +static int +check_params_create(struct rte_table_hash_params *params) +{ + /* name */ + if (params->name == NULL) { + RTE_LOG(ERR, TABLE, "%s: name invalid value\n", __func__); + return -EINVAL; + } + + /* key_size */ + if (params->key_size != KEY_SIZE) { + RTE_LOG(ERR, TABLE, "%s: key_size invalid value\n", __func__); + return -EINVAL; + } + + /* n_keys */ + if (params->n_keys == 0) { + RTE_LOG(ERR, TABLE, "%s: n_keys is zero\n", __func__); + return -EINVAL; + } + + /* n_buckets */ + if ((params->n_buckets == 0) || + (!rte_is_power_of_2(params->n_buckets))) { + RTE_LOG(ERR, TABLE, "%s: n_buckets invalid value\n", __func__); + return -EINVAL; + } + + /* f_hash */ + if (params->f_hash == NULL) { + RTE_LOG(ERR, TABLE, "%s: f_hash function pointer is NULL\n", + __func__); + return -EINVAL; + } + + return 0; +} + +static void * +rte_table_hash_create_key16_lru(void *params, + int socket_id, + uint32_t entry_size) +{ + struct rte_table_hash_params *p = params; + struct rte_table_hash *f; + uint64_t bucket_size, total_size; + uint32_t n_buckets, i; + + /* Check input parameters */ + if ((check_params_create(p) != 0) || + ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) || + ((sizeof(struct rte_bucket_4_16) % 64) != 0)) + return NULL; + + /* + * Table dimensioning + * + * Objective: Pick the number of buckets (n_buckets) so that there a chance + * to store n_keys keys in the table. + * + * Note: Since the buckets do not get extended, it is not possible to + * guarantee that n_keys keys can be stored in the table at any time. In the + * worst case scenario when all the n_keys fall into the same bucket, only + * a maximum of KEYS_PER_BUCKET keys will be stored in the table. This case + * defeats the purpose of the hash table. It indicates unsuitable f_hash or + * n_keys to n_buckets ratio. 
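+	 *
+	 * Illustrative check (example figures, not part of the original
+	 * comment): for n_keys = 1000 and KEYS_PER_BUCKET = 4, the minimum
+	 * below is (1000 + 3) / 4 = 250, which the code then rounds up to
+	 * the next power of two (256) before taking the larger of that and
+	 * the requested n_buckets.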
+ * + * MIN(n_buckets) = (n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET + */ + n_buckets = rte_align32pow2( + (p->n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET); + n_buckets = RTE_MAX(n_buckets, p->n_buckets); + + /* Memory allocation */ + bucket_size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_bucket_4_16) + + KEYS_PER_BUCKET * entry_size); + total_size = sizeof(struct rte_table_hash) + n_buckets * bucket_size; + + if (total_size > SIZE_MAX) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + f = rte_zmalloc_socket(p->name, + (size_t)total_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + RTE_LOG(INFO, TABLE, "%s: Hash table %s memory footprint " + "is %" PRIu64 " bytes\n", + __func__, p->name, total_size); + + /* Memory initialization */ + f->n_buckets = n_buckets; + f->key_size = KEY_SIZE; + f->entry_size = entry_size; + f->bucket_size = bucket_size; + f->key_offset = p->key_offset; + f->f_hash = p->f_hash; + f->seed = p->seed; + + if (p->key_mask != NULL) { + f->key_mask[0] = ((uint64_t *)p->key_mask)[0]; + f->key_mask[1] = ((uint64_t *)p->key_mask)[1]; + } else { + f->key_mask[0] = 0xFFFFFFFFFFFFFFFFLLU; + f->key_mask[1] = 0xFFFFFFFFFFFFFFFFLLU; + } + + for (i = 0; i < n_buckets; i++) { + struct rte_bucket_4_16 *bucket; + + bucket = (struct rte_bucket_4_16 *) &f->memory[i * + f->bucket_size]; + lru_init(bucket); + } + + return f; +} + +static int +rte_table_hash_free_key16_lru(void *table) +{ + struct rte_table_hash *f = table; + + /* Check input parameters */ + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + rte_free(f); + return 0; +} + +static int +rte_table_hash_entry_add_key16_lru( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_16 *bucket; + uint64_t signature, pos; + uint32_t bucket_index, i; + + signature = f->f_hash(key, f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket = (struct rte_bucket_4_16 *) + &f->memory[bucket_index * f->bucket_size]; + signature |= RTE_BUCKET_ENTRY_VALID; + + /* Key is present in the bucket */ + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if ((bucket_signature == signature) && + (keycmp(bucket_key, key, f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + memcpy(bucket_data, entry, f->entry_size); + lru_update(bucket, i); + *key_found = 1; + *entry_ptr = (void *) bucket_data; + return 0; + } + } + + /* Key is not present in the bucket */ + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if (bucket_signature == 0) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + bucket->signature[i] = signature; + keycpy(bucket_key, key, f->key_mask); + memcpy(bucket_data, entry, f->entry_size); + lru_update(bucket, i); + *key_found = 0; + *entry_ptr = (void *) bucket_data; + + return 0; + } + } + + /* Bucket full: replace LRU entry */ + pos = lru_pos(bucket); + bucket->signature[pos] = signature; + keycpy(&bucket->key[pos], key, f->key_mask); + memcpy(&bucket->data[pos * f->entry_size], entry, 
f->entry_size); + lru_update(bucket, pos); + *key_found = 0; + *entry_ptr = (void *) &bucket->data[pos * f->entry_size]; + + return 0; +} + +static int +rte_table_hash_entry_delete_key16_lru( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_16 *bucket; + uint64_t signature; + uint32_t bucket_index, i; + + signature = f->f_hash(key, f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket = (struct rte_bucket_4_16 *) + &f->memory[bucket_index * f->bucket_size]; + signature |= RTE_BUCKET_ENTRY_VALID; + + /* Key is present in the bucket */ + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if ((bucket_signature == signature) && + (keycmp(bucket_key, key, f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + bucket->signature[i] = 0; + *key_found = 1; + if (entry) + memcpy(entry, bucket_data, f->entry_size); + return 0; + } + } + + /* Key is not present in the bucket */ + *key_found = 0; + return 0; +} + +static void * +rte_table_hash_create_key16_ext(void *params, + int socket_id, + uint32_t entry_size) +{ + struct rte_table_hash_params *p = params; + struct rte_table_hash *f; + uint64_t bucket_size, stack_size, total_size; + uint32_t n_buckets_ext, i; + + /* Check input parameters */ + if ((check_params_create(p) != 0) || + ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) || + ((sizeof(struct rte_bucket_4_16) % 64) != 0)) + return NULL; + + /* + * Table dimensioning + * + * Objective: Pick the number of bucket extensions (n_buckets_ext) so that + * it is guaranteed that n_keys keys can be stored in the table at any time. + * + * The worst case scenario takes place when all the n_keys keys fall into + * the same bucket. Actually, due to the KEYS_PER_BUCKET scheme, the worst + * case takes place when (n_keys - KEYS_PER_BUCKET + 1) keys fall into the + * same bucket, while the remaining (KEYS_PER_BUCKET - 1) keys each fall + * into a different bucket. This case defeats the purpose of the hash table. + * It indicates unsuitable f_hash or n_keys to n_buckets ratio. 
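+	 *
+	 * As a worked instance (numbers chosen for illustration, not taken
+	 * from the original comment): n_keys = 4096 with KEYS_PER_BUCKET = 4
+	 * makes the formula below allocate 4096 / 4 + 4 - 1 = 1027 extension
+	 * buckets.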
+ * + * n_buckets_ext = n_keys / KEYS_PER_BUCKET + KEYS_PER_BUCKET - 1 + */ + n_buckets_ext = p->n_keys / KEYS_PER_BUCKET + KEYS_PER_BUCKET - 1; + + /* Memory allocation */ + bucket_size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_bucket_4_16) + + KEYS_PER_BUCKET * entry_size); + stack_size = RTE_CACHE_LINE_ROUNDUP(n_buckets_ext * sizeof(uint32_t)); + total_size = sizeof(struct rte_table_hash) + + (p->n_buckets + n_buckets_ext) * bucket_size + stack_size; + if (total_size > SIZE_MAX) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + f = rte_zmalloc_socket(p->name, + (size_t)total_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + RTE_LOG(INFO, TABLE, "%s: Hash table %s memory footprint " + "is %" PRIu64 " bytes\n", + __func__, p->name, total_size); + + /* Memory initialization */ + f->n_buckets = p->n_buckets; + f->key_size = KEY_SIZE; + f->entry_size = entry_size; + f->bucket_size = bucket_size; + f->key_offset = p->key_offset; + f->f_hash = p->f_hash; + f->seed = p->seed; + + f->n_buckets_ext = n_buckets_ext; + f->stack_pos = n_buckets_ext; + f->stack = (uint32_t *) + &f->memory[(p->n_buckets + n_buckets_ext) * f->bucket_size]; + + if (p->key_mask != NULL) { + f->key_mask[0] = (((uint64_t *)p->key_mask)[0]); + f->key_mask[1] = (((uint64_t *)p->key_mask)[1]); + } else { + f->key_mask[0] = 0xFFFFFFFFFFFFFFFFLLU; + f->key_mask[1] = 0xFFFFFFFFFFFFFFFFLLU; + } + + for (i = 0; i < n_buckets_ext; i++) + f->stack[i] = i; + + return f; +} + +static int +rte_table_hash_free_key16_ext(void *table) +{ + struct rte_table_hash *f = table; + + /* Check input parameters */ + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + rte_free(f); + return 0; +} + +static int +rte_table_hash_entry_add_key16_ext( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_16 *bucket0, *bucket, *bucket_prev; + uint64_t signature; + uint32_t bucket_index, i; + + signature = f->f_hash(key, f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket0 = (struct rte_bucket_4_16 *) + &f->memory[bucket_index * f->bucket_size]; + signature |= RTE_BUCKET_ENTRY_VALID; + + /* Key is present in the bucket */ + for (bucket = bucket0; bucket != NULL; bucket = bucket->next) + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if ((bucket_signature == signature) && + (keycmp(bucket_key, key, f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * + f->entry_size]; + + memcpy(bucket_data, entry, f->entry_size); + *key_found = 1; + *entry_ptr = (void *) bucket_data; + return 0; + } + } + + /* Key is not present in the bucket */ + for (bucket_prev = NULL, bucket = bucket0; bucket != NULL; + bucket_prev = bucket, bucket = bucket->next) + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if (bucket_signature == 0) { + uint8_t *bucket_data = &bucket->data[i * + f->entry_size]; + + bucket->signature[i] = signature; + keycpy(bucket_key, key, f->key_mask); + memcpy(bucket_data, entry, f->entry_size); + *key_found = 0; + *entry_ptr = (void *) 
bucket_data; + + return 0; + } + } + + /* Bucket full: extend bucket */ + if (f->stack_pos > 0) { + bucket_index = f->stack[--f->stack_pos]; + + bucket = (struct rte_bucket_4_16 *) &f->memory[(f->n_buckets + + bucket_index) * f->bucket_size]; + bucket_prev->next = bucket; + bucket_prev->next_valid = 1; + + bucket->signature[0] = signature; + keycpy(&bucket->key[0], key, f->key_mask); + memcpy(&bucket->data[0], entry, f->entry_size); + *key_found = 0; + *entry_ptr = (void *) &bucket->data[0]; + return 0; + } + + return -ENOSPC; +} + +static int +rte_table_hash_entry_delete_key16_ext( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_16 *bucket0, *bucket, *bucket_prev; + uint64_t signature; + uint32_t bucket_index, i; + + signature = f->f_hash(key, f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket0 = (struct rte_bucket_4_16 *) + &f->memory[bucket_index * f->bucket_size]; + signature |= RTE_BUCKET_ENTRY_VALID; + + /* Key is present in the bucket */ + for (bucket_prev = NULL, bucket = bucket0; bucket != NULL; + bucket_prev = bucket, bucket = bucket->next) + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if ((bucket_signature == signature) && + (keycmp(bucket_key, key, f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * + f->entry_size]; + + bucket->signature[i] = 0; + *key_found = 1; + if (entry) + memcpy(entry, bucket_data, f->entry_size); + + if ((bucket->signature[0] == 0) && + (bucket->signature[1] == 0) && + (bucket->signature[2] == 0) && + (bucket->signature[3] == 0) && + (bucket_prev != NULL)) { + bucket_prev->next = bucket->next; + bucket_prev->next_valid = + bucket->next_valid; + + memset(bucket, 0, + sizeof(struct rte_bucket_4_16)); + bucket_index = (((uint8_t *)bucket - + (uint8_t *)f->memory)/f->bucket_size) - f->n_buckets; + f->stack[f->stack_pos++] = bucket_index; + } + + return 0; + } + } + + /* Key is not present in the bucket */ + *key_found = 0; + return 0; +} + +#define lookup_key16_cmp(key_in, bucket, pos, f) \ +{ \ + uint64_t xor[4][2], or[4], signature[4], k[2]; \ + \ + k[0] = key_in[0] & f->key_mask[0]; \ + k[1] = key_in[1] & f->key_mask[1]; \ + signature[0] = (~bucket->signature[0]) & 1; \ + signature[1] = (~bucket->signature[1]) & 1; \ + signature[2] = (~bucket->signature[2]) & 1; \ + signature[3] = (~bucket->signature[3]) & 1; \ + \ + xor[0][0] = k[0] ^ bucket->key[0][0]; \ + xor[0][1] = k[1] ^ bucket->key[0][1]; \ + \ + xor[1][0] = k[0] ^ bucket->key[1][0]; \ + xor[1][1] = k[1] ^ bucket->key[1][1]; \ + \ + xor[2][0] = k[0] ^ bucket->key[2][0]; \ + xor[2][1] = k[1] ^ bucket->key[2][1]; \ + \ + xor[3][0] = k[0] ^ bucket->key[3][0]; \ + xor[3][1] = k[1] ^ bucket->key[3][1]; \ + \ + or[0] = xor[0][0] | xor[0][1] | signature[0]; \ + or[1] = xor[1][0] | xor[1][1] | signature[1]; \ + or[2] = xor[2][0] | xor[2][1] | signature[2]; \ + or[3] = xor[3][0] | xor[3][1] | signature[3]; \ + \ + pos = 4; \ + if (or[0] == 0) \ + pos = 0; \ + if (or[1] == 0) \ + pos = 1; \ + if (or[2] == 0) \ + pos = 2; \ + if (or[3] == 0) \ + pos = 3; \ +} + +#define lookup1_stage0(pkt0_index, mbuf0, pkts, pkts_mask, f) \ +{ \ + uint64_t pkt_mask; \ + uint32_t key_offset = f->key_offset;\ + \ + pkt0_index = __builtin_ctzll(pkts_mask); \ + pkt_mask = 1LLU << pkt0_index; \ + pkts_mask &= ~pkt_mask; \ + \ + mbuf0 = pkts[pkt0_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf0, 
key_offset));\ +} + +#define lookup1_stage1(mbuf1, bucket1, f) \ +{ \ + uint64_t *key; \ + uint64_t signature = 0; \ + uint32_t bucket_index; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf1, f->key_offset);\ + signature = f->f_hash(key, f->key_mask, KEY_SIZE, f->seed); \ + \ + bucket_index = signature & (f->n_buckets - 1); \ + bucket1 = (struct rte_bucket_4_16 *) \ + &f->memory[bucket_index * f->bucket_size]; \ + rte_prefetch0(bucket1); \ + rte_prefetch0((void *)(((uintptr_t) bucket1) + RTE_CACHE_LINE_SIZE));\ +} + +#define lookup1_stage2_lru(pkt2_index, mbuf2, bucket2, \ + pkts_mask_out, entries, f) \ +{ \ + void *a; \ + uint64_t pkt_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf2, f->key_offset);\ + lookup_key16_cmp(key, bucket2, pos, f); \ + \ + pkt_mask = (bucket2->signature[pos] & 1LLU) << pkt2_index;\ + pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket2->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt2_index] = a; \ + lru_update(bucket2, pos); \ +} + +#define lookup1_stage2_ext(pkt2_index, mbuf2, bucket2, pkts_mask_out, entries, \ + buckets_mask, buckets, keys, f) \ +{ \ + struct rte_bucket_4_16 *bucket_next; \ + void *a; \ + uint64_t pkt_mask, bucket_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf2, f->key_offset);\ + lookup_key16_cmp(key, bucket2, pos, f); \ + \ + pkt_mask = (bucket2->signature[pos] & 1LLU) << pkt2_index;\ + pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket2->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt2_index] = a; \ + \ + bucket_mask = (~pkt_mask) & (bucket2->next_valid << pkt2_index);\ + buckets_mask |= bucket_mask; \ + bucket_next = bucket2->next; \ + buckets[pkt2_index] = bucket_next; \ + keys[pkt2_index] = key; \ +} + +#define lookup_grinder(pkt_index, buckets, keys, pkts_mask_out, entries,\ + buckets_mask, f) \ +{ \ + struct rte_bucket_4_16 *bucket, *bucket_next; \ + void *a; \ + uint64_t pkt_mask, bucket_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + bucket = buckets[pkt_index]; \ + key = keys[pkt_index]; \ + lookup_key16_cmp(key, bucket, pos, f); \ + \ + pkt_mask = (bucket->signature[pos] & 1LLU) << pkt_index;\ + pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt_index] = a; \ + \ + bucket_mask = (~pkt_mask) & (bucket->next_valid << pkt_index);\ + buckets_mask |= bucket_mask; \ + bucket_next = bucket->next; \ + rte_prefetch0(bucket_next); \ + rte_prefetch0((void *)(((uintptr_t) bucket_next) + RTE_CACHE_LINE_SIZE));\ + buckets[pkt_index] = bucket_next; \ + keys[pkt_index] = key; \ +} + +#define lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01,\ + pkts, pkts_mask, f) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + uint32_t key_offset = f->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + \ + mbuf00 = pkts[pkt00_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset));\ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + \ + mbuf01 = pkts[pkt01_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset));\ +} + +#define lookup2_stage0_with_odd_support(pkt00_index, pkt01_index,\ + mbuf00, mbuf01, pkts, pkts_mask, f) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + uint32_t key_offset = f->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 
1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + \ + mbuf00 = pkts[pkt00_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset)); \ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + if (pkts_mask == 0) \ + pkt01_index = pkt00_index; \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + \ + mbuf01 = pkts[pkt01_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset)); \ +} + +#define lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f) \ +{ \ + uint64_t *key10, *key11; \ + uint64_t signature10, signature11; \ + uint32_t bucket10_index, bucket11_index; \ + \ + key10 = RTE_MBUF_METADATA_UINT64_PTR(mbuf10, f->key_offset);\ + signature10 = f->f_hash(key10, f->key_mask, KEY_SIZE, f->seed);\ + bucket10_index = signature10 & (f->n_buckets - 1); \ + bucket10 = (struct rte_bucket_4_16 *) \ + &f->memory[bucket10_index * f->bucket_size]; \ + rte_prefetch0(bucket10); \ + rte_prefetch0((void *)(((uintptr_t) bucket10) + RTE_CACHE_LINE_SIZE));\ + \ + key11 = RTE_MBUF_METADATA_UINT64_PTR(mbuf11, f->key_offset);\ + signature11 = f->f_hash(key11, f->key_mask, KEY_SIZE, f->seed);\ + bucket11_index = signature11 & (f->n_buckets - 1); \ + bucket11 = (struct rte_bucket_4_16 *) \ + &f->memory[bucket11_index * f->bucket_size]; \ + rte_prefetch0(bucket11); \ + rte_prefetch0((void *)(((uintptr_t) bucket11) + RTE_CACHE_LINE_SIZE));\ +} + +#define lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21,\ + bucket20, bucket21, pkts_mask_out, entries, f) \ +{ \ + void *a20, *a21; \ + uint64_t pkt20_mask, pkt21_mask; \ + uint64_t *key20, *key21; \ + uint32_t pos20, pos21; \ + \ + key20 = RTE_MBUF_METADATA_UINT64_PTR(mbuf20, f->key_offset);\ + key21 = RTE_MBUF_METADATA_UINT64_PTR(mbuf21, f->key_offset);\ + \ + lookup_key16_cmp(key20, bucket20, pos20, f); \ + lookup_key16_cmp(key21, bucket21, pos21, f); \ + \ + pkt20_mask = (bucket20->signature[pos20] & 1LLU) << pkt20_index;\ + pkt21_mask = (bucket21->signature[pos21] & 1LLU) << pkt21_index;\ + pkts_mask_out |= pkt20_mask | pkt21_mask; \ + \ + a20 = (void *) &bucket20->data[pos20 * f->entry_size]; \ + a21 = (void *) &bucket21->data[pos21 * f->entry_size]; \ + rte_prefetch0(a20); \ + rte_prefetch0(a21); \ + entries[pkt20_index] = a20; \ + entries[pkt21_index] = a21; \ + lru_update(bucket20, pos20); \ + lru_update(bucket21, pos21); \ +} + +#define lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, bucket20, \ + bucket21, pkts_mask_out, entries, buckets_mask, buckets, keys, f) \ +{ \ + struct rte_bucket_4_16 *bucket20_next, *bucket21_next; \ + void *a20, *a21; \ + uint64_t pkt20_mask, pkt21_mask, bucket20_mask, bucket21_mask;\ + uint64_t *key20, *key21; \ + uint32_t pos20, pos21; \ + \ + key20 = RTE_MBUF_METADATA_UINT64_PTR(mbuf20, f->key_offset);\ + key21 = RTE_MBUF_METADATA_UINT64_PTR(mbuf21, f->key_offset);\ + \ + lookup_key16_cmp(key20, bucket20, pos20, f); \ + lookup_key16_cmp(key21, bucket21, pos21, f); \ + \ + pkt20_mask = (bucket20->signature[pos20] & 1LLU) << pkt20_index;\ + pkt21_mask = (bucket21->signature[pos21] & 1LLU) << pkt21_index;\ + pkts_mask_out |= pkt20_mask | pkt21_mask; \ + \ + a20 = (void *) &bucket20->data[pos20 * f->entry_size]; \ + a21 = (void *) &bucket21->data[pos21 * f->entry_size]; \ + rte_prefetch0(a20); \ + rte_prefetch0(a21); \ + entries[pkt20_index] = a20; \ + entries[pkt21_index] = a21; \ + \ + bucket20_mask = (~pkt20_mask) & (bucket20->next_valid << pkt20_index);\ + bucket21_mask = (~pkt21_mask) & (bucket21->next_valid << pkt21_index);\ + buckets_mask |= 
bucket20_mask | bucket21_mask; \ + bucket20_next = bucket20->next; \ + bucket21_next = bucket21->next; \ + buckets[pkt20_index] = bucket20_next; \ + buckets[pkt21_index] = bucket21_next; \ + keys[pkt20_index] = key20; \ + keys[pkt21_index] = key21; \ +} + +static int +rte_table_hash_lookup_key16_lru( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *f = (struct rte_table_hash *) table; + struct rte_bucket_4_16 *bucket10, *bucket11, *bucket20, *bucket21; + struct rte_mbuf *mbuf00, *mbuf01, *mbuf10, *mbuf11, *mbuf20, *mbuf21; + uint32_t pkt00_index, pkt01_index, pkt10_index; + uint32_t pkt11_index, pkt20_index, pkt21_index; + uint64_t pkts_mask_out = 0; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + + RTE_TABLE_HASH_KEY16_STATS_PKTS_IN_ADD(f, n_pkts_in); + + /* Cannot run the pipeline with less than 5 packets */ + if (__builtin_popcountll(pkts_mask) < 5) { + for ( ; pkts_mask; ) { + struct rte_bucket_4_16 *bucket; + struct rte_mbuf *mbuf; + uint32_t pkt_index; + + lookup1_stage0(pkt_index, mbuf, pkts, pkts_mask, f); + lookup1_stage1(mbuf, bucket, f); + lookup1_stage2_lru(pkt_index, mbuf, bucket, + pkts_mask_out, entries, f); + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY16_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - + __builtin_popcountll(pkts_mask_out)); + return 0; + } + + /* + * Pipeline fill + * + */ + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline feed */ + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* + * Pipeline run + * + */ + for ( ; pkts_mask; ) { + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0_with_odd_support(pkt00_index, pkt01_index, + mbuf00, mbuf01, pkts, pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, f); + } + + /* + * Pipeline flush + * + */ + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, f); + + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, f); + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY16_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - + __builtin_popcountll(pkts_mask_out)); + return 0; +} /* lookup LRU */ + +static int 
+rte_table_hash_lookup_key16_ext( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *f = (struct rte_table_hash *) table; + struct rte_bucket_4_16 *bucket10, *bucket11, *bucket20, *bucket21; + struct rte_mbuf *mbuf00, *mbuf01, *mbuf10, *mbuf11, *mbuf20, *mbuf21; + uint32_t pkt00_index, pkt01_index, pkt10_index; + uint32_t pkt11_index, pkt20_index, pkt21_index; + uint64_t pkts_mask_out = 0, buckets_mask = 0; + struct rte_bucket_4_16 *buckets[RTE_PORT_IN_BURST_SIZE_MAX]; + uint64_t *keys[RTE_PORT_IN_BURST_SIZE_MAX]; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + + RTE_TABLE_HASH_KEY16_STATS_PKTS_IN_ADD(f, n_pkts_in); + + /* Cannot run the pipeline with less than 5 packets */ + if (__builtin_popcountll(pkts_mask) < 5) { + for ( ; pkts_mask; ) { + struct rte_bucket_4_16 *bucket; + struct rte_mbuf *mbuf; + uint32_t pkt_index; + + lookup1_stage0(pkt_index, mbuf, pkts, pkts_mask, f); + lookup1_stage1(mbuf, bucket, f); + lookup1_stage2_ext(pkt_index, mbuf, bucket, + pkts_mask_out, entries, buckets_mask, + buckets, keys, f); + } + + goto grind_next_buckets; + } + + /* + * Pipeline fill + * + */ + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline feed */ + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* + * Pipeline run + * + */ + for ( ; pkts_mask; ) { + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0_with_odd_support(pkt00_index, pkt01_index, + mbuf00, mbuf01, pkts, pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + } + + /* + * Pipeline flush + * + */ + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + +grind_next_buckets: + /* Grind next buckets */ + for ( ; buckets_mask; ) { + uint64_t buckets_mask_next = 0; + + for ( ; buckets_mask; ) { + uint64_t pkt_mask; + uint32_t pkt_index; + + pkt_index = __builtin_ctzll(buckets_mask); + pkt_mask = 1LLU << pkt_index; + buckets_mask &= ~pkt_mask; + + lookup_grinder(pkt_index, 
buckets, keys, pkts_mask_out, + entries, buckets_mask_next, f); + } + + buckets_mask = buckets_mask_next; + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY16_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - + __builtin_popcountll(pkts_mask_out)); + return 0; +} /* lookup EXT */ + +static int +rte_table_hash_key16_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_hash *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_hash_key16_lru_ops = { + .f_create = rte_table_hash_create_key16_lru, + .f_free = rte_table_hash_free_key16_lru, + .f_add = rte_table_hash_entry_add_key16_lru, + .f_delete = rte_table_hash_entry_delete_key16_lru, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_lookup_key16_lru, + .f_stats = rte_table_hash_key16_stats_read, +}; + +struct rte_table_ops rte_table_hash_key16_ext_ops = { + .f_create = rte_table_hash_create_key16_ext, + .f_free = rte_table_hash_free_key16_ext, + .f_add = rte_table_hash_entry_add_key16_ext, + .f_delete = rte_table_hash_entry_delete_key16_ext, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_lookup_key16_ext, + .f_stats = rte_table_hash_key16_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_hash_key32.c b/src/spdk/dpdk/lib/librte_table/rte_table_hash_key32.c new file mode 100644 index 00000000..a137c502 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_hash_key32.c @@ -0,0 +1,1203 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_table_hash.h" +#include "rte_lru.h" + +#define KEY_SIZE 32 + +#define KEYS_PER_BUCKET 4 + +#define RTE_BUCKET_ENTRY_VALID 0x1LLU + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_HASH_KEY32_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_HASH_KEY32_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_HASH_KEY32_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_HASH_KEY32_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct rte_bucket_4_32 { + /* Cache line 0 */ + uint64_t signature[4 + 1]; + uint64_t lru_list; + struct rte_bucket_4_32 *next; + uint64_t next_valid; + + /* Cache lines 1 and 2 */ + uint64_t key[4][4]; + + /* Cache line 3 */ + uint8_t data[0]; +}; + +struct rte_table_hash { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t n_buckets; + uint32_t key_size; + uint32_t entry_size; + uint32_t bucket_size; + uint32_t key_offset; + uint64_t key_mask[4]; + rte_table_hash_op_hash f_hash; + uint64_t seed; + + /* Extendible buckets */ + uint32_t n_buckets_ext; + uint32_t stack_pos; + uint32_t *stack; + + /* Lookup table */ + uint8_t memory[0] __rte_cache_aligned; +}; + +static int +keycmp(void *a, void *b, void *b_mask) +{ + uint64_t *a64 = a, *b64 = b, *b_mask64 = b_mask; + + return (a64[0] != (b64[0] & b_mask64[0])) || + (a64[1] != (b64[1] & b_mask64[1])) || + (a64[2] != (b64[2] & b_mask64[2])) || + (a64[3] != (b64[3] & b_mask64[3])); +} + +static void +keycpy(void *dst, void *src, void *src_mask) +{ + uint64_t *dst64 = dst, *src64 = src, *src_mask64 = src_mask; + + dst64[0] = src64[0] & src_mask64[0]; + dst64[1] = src64[1] & src_mask64[1]; + dst64[2] = src64[2] & src_mask64[2]; + dst64[3] = src64[3] & src_mask64[3]; 
+} + +static int +check_params_create(struct rte_table_hash_params *params) +{ + /* name */ + if (params->name == NULL) { + RTE_LOG(ERR, TABLE, "%s: name invalid value\n", __func__); + return -EINVAL; + } + + /* key_size */ + if (params->key_size != KEY_SIZE) { + RTE_LOG(ERR, TABLE, "%s: key_size invalid value\n", __func__); + return -EINVAL; + } + + /* n_keys */ + if (params->n_keys == 0) { + RTE_LOG(ERR, TABLE, "%s: n_keys is zero\n", __func__); + return -EINVAL; + } + + /* n_buckets */ + if ((params->n_buckets == 0) || + (!rte_is_power_of_2(params->n_buckets))) { + RTE_LOG(ERR, TABLE, "%s: n_buckets invalid value\n", __func__); + return -EINVAL; + } + + /* f_hash */ + if (params->f_hash == NULL) { + RTE_LOG(ERR, TABLE, "%s: f_hash function pointer is NULL\n", + __func__); + return -EINVAL; + } + + return 0; +} + +static void * +rte_table_hash_create_key32_lru(void *params, + int socket_id, + uint32_t entry_size) +{ + struct rte_table_hash_params *p = params; + struct rte_table_hash *f; + uint64_t bucket_size, total_size; + uint32_t n_buckets, i; + + /* Check input parameters */ + if ((check_params_create(p) != 0) || + ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) || + ((sizeof(struct rte_bucket_4_32) % 64) != 0)) + return NULL; + + /* + * Table dimensioning + * + * Objective: Pick the number of buckets (n_buckets) so that there a chance + * to store n_keys keys in the table. + * + * Note: Since the buckets do not get extended, it is not possible to + * guarantee that n_keys keys can be stored in the table at any time. In the + * worst case scenario when all the n_keys fall into the same bucket, only + * a maximum of KEYS_PER_BUCKET keys will be stored in the table. This case + * defeats the purpose of the hash table. It indicates unsuitable f_hash or + * n_keys to n_buckets ratio. 
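+ * As a worked example: n_keys = 2^20 and KEYS_PER_BUCKET = 4 yield a
+ * minimum of 2^18 = 262144 buckets, which rte_align32pow2() leaves
+ * unchanged since it is already a power of two.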
+ * + * MIN(n_buckets) = (n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET + */ + n_buckets = rte_align32pow2( + (p->n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET); + n_buckets = RTE_MAX(n_buckets, p->n_buckets); + + /* Memory allocation */ + bucket_size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_bucket_4_32) + + KEYS_PER_BUCKET * entry_size); + total_size = sizeof(struct rte_table_hash) + n_buckets * bucket_size; + if (total_size > SIZE_MAX) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + f = rte_zmalloc_socket(p->name, + (size_t)total_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + RTE_LOG(INFO, TABLE, + "%s: Hash table %s memory footprint " + "is %" PRIu64 " bytes\n", + __func__, p->name, total_size); + + /* Memory initialization */ + f->n_buckets = n_buckets; + f->key_size = KEY_SIZE; + f->entry_size = entry_size; + f->bucket_size = bucket_size; + f->key_offset = p->key_offset; + f->f_hash = p->f_hash; + f->seed = p->seed; + + if (p->key_mask != NULL) { + f->key_mask[0] = ((uint64_t *)p->key_mask)[0]; + f->key_mask[1] = ((uint64_t *)p->key_mask)[1]; + f->key_mask[2] = ((uint64_t *)p->key_mask)[2]; + f->key_mask[3] = ((uint64_t *)p->key_mask)[3]; + } else { + f->key_mask[0] = 0xFFFFFFFFFFFFFFFFLLU; + f->key_mask[1] = 0xFFFFFFFFFFFFFFFFLLU; + f->key_mask[2] = 0xFFFFFFFFFFFFFFFFLLU; + f->key_mask[3] = 0xFFFFFFFFFFFFFFFFLLU; + } + + for (i = 0; i < n_buckets; i++) { + struct rte_bucket_4_32 *bucket; + + bucket = (struct rte_bucket_4_32 *) &f->memory[i * + f->bucket_size]; + bucket->lru_list = 0x0000000100020003LLU; + } + + return f; +} + +static int +rte_table_hash_free_key32_lru(void *table) +{ + struct rte_table_hash *f = table; + + /* Check input parameters */ + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + rte_free(f); + return 0; +} + +static int +rte_table_hash_entry_add_key32_lru( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_32 *bucket; + uint64_t signature, pos; + uint32_t bucket_index, i; + + signature = f->f_hash(key, f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket = (struct rte_bucket_4_32 *) + &f->memory[bucket_index * f->bucket_size]; + signature |= RTE_BUCKET_ENTRY_VALID; + + /* Key is present in the bucket */ + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if ((bucket_signature == signature) && + (keycmp(bucket_key, key, f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + memcpy(bucket_data, entry, f->entry_size); + lru_update(bucket, i); + *key_found = 1; + *entry_ptr = (void *) bucket_data; + return 0; + } + } + + /* Key is not present in the bucket */ + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if (bucket_signature == 0) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + bucket->signature[i] = signature; + keycpy(bucket_key, key, f->key_mask); + memcpy(bucket_data, entry, f->entry_size); + lru_update(bucket, i); + *key_found = 0; + *entry_ptr = (void *) bucket_data; + + return 0; + } + } + + 
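+	/*
+	 * All four slots hold valid keys at this point: the least recently
+	 * used slot (lru_pos) is evicted and overwritten below, then the new
+	 * entry is marked most recently used (lru_update). The LRU table
+	 * never extends its buckets, so the displaced key is silently lost.
+	 */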
/* Bucket full: replace LRU entry */ + pos = lru_pos(bucket); + bucket->signature[pos] = signature; + keycpy(&bucket->key[pos], key, f->key_mask); + memcpy(&bucket->data[pos * f->entry_size], entry, f->entry_size); + lru_update(bucket, pos); + *key_found = 0; + *entry_ptr = (void *) &bucket->data[pos * f->entry_size]; + + return 0; +} + +static int +rte_table_hash_entry_delete_key32_lru( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_32 *bucket; + uint64_t signature; + uint32_t bucket_index, i; + + signature = f->f_hash(key, f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket = (struct rte_bucket_4_32 *) + &f->memory[bucket_index * f->bucket_size]; + signature |= RTE_BUCKET_ENTRY_VALID; + + /* Key is present in the bucket */ + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if ((bucket_signature == signature) && + (keycmp(bucket_key, key, f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + bucket->signature[i] = 0; + *key_found = 1; + if (entry) + memcpy(entry, bucket_data, f->entry_size); + + return 0; + } + } + + /* Key is not present in the bucket */ + *key_found = 0; + return 0; +} + +static void * +rte_table_hash_create_key32_ext(void *params, + int socket_id, + uint32_t entry_size) +{ + struct rte_table_hash_params *p = params; + struct rte_table_hash *f; + uint64_t bucket_size, stack_size, total_size; + uint32_t n_buckets_ext, i; + + /* Check input parameters */ + if ((check_params_create(p) != 0) || + ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) || + ((sizeof(struct rte_bucket_4_32) % 64) != 0)) + return NULL; + + /* + * Table dimensioning + * + * Objective: Pick the number of bucket extensions (n_buckets_ext) so that + * it is guaranteed that n_keys keys can be stored in the table at any time. + * + * The worst case scenario takes place when all the n_keys keys fall into + * the same bucket. Actually, due to the KEYS_PER_BUCKET scheme, the worst + * case takes place when (n_keys - KEYS_PER_BUCKET + 1) keys fall into the + * same bucket, while the remaining (KEYS_PER_BUCKET - 1) keys each fall + * into a different bucket. This case defeats the purpose of the hash table. + * It indicates unsuitable f_hash or n_keys to n_buckets ratio. 
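+ * As a worked example: guaranteeing n_keys = 2^20 with KEYS_PER_BUCKET = 4
+ * requires 2^18 + 3 = 262147 extension buckets, as given by the formula
+ * below.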
+ * + * n_buckets_ext = n_keys / KEYS_PER_BUCKET + KEYS_PER_BUCKET - 1 + */ + n_buckets_ext = p->n_keys / KEYS_PER_BUCKET + KEYS_PER_BUCKET - 1; + + /* Memory allocation */ + bucket_size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_bucket_4_32) + + KEYS_PER_BUCKET * entry_size); + stack_size = RTE_CACHE_LINE_ROUNDUP(n_buckets_ext * sizeof(uint32_t)); + total_size = sizeof(struct rte_table_hash) + + (p->n_buckets + n_buckets_ext) * bucket_size + stack_size; + if (total_size > SIZE_MAX) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + f = rte_zmalloc_socket(p->name, + (size_t)total_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + RTE_LOG(INFO, TABLE, + "%s: Hash table %s memory footprint " + "is %" PRIu64" bytes\n", + __func__, p->name, total_size); + + /* Memory initialization */ + f->n_buckets = p->n_buckets; + f->key_size = KEY_SIZE; + f->entry_size = entry_size; + f->bucket_size = bucket_size; + f->key_offset = p->key_offset; + f->f_hash = p->f_hash; + f->seed = p->seed; + + f->n_buckets_ext = n_buckets_ext; + f->stack_pos = n_buckets_ext; + f->stack = (uint32_t *) + &f->memory[(p->n_buckets + n_buckets_ext) * f->bucket_size]; + + if (p->key_mask != NULL) { + f->key_mask[0] = (((uint64_t *)p->key_mask)[0]); + f->key_mask[1] = (((uint64_t *)p->key_mask)[1]); + f->key_mask[2] = (((uint64_t *)p->key_mask)[2]); + f->key_mask[3] = (((uint64_t *)p->key_mask)[3]); + } else { + f->key_mask[0] = 0xFFFFFFFFFFFFFFFFLLU; + f->key_mask[1] = 0xFFFFFFFFFFFFFFFFLLU; + f->key_mask[2] = 0xFFFFFFFFFFFFFFFFLLU; + f->key_mask[3] = 0xFFFFFFFFFFFFFFFFLLU; + } + + for (i = 0; i < n_buckets_ext; i++) + f->stack[i] = i; + + return f; +} + +static int +rte_table_hash_free_key32_ext(void *table) +{ + struct rte_table_hash *f = table; + + /* Check input parameters */ + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + rte_free(f); + return 0; +} + +static int +rte_table_hash_entry_add_key32_ext( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_32 *bucket0, *bucket, *bucket_prev; + uint64_t signature; + uint32_t bucket_index, i; + + signature = f->f_hash(key, f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket0 = (struct rte_bucket_4_32 *) + &f->memory[bucket_index * f->bucket_size]; + signature |= RTE_BUCKET_ENTRY_VALID; + + /* Key is present in the bucket */ + for (bucket = bucket0; bucket != NULL; bucket = bucket->next) { + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if ((bucket_signature == signature) && + (keycmp(bucket_key, key, f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * + f->entry_size]; + + memcpy(bucket_data, entry, f->entry_size); + *key_found = 1; + *entry_ptr = (void *) bucket_data; + + return 0; + } + } + } + + /* Key is not present in the bucket */ + for (bucket_prev = NULL, bucket = bucket0; bucket != NULL; + bucket_prev = bucket, bucket = bucket->next) + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if (bucket_signature == 0) { + uint8_t *bucket_data = 
&bucket->data[i * + f->entry_size]; + + bucket->signature[i] = signature; + keycpy(bucket_key, key, f->key_mask); + memcpy(bucket_data, entry, f->entry_size); + *key_found = 0; + *entry_ptr = (void *) bucket_data; + + return 0; + } + } + + /* Bucket full: extend bucket */ + if (f->stack_pos > 0) { + bucket_index = f->stack[--f->stack_pos]; + + bucket = (struct rte_bucket_4_32 *) + &f->memory[(f->n_buckets + bucket_index) * + f->bucket_size]; + bucket_prev->next = bucket; + bucket_prev->next_valid = 1; + + bucket->signature[0] = signature; + keycpy(&bucket->key[0], key, f->key_mask); + memcpy(&bucket->data[0], entry, f->entry_size); + *key_found = 0; + *entry_ptr = (void *) &bucket->data[0]; + return 0; + } + + return -ENOSPC; +} + +static int +rte_table_hash_entry_delete_key32_ext( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_32 *bucket0, *bucket, *bucket_prev; + uint64_t signature; + uint32_t bucket_index, i; + + signature = f->f_hash(key, f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket0 = (struct rte_bucket_4_32 *) + &f->memory[bucket_index * f->bucket_size]; + signature |= RTE_BUCKET_ENTRY_VALID; + + /* Key is present in the bucket */ + for (bucket_prev = NULL, bucket = bucket0; bucket != NULL; + bucket_prev = bucket, bucket = bucket->next) + for (i = 0; i < 4; i++) { + uint64_t bucket_signature = bucket->signature[i]; + uint8_t *bucket_key = (uint8_t *) &bucket->key[i]; + + if ((bucket_signature == signature) && + (keycmp(bucket_key, key, f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * + f->entry_size]; + + bucket->signature[i] = 0; + *key_found = 1; + if (entry) + memcpy(entry, bucket_data, f->entry_size); + + if ((bucket->signature[0] == 0) && + (bucket->signature[1] == 0) && + (bucket->signature[2] == 0) && + (bucket->signature[3] == 0) && + (bucket_prev != NULL)) { + bucket_prev->next = bucket->next; + bucket_prev->next_valid = + bucket->next_valid; + + memset(bucket, 0, + sizeof(struct rte_bucket_4_32)); + bucket_index = (((uint8_t *)bucket - + (uint8_t *)f->memory)/f->bucket_size) - f->n_buckets; + f->stack[f->stack_pos++] = bucket_index; + } + + return 0; + } + } + + /* Key is not present in the bucket */ + *key_found = 0; + return 0; +} + +#define lookup_key32_cmp(key_in, bucket, pos, f) \ +{ \ + uint64_t xor[4][4], or[4], signature[4], k[4]; \ + \ + k[0] = key_in[0] & f->key_mask[0]; \ + k[1] = key_in[1] & f->key_mask[1]; \ + k[2] = key_in[2] & f->key_mask[2]; \ + k[3] = key_in[3] & f->key_mask[3]; \ + \ + signature[0] = ((~bucket->signature[0]) & 1); \ + signature[1] = ((~bucket->signature[1]) & 1); \ + signature[2] = ((~bucket->signature[2]) & 1); \ + signature[3] = ((~bucket->signature[3]) & 1); \ + \ + xor[0][0] = k[0] ^ bucket->key[0][0]; \ + xor[0][1] = k[1] ^ bucket->key[0][1]; \ + xor[0][2] = k[2] ^ bucket->key[0][2]; \ + xor[0][3] = k[3] ^ bucket->key[0][3]; \ + \ + xor[1][0] = k[0] ^ bucket->key[1][0]; \ + xor[1][1] = k[1] ^ bucket->key[1][1]; \ + xor[1][2] = k[2] ^ bucket->key[1][2]; \ + xor[1][3] = k[3] ^ bucket->key[1][3]; \ + \ + xor[2][0] = k[0] ^ bucket->key[2][0]; \ + xor[2][1] = k[1] ^ bucket->key[2][1]; \ + xor[2][2] = k[2] ^ bucket->key[2][2]; \ + xor[2][3] = k[3] ^ bucket->key[2][3]; \ + \ + xor[3][0] = k[0] ^ bucket->key[3][0]; \ + xor[3][1] = k[1] ^ bucket->key[3][1]; \ + xor[3][2] = k[2] ^ bucket->key[3][2]; \ + xor[3][3] = k[3] ^ bucket->key[3][3]; \ + \ + or[0] = xor[0][0] | xor[0][1] | xor[0][2] | xor[0][3] | 
signature[0];\ + or[1] = xor[1][0] | xor[1][1] | xor[1][2] | xor[1][3] | signature[1];\ + or[2] = xor[2][0] | xor[2][1] | xor[2][2] | xor[2][3] | signature[2];\ + or[3] = xor[3][0] | xor[3][1] | xor[3][2] | xor[3][3] | signature[3];\ + \ + pos = 4; \ + if (or[0] == 0) \ + pos = 0; \ + if (or[1] == 0) \ + pos = 1; \ + if (or[2] == 0) \ + pos = 2; \ + if (or[3] == 0) \ + pos = 3; \ +} + +#define lookup1_stage0(pkt0_index, mbuf0, pkts, pkts_mask, f) \ +{ \ + uint64_t pkt_mask; \ + uint32_t key_offset = f->key_offset; \ + \ + pkt0_index = __builtin_ctzll(pkts_mask); \ + pkt_mask = 1LLU << pkt0_index; \ + pkts_mask &= ~pkt_mask; \ + \ + mbuf0 = pkts[pkt0_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf0, key_offset));\ +} + +#define lookup1_stage1(mbuf1, bucket1, f) \ +{ \ + uint64_t *key; \ + uint64_t signature; \ + uint32_t bucket_index; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf1, f->key_offset); \ + signature = f->f_hash(key, f->key_mask, KEY_SIZE, f->seed); \ + \ + bucket_index = signature & (f->n_buckets - 1); \ + bucket1 = (struct rte_bucket_4_32 *) \ + &f->memory[bucket_index * f->bucket_size]; \ + rte_prefetch0(bucket1); \ + rte_prefetch0((void *)(((uintptr_t) bucket1) + RTE_CACHE_LINE_SIZE));\ + rte_prefetch0((void *)(((uintptr_t) bucket1) + 2 * RTE_CACHE_LINE_SIZE));\ +} + +#define lookup1_stage2_lru(pkt2_index, mbuf2, bucket2, \ + pkts_mask_out, entries, f) \ +{ \ + void *a; \ + uint64_t pkt_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf2, f->key_offset);\ + lookup_key32_cmp(key, bucket2, pos, f); \ + \ + pkt_mask = (bucket2->signature[pos] & 1LLU) << pkt2_index;\ + pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket2->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt2_index] = a; \ + lru_update(bucket2, pos); \ +} + +#define lookup1_stage2_ext(pkt2_index, mbuf2, bucket2, pkts_mask_out,\ + entries, buckets_mask, buckets, keys, f) \ +{ \ + struct rte_bucket_4_32 *bucket_next; \ + void *a; \ + uint64_t pkt_mask, bucket_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf2, f->key_offset);\ + lookup_key32_cmp(key, bucket2, pos, f); \ + \ + pkt_mask = (bucket2->signature[pos] & 1LLU) << pkt2_index;\ + pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket2->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt2_index] = a; \ + \ + bucket_mask = (~pkt_mask) & (bucket2->next_valid << pkt2_index);\ + buckets_mask |= bucket_mask; \ + bucket_next = bucket2->next; \ + buckets[pkt2_index] = bucket_next; \ + keys[pkt2_index] = key; \ +} + +#define lookup_grinder(pkt_index, buckets, keys, pkts_mask_out, \ + entries, buckets_mask, f) \ +{ \ + struct rte_bucket_4_32 *bucket, *bucket_next; \ + void *a; \ + uint64_t pkt_mask, bucket_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + bucket = buckets[pkt_index]; \ + key = keys[pkt_index]; \ + \ + lookup_key32_cmp(key, bucket, pos, f); \ + \ + pkt_mask = (bucket->signature[pos] & 1LLU) << pkt_index;\ + pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt_index] = a; \ + \ + bucket_mask = (~pkt_mask) & (bucket->next_valid << pkt_index);\ + buckets_mask |= bucket_mask; \ + bucket_next = bucket->next; \ + rte_prefetch0(bucket_next); \ + rte_prefetch0((void *)(((uintptr_t) bucket_next) + RTE_CACHE_LINE_SIZE));\ + rte_prefetch0((void *)(((uintptr_t) bucket_next) + \ + 2 * RTE_CACHE_LINE_SIZE)); \ + buckets[pkt_index] = bucket_next; \ + keys[pkt_index] = key; \ +} 
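The lookup_key32_cmp macro above (like its key8 and key16 counterparts) resolves a four-slot bucket probe without early exits: each slot is XOR-compared against the masked key, the per-slot differences are OR-folded together with an inverted valid bit, and the winning position is chosen by plain conditional assignments that compilers typically lower to conditional moves. A miss leaves pos == 4, and the zero-initialized fifth element of rte_bucket_4_32's signature[4 + 1] array keeps the subsequent signature[pos] read in bounds (the table memory is zeroed at creation), so a miss still reports as a miss. The sketch below restates the idea as a standalone C function; the function and parameter names are illustrative and not part of the upstream file.

/* Illustrative sketch (not upstream code): branchless 4-slot probe for one
 * 32-byte key, mirroring the structure of lookup_key32_cmp above. */
#include <stdint.h>

static inline uint32_t
probe4_key32(const uint64_t key[4],          /* lookup key, already masked */
	     const uint64_t slot_key[4][4],  /* the bucket's 4 stored keys */
	     const uint64_t slot_sig[4])     /* LSB set <=> slot is valid */
{
	uint32_t i, pos = 4;                 /* 4 means "no match" */

	for (i = 0; i < 4; i++) {
		/* Zero only when all 32 key bytes match AND the slot is valid. */
		uint64_t diff = (key[0] ^ slot_key[i][0]) |
				(key[1] ^ slot_key[i][1]) |
				(key[2] ^ slot_key[i][2]) |
				(key[3] ^ slot_key[i][3]) |
				((~slot_sig[i]) & 1);
		if (diff == 0)
			pos = i;
	}
	return pos;
}

Keeping all four comparisons unconditional makes the per-packet work uniform, which is what lets the two-packet, three-stage software pipeline further below hide bucket and entry load latency behind the prefetches issued in the earlier stages.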
+ +#define lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01,\ + pkts, pkts_mask, f) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + uint32_t key_offset = f->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + \ + mbuf00 = pkts[pkt00_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset));\ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + \ + mbuf01 = pkts[pkt01_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset));\ +} + +#define lookup2_stage0_with_odd_support(pkt00_index, pkt01_index,\ + mbuf00, mbuf01, pkts, pkts_mask, f) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + uint32_t key_offset = f->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + \ + mbuf00 = pkts[pkt00_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset)); \ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + if (pkts_mask == 0) \ + pkt01_index = pkt00_index; \ + \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + \ + mbuf01 = pkts[pkt01_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset)); \ +} + +#define lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f) \ +{ \ + uint64_t *key10, *key11; \ + uint64_t signature10, signature11; \ + uint32_t bucket10_index, bucket11_index; \ + \ + key10 = RTE_MBUF_METADATA_UINT64_PTR(mbuf10, f->key_offset); \ + signature10 = f->f_hash(key10, f->key_mask, KEY_SIZE, f->seed); \ + \ + bucket10_index = signature10 & (f->n_buckets - 1); \ + bucket10 = (struct rte_bucket_4_32 *) \ + &f->memory[bucket10_index * f->bucket_size]; \ + rte_prefetch0(bucket10); \ + rte_prefetch0((void *)(((uintptr_t) bucket10) + RTE_CACHE_LINE_SIZE));\ + rte_prefetch0((void *)(((uintptr_t) bucket10) + 2 * RTE_CACHE_LINE_SIZE));\ + \ + key11 = RTE_MBUF_METADATA_UINT64_PTR(mbuf11, f->key_offset); \ + signature11 = f->f_hash(key11, f->key_mask, KEY_SIZE, f->seed);\ + \ + bucket11_index = signature11 & (f->n_buckets - 1); \ + bucket11 = (struct rte_bucket_4_32 *) \ + &f->memory[bucket11_index * f->bucket_size]; \ + rte_prefetch0(bucket11); \ + rte_prefetch0((void *)(((uintptr_t) bucket11) + RTE_CACHE_LINE_SIZE));\ + rte_prefetch0((void *)(((uintptr_t) bucket11) + 2 * RTE_CACHE_LINE_SIZE));\ +} + +#define lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21,\ + bucket20, bucket21, pkts_mask_out, entries, f) \ +{ \ + void *a20, *a21; \ + uint64_t pkt20_mask, pkt21_mask; \ + uint64_t *key20, *key21; \ + uint32_t pos20, pos21; \ + \ + key20 = RTE_MBUF_METADATA_UINT64_PTR(mbuf20, f->key_offset);\ + key21 = RTE_MBUF_METADATA_UINT64_PTR(mbuf21, f->key_offset);\ + \ + lookup_key32_cmp(key20, bucket20, pos20, f); \ + lookup_key32_cmp(key21, bucket21, pos21, f); \ + \ + pkt20_mask = (bucket20->signature[pos20] & 1LLU) << pkt20_index;\ + pkt21_mask = (bucket21->signature[pos21] & 1LLU) << pkt21_index;\ + pkts_mask_out |= pkt20_mask | pkt21_mask; \ + \ + a20 = (void *) &bucket20->data[pos20 * f->entry_size]; \ + a21 = (void *) &bucket21->data[pos21 * f->entry_size]; \ + rte_prefetch0(a20); \ + rte_prefetch0(a21); \ + entries[pkt20_index] = a20; \ + entries[pkt21_index] = a21; \ + lru_update(bucket20, pos20); \ + lru_update(bucket21, pos21); \ +} + +#define lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, bucket20, \ + bucket21, pkts_mask_out, entries, buckets_mask, buckets, 
keys, f)\ +{ \ + struct rte_bucket_4_32 *bucket20_next, *bucket21_next; \ + void *a20, *a21; \ + uint64_t pkt20_mask, pkt21_mask, bucket20_mask, bucket21_mask;\ + uint64_t *key20, *key21; \ + uint32_t pos20, pos21; \ + \ + key20 = RTE_MBUF_METADATA_UINT64_PTR(mbuf20, f->key_offset);\ + key21 = RTE_MBUF_METADATA_UINT64_PTR(mbuf21, f->key_offset);\ + \ + lookup_key32_cmp(key20, bucket20, pos20, f); \ + lookup_key32_cmp(key21, bucket21, pos21, f); \ + \ + pkt20_mask = (bucket20->signature[pos20] & 1LLU) << pkt20_index;\ + pkt21_mask = (bucket21->signature[pos21] & 1LLU) << pkt21_index;\ + pkts_mask_out |= pkt20_mask | pkt21_mask; \ + \ + a20 = (void *) &bucket20->data[pos20 * f->entry_size]; \ + a21 = (void *) &bucket21->data[pos21 * f->entry_size]; \ + rte_prefetch0(a20); \ + rte_prefetch0(a21); \ + entries[pkt20_index] = a20; \ + entries[pkt21_index] = a21; \ + \ + bucket20_mask = (~pkt20_mask) & (bucket20->next_valid << pkt20_index);\ + bucket21_mask = (~pkt21_mask) & (bucket21->next_valid << pkt21_index);\ + buckets_mask |= bucket20_mask | bucket21_mask; \ + bucket20_next = bucket20->next; \ + bucket21_next = bucket21->next; \ + buckets[pkt20_index] = bucket20_next; \ + buckets[pkt21_index] = bucket21_next; \ + keys[pkt20_index] = key20; \ + keys[pkt21_index] = key21; \ +} + +static int +rte_table_hash_lookup_key32_lru( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *f = (struct rte_table_hash *) table; + struct rte_bucket_4_32 *bucket10, *bucket11, *bucket20, *bucket21; + struct rte_mbuf *mbuf00, *mbuf01, *mbuf10, *mbuf11, *mbuf20, *mbuf21; + uint32_t pkt00_index, pkt01_index, pkt10_index; + uint32_t pkt11_index, pkt20_index, pkt21_index; + uint64_t pkts_mask_out = 0; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_HASH_KEY32_STATS_PKTS_IN_ADD(f, n_pkts_in); + + /* Cannot run the pipeline with less than 5 packets */ + if (__builtin_popcountll(pkts_mask) < 5) { + for ( ; pkts_mask; ) { + struct rte_bucket_4_32 *bucket; + struct rte_mbuf *mbuf; + uint32_t pkt_index; + + lookup1_stage0(pkt_index, mbuf, pkts, pkts_mask, f); + lookup1_stage1(mbuf, bucket, f); + lookup1_stage2_lru(pkt_index, mbuf, bucket, + pkts_mask_out, entries, f); + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY32_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return 0; + } + + /* + * Pipeline fill + * + */ + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline feed */ + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* + * Pipeline run + * + */ + for ( ; pkts_mask; ) { + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0_with_odd_support(pkt00_index, pkt01_index, + mbuf00, mbuf01, pkts, pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, + mbuf20, mbuf21, bucket20, bucket21, 
pkts_mask_out, + entries, f); + } + + /* + * Pipeline flush + * + */ + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, + mbuf20, mbuf21, bucket20, bucket21, pkts_mask_out, entries, f); + + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, + mbuf20, mbuf21, bucket20, bucket21, pkts_mask_out, entries, f); + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY32_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return 0; +} /* rte_table_hash_lookup_key32_lru() */ + +static int +rte_table_hash_lookup_key32_ext( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *f = (struct rte_table_hash *) table; + struct rte_bucket_4_32 *bucket10, *bucket11, *bucket20, *bucket21; + struct rte_mbuf *mbuf00, *mbuf01, *mbuf10, *mbuf11, *mbuf20, *mbuf21; + uint32_t pkt00_index, pkt01_index, pkt10_index; + uint32_t pkt11_index, pkt20_index, pkt21_index; + uint64_t pkts_mask_out = 0, buckets_mask = 0; + struct rte_bucket_4_32 *buckets[RTE_PORT_IN_BURST_SIZE_MAX]; + uint64_t *keys[RTE_PORT_IN_BURST_SIZE_MAX]; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_HASH_KEY32_STATS_PKTS_IN_ADD(f, n_pkts_in); + + /* Cannot run the pipeline with less than 5 packets */ + if (__builtin_popcountll(pkts_mask) < 5) { + for ( ; pkts_mask; ) { + struct rte_bucket_4_32 *bucket; + struct rte_mbuf *mbuf; + uint32_t pkt_index; + + lookup1_stage0(pkt_index, mbuf, pkts, pkts_mask, f); + lookup1_stage1(mbuf, bucket, f); + lookup1_stage2_ext(pkt_index, mbuf, bucket, + pkts_mask_out, entries, buckets_mask, buckets, + keys, f); + } + + goto grind_next_buckets; + } + + /* + * Pipeline fill + * + */ + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline feed */ + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* + * Pipeline run + * + */ + for ( ; pkts_mask; ) { + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0_with_odd_support(pkt00_index, pkt01_index, + mbuf00, mbuf01, pkts, pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + } + + /* + * Pipeline flush + * + */ + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = 
mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + +grind_next_buckets: + /* Grind next buckets */ + for ( ; buckets_mask; ) { + uint64_t buckets_mask_next = 0; + + for ( ; buckets_mask; ) { + uint64_t pkt_mask; + uint32_t pkt_index; + + pkt_index = __builtin_ctzll(buckets_mask); + pkt_mask = 1LLU << pkt_index; + buckets_mask &= ~pkt_mask; + + lookup_grinder(pkt_index, buckets, keys, pkts_mask_out, + entries, buckets_mask_next, f); + } + + buckets_mask = buckets_mask_next; + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY32_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return 0; +} /* rte_table_hash_lookup_key32_ext() */ + +static int +rte_table_hash_key32_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_hash *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_hash_key32_lru_ops = { + .f_create = rte_table_hash_create_key32_lru, + .f_free = rte_table_hash_free_key32_lru, + .f_add = rte_table_hash_entry_add_key32_lru, + .f_delete = rte_table_hash_entry_delete_key32_lru, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_lookup_key32_lru, + .f_stats = rte_table_hash_key32_stats_read, +}; + +struct rte_table_ops rte_table_hash_key32_ext_ops = { + .f_create = rte_table_hash_create_key32_ext, + .f_free = rte_table_hash_free_key32_ext, + .f_add = rte_table_hash_entry_add_key32_ext, + .f_delete = rte_table_hash_entry_delete_key32_ext, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_lookup_key32_ext, + .f_stats = rte_table_hash_key32_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_hash_key8.c b/src/spdk/dpdk/lib/librte_table/rte_table_hash_key8.c new file mode 100644 index 00000000..1811ad8d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_hash_key8.c @@ -0,0 +1,1138 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_table_hash.h" +#include "rte_lru.h" + +#define KEY_SIZE 8 + +#define KEYS_PER_BUCKET 4 + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_HASH_KEY8_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_HASH_KEY8_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_HASH_KEY8_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_HASH_KEY8_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct rte_bucket_4_8 { + /* Cache line 0 */ + uint64_t signature; + uint64_t lru_list; + struct rte_bucket_4_8 *next; + uint64_t next_valid; + + uint64_t key[4]; + + /* Cache line 1 */ + uint8_t data[0]; +}; + +struct 
rte_table_hash { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t n_buckets; + uint32_t key_size; + uint32_t entry_size; + uint32_t bucket_size; + uint32_t key_offset; + uint64_t key_mask; + rte_table_hash_op_hash f_hash; + uint64_t seed; + + /* Extendible buckets */ + uint32_t n_buckets_ext; + uint32_t stack_pos; + uint32_t *stack; + + /* Lookup table */ + uint8_t memory[0] __rte_cache_aligned; +}; + +static int +keycmp(void *a, void *b, void *b_mask) +{ + uint64_t *a64 = a, *b64 = b, *b_mask64 = b_mask; + + return a64[0] != (b64[0] & b_mask64[0]); +} + +static void +keycpy(void *dst, void *src, void *src_mask) +{ + uint64_t *dst64 = dst, *src64 = src, *src_mask64 = src_mask; + + dst64[0] = src64[0] & src_mask64[0]; +} + +static int +check_params_create(struct rte_table_hash_params *params) +{ + /* name */ + if (params->name == NULL) { + RTE_LOG(ERR, TABLE, "%s: name invalid value\n", __func__); + return -EINVAL; + } + + /* key_size */ + if (params->key_size != KEY_SIZE) { + RTE_LOG(ERR, TABLE, "%s: key_size invalid value\n", __func__); + return -EINVAL; + } + + /* n_keys */ + if (params->n_keys == 0) { + RTE_LOG(ERR, TABLE, "%s: n_keys is zero\n", __func__); + return -EINVAL; + } + + /* n_buckets */ + if ((params->n_buckets == 0) || + (!rte_is_power_of_2(params->n_buckets))) { + RTE_LOG(ERR, TABLE, "%s: n_buckets invalid value\n", __func__); + return -EINVAL; + } + + /* f_hash */ + if (params->f_hash == NULL) { + RTE_LOG(ERR, TABLE, "%s: f_hash function pointer is NULL\n", + __func__); + return -EINVAL; + } + + return 0; +} + +static void * +rte_table_hash_create_key8_lru(void *params, int socket_id, uint32_t entry_size) +{ + struct rte_table_hash_params *p = params; + struct rte_table_hash *f; + uint64_t bucket_size, total_size; + uint32_t n_buckets, i; + + /* Check input parameters */ + if ((check_params_create(p) != 0) || + ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) || + ((sizeof(struct rte_bucket_4_8) % 64) != 0)) + return NULL; + + /* + * Table dimensioning + * + * Objective: Pick the number of buckets (n_buckets) so that there a chance + * to store n_keys keys in the table. + * + * Note: Since the buckets do not get extended, it is not possible to + * guarantee that n_keys keys can be stored in the table at any time. In the + * worst case scenario when all the n_keys fall into the same bucket, only + * a maximum of KEYS_PER_BUCKET keys will be stored in the table. This case + * defeats the purpose of the hash table. It indicates unsuitable f_hash or + * n_keys to n_buckets ratio. 
+ * + * MIN(n_buckets) = (n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET + */ + n_buckets = rte_align32pow2( + (p->n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET); + n_buckets = RTE_MAX(n_buckets, p->n_buckets); + + /* Memory allocation */ + bucket_size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_bucket_4_8) + + KEYS_PER_BUCKET * entry_size); + total_size = sizeof(struct rte_table_hash) + n_buckets * bucket_size; + + if (total_size > SIZE_MAX) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes" + " for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + f = rte_zmalloc_socket(p->name, + (size_t)total_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes" + " for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + RTE_LOG(INFO, TABLE, "%s: Hash table %s memory footprint " + "is %" PRIu64 " bytes\n", + __func__, p->name, total_size); + + /* Memory initialization */ + f->n_buckets = n_buckets; + f->key_size = KEY_SIZE; + f->entry_size = entry_size; + f->bucket_size = bucket_size; + f->key_offset = p->key_offset; + f->f_hash = p->f_hash; + f->seed = p->seed; + + if (p->key_mask != NULL) + f->key_mask = ((uint64_t *)p->key_mask)[0]; + else + f->key_mask = 0xFFFFFFFFFFFFFFFFLLU; + + for (i = 0; i < n_buckets; i++) { + struct rte_bucket_4_8 *bucket; + + bucket = (struct rte_bucket_4_8 *) &f->memory[i * + f->bucket_size]; + bucket->lru_list = 0x0000000100020003LLU; + } + + return f; +} + +static int +rte_table_hash_free_key8_lru(void *table) +{ + struct rte_table_hash *f = table; + + /* Check input parameters */ + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + rte_free(f); + return 0; +} + +static int +rte_table_hash_entry_add_key8_lru( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_8 *bucket; + uint64_t signature, mask, pos; + uint32_t bucket_index, i; + + signature = f->f_hash(key, &f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket = (struct rte_bucket_4_8 *) + &f->memory[bucket_index * f->bucket_size]; + + /* Key is present in the bucket */ + for (i = 0, mask = 1LLU; i < 4; i++, mask <<= 1) { + uint64_t bucket_signature = bucket->signature; + uint64_t *bucket_key = &bucket->key[i]; + + if ((bucket_signature & mask) && + (keycmp(bucket_key, key, &f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + memcpy(bucket_data, entry, f->entry_size); + lru_update(bucket, i); + *key_found = 1; + *entry_ptr = (void *) bucket_data; + return 0; + } + } + + /* Key is not present in the bucket */ + for (i = 0, mask = 1LLU; i < 4; i++, mask <<= 1) { + uint64_t bucket_signature = bucket->signature; + + if ((bucket_signature & mask) == 0) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + bucket->signature |= mask; + keycpy(&bucket->key[i], key, &f->key_mask); + memcpy(bucket_data, entry, f->entry_size); + lru_update(bucket, i); + *key_found = 0; + *entry_ptr = (void *) bucket_data; + + return 0; + } + } + + /* Bucket full: replace LRU entry */ + pos = lru_pos(bucket); + keycpy(&bucket->key[pos], key, &f->key_mask); + memcpy(&bucket->data[pos * f->entry_size], entry, f->entry_size); + lru_update(bucket, pos); + *key_found = 0; + *entry_ptr = (void *) &bucket->data[pos * f->entry_size]; + + return 0; +} + +static int 
+rte_table_hash_entry_delete_key8_lru( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_8 *bucket; + uint64_t signature, mask; + uint32_t bucket_index, i; + + signature = f->f_hash(key, &f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket = (struct rte_bucket_4_8 *) + &f->memory[bucket_index * f->bucket_size]; + + /* Key is present in the bucket */ + for (i = 0, mask = 1LLU; i < 4; i++, mask <<= 1) { + uint64_t bucket_signature = bucket->signature; + uint64_t *bucket_key = &bucket->key[i]; + + if ((bucket_signature & mask) && + (keycmp(bucket_key, key, &f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * f->entry_size]; + + bucket->signature &= ~mask; + *key_found = 1; + if (entry) + memcpy(entry, bucket_data, f->entry_size); + + return 0; + } + } + + /* Key is not present in the bucket */ + *key_found = 0; + return 0; +} + +static void * +rte_table_hash_create_key8_ext(void *params, int socket_id, uint32_t entry_size) +{ + struct rte_table_hash_params *p = params; + struct rte_table_hash *f; + uint64_t bucket_size, stack_size, total_size; + uint32_t n_buckets_ext, i; + + /* Check input parameters */ + if ((check_params_create(p) != 0) || + ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) || + ((sizeof(struct rte_bucket_4_8) % 64) != 0)) + return NULL; + + /* + * Table dimensioning + * + * Objective: Pick the number of bucket extensions (n_buckets_ext) so that + * it is guaranteed that n_keys keys can be stored in the table at any time. + * + * The worst case scenario takes place when all the n_keys keys fall into + * the same bucket. Actually, due to the KEYS_PER_BUCKET scheme, the worst + * case takes place when (n_keys - KEYS_PER_BUCKET + 1) keys fall into the + * same bucket, while the remaining (KEYS_PER_BUCKET - 1) keys each fall + * into a different bucket. This case defeats the purpose of the hash table. + * It indicates unsuitable f_hash or n_keys to n_buckets ratio. 
+ * + * n_buckets_ext = n_keys / KEYS_PER_BUCKET + KEYS_PER_BUCKET - 1 + */ + n_buckets_ext = p->n_keys / KEYS_PER_BUCKET + KEYS_PER_BUCKET - 1; + + /* Memory allocation */ + bucket_size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_bucket_4_8) + + KEYS_PER_BUCKET * entry_size); + stack_size = RTE_CACHE_LINE_ROUNDUP(n_buckets_ext * sizeof(uint32_t)); + total_size = sizeof(struct rte_table_hash) + + (p->n_buckets + n_buckets_ext) * bucket_size + stack_size; + + if (total_size > SIZE_MAX) { + RTE_LOG(ERR, TABLE, "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + + f = rte_zmalloc_socket(p->name, + (size_t)total_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (f == NULL) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %" PRIu64 " bytes " + "for hash table %s\n", + __func__, total_size, p->name); + return NULL; + } + RTE_LOG(INFO, TABLE, "%s: Hash table %s memory footprint " + "is %" PRIu64 " bytes\n", + __func__, p->name, total_size); + + /* Memory initialization */ + f->n_buckets = p->n_buckets; + f->key_size = KEY_SIZE; + f->entry_size = entry_size; + f->bucket_size = bucket_size; + f->key_offset = p->key_offset; + f->f_hash = p->f_hash; + f->seed = p->seed; + + f->n_buckets_ext = n_buckets_ext; + f->stack_pos = n_buckets_ext; + f->stack = (uint32_t *) + &f->memory[(p->n_buckets + n_buckets_ext) * f->bucket_size]; + + if (p->key_mask != NULL) + f->key_mask = ((uint64_t *)p->key_mask)[0]; + else + f->key_mask = 0xFFFFFFFFFFFFFFFFLLU; + + for (i = 0; i < n_buckets_ext; i++) + f->stack[i] = i; + + return f; +} + +static int +rte_table_hash_free_key8_ext(void *table) +{ + struct rte_table_hash *f = table; + + /* Check input parameters */ + if (f == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + rte_free(f); + return 0; +} + +static int +rte_table_hash_entry_add_key8_ext( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_8 *bucket0, *bucket, *bucket_prev; + uint64_t signature; + uint32_t bucket_index, i; + + signature = f->f_hash(key, &f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket0 = (struct rte_bucket_4_8 *) + &f->memory[bucket_index * f->bucket_size]; + + /* Key is present in the bucket */ + for (bucket = bucket0; bucket != NULL; bucket = bucket->next) { + uint64_t mask; + + for (i = 0, mask = 1LLU; i < 4; i++, mask <<= 1) { + uint64_t bucket_signature = bucket->signature; + uint64_t *bucket_key = &bucket->key[i]; + + if ((bucket_signature & mask) && + (keycmp(bucket_key, key, &f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * + f->entry_size]; + + memcpy(bucket_data, entry, f->entry_size); + *key_found = 1; + *entry_ptr = (void *) bucket_data; + return 0; + } + } + } + + /* Key is not present in the bucket */ + for (bucket_prev = NULL, bucket = bucket0; + bucket != NULL; bucket_prev = bucket, bucket = bucket->next) { + uint64_t mask; + + for (i = 0, mask = 1LLU; i < 4; i++, mask <<= 1) { + uint64_t bucket_signature = bucket->signature; + + if ((bucket_signature & mask) == 0) { + uint8_t *bucket_data = &bucket->data[i * + f->entry_size]; + + bucket->signature |= mask; + keycpy(&bucket->key[i], key, &f->key_mask); + memcpy(bucket_data, entry, f->entry_size); + *key_found = 0; + *entry_ptr = (void *) bucket_data; + + return 0; + } + } + } + + /* Bucket full: extend bucket */ + if (f->stack_pos > 0) { + bucket_index = 
f->stack[--f->stack_pos]; + + bucket = (struct rte_bucket_4_8 *) &f->memory[(f->n_buckets + + bucket_index) * f->bucket_size]; + bucket_prev->next = bucket; + bucket_prev->next_valid = 1; + + bucket->signature = 1; + keycpy(&bucket->key[0], key, &f->key_mask); + memcpy(&bucket->data[0], entry, f->entry_size); + *key_found = 0; + *entry_ptr = (void *) &bucket->data[0]; + return 0; + } + + return -ENOSPC; +} + +static int +rte_table_hash_entry_delete_key8_ext( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_hash *f = table; + struct rte_bucket_4_8 *bucket0, *bucket, *bucket_prev; + uint64_t signature; + uint32_t bucket_index, i; + + signature = f->f_hash(key, &f->key_mask, f->key_size, f->seed); + bucket_index = signature & (f->n_buckets - 1); + bucket0 = (struct rte_bucket_4_8 *) + &f->memory[bucket_index * f->bucket_size]; + + /* Key is present in the bucket */ + for (bucket_prev = NULL, bucket = bucket0; bucket != NULL; + bucket_prev = bucket, bucket = bucket->next) { + uint64_t mask; + + for (i = 0, mask = 1LLU; i < 4; i++, mask <<= 1) { + uint64_t bucket_signature = bucket->signature; + uint64_t *bucket_key = &bucket->key[i]; + + if ((bucket_signature & mask) && + (keycmp(bucket_key, key, &f->key_mask) == 0)) { + uint8_t *bucket_data = &bucket->data[i * + f->entry_size]; + + bucket->signature &= ~mask; + *key_found = 1; + if (entry) + memcpy(entry, bucket_data, + f->entry_size); + + if ((bucket->signature == 0) && + (bucket_prev != NULL)) { + bucket_prev->next = bucket->next; + bucket_prev->next_valid = + bucket->next_valid; + + memset(bucket, 0, + sizeof(struct rte_bucket_4_8)); + bucket_index = (((uint8_t *)bucket - + (uint8_t *)f->memory)/f->bucket_size) - f->n_buckets; + f->stack[f->stack_pos++] = bucket_index; + } + + return 0; + } + } + } + + /* Key is not present in the bucket */ + *key_found = 0; + return 0; +} + +#define lookup_key8_cmp(key_in, bucket, pos, f) \ +{ \ + uint64_t xor[4], signature, k; \ + \ + signature = ~bucket->signature; \ + \ + k = key_in[0] & f->key_mask; \ + xor[0] = (k ^ bucket->key[0]) | (signature & 1); \ + xor[1] = (k ^ bucket->key[1]) | (signature & 2); \ + xor[2] = (k ^ bucket->key[2]) | (signature & 4); \ + xor[3] = (k ^ bucket->key[3]) | (signature & 8); \ + \ + pos = 4; \ + if (xor[0] == 0) \ + pos = 0; \ + if (xor[1] == 0) \ + pos = 1; \ + if (xor[2] == 0) \ + pos = 2; \ + if (xor[3] == 0) \ + pos = 3; \ +} + +#define lookup1_stage0(pkt0_index, mbuf0, pkts, pkts_mask, f) \ +{ \ + uint64_t pkt_mask; \ + uint32_t key_offset = f->key_offset;\ + \ + pkt0_index = __builtin_ctzll(pkts_mask); \ + pkt_mask = 1LLU << pkt0_index; \ + pkts_mask &= ~pkt_mask; \ + \ + mbuf0 = pkts[pkt0_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf0, key_offset)); \ +} + +#define lookup1_stage1(mbuf1, bucket1, f) \ +{ \ + uint64_t *key; \ + uint64_t signature; \ + uint32_t bucket_index; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf1, f->key_offset);\ + signature = f->f_hash(key, &f->key_mask, KEY_SIZE, f->seed); \ + bucket_index = signature & (f->n_buckets - 1); \ + bucket1 = (struct rte_bucket_4_8 *) \ + &f->memory[bucket_index * f->bucket_size]; \ + rte_prefetch0(bucket1); \ +} + +#define lookup1_stage2_lru(pkt2_index, mbuf2, bucket2, \ + pkts_mask_out, entries, f) \ +{ \ + void *a; \ + uint64_t pkt_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf2, f->key_offset);\ + lookup_key8_cmp(key, bucket2, pos, f); \ + \ + pkt_mask = ((bucket2->signature >> pos) & 1LLU) << pkt2_index;\ + 
pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket2->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt2_index] = a; \ + lru_update(bucket2, pos); \ +} + +#define lookup1_stage2_ext(pkt2_index, mbuf2, bucket2, pkts_mask_out,\ + entries, buckets_mask, buckets, keys, f) \ +{ \ + struct rte_bucket_4_8 *bucket_next; \ + void *a; \ + uint64_t pkt_mask, bucket_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + key = RTE_MBUF_METADATA_UINT64_PTR(mbuf2, f->key_offset);\ + lookup_key8_cmp(key, bucket2, pos, f); \ + \ + pkt_mask = ((bucket2->signature >> pos) & 1LLU) << pkt2_index;\ + pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket2->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt2_index] = a; \ + \ + bucket_mask = (~pkt_mask) & (bucket2->next_valid << pkt2_index);\ + buckets_mask |= bucket_mask; \ + bucket_next = bucket2->next; \ + buckets[pkt2_index] = bucket_next; \ + keys[pkt2_index] = key; \ +} + +#define lookup_grinder(pkt_index, buckets, keys, pkts_mask_out, entries,\ + buckets_mask, f) \ +{ \ + struct rte_bucket_4_8 *bucket, *bucket_next; \ + void *a; \ + uint64_t pkt_mask, bucket_mask; \ + uint64_t *key; \ + uint32_t pos; \ + \ + bucket = buckets[pkt_index]; \ + key = keys[pkt_index]; \ + lookup_key8_cmp(key, bucket, pos, f); \ + \ + pkt_mask = ((bucket->signature >> pos) & 1LLU) << pkt_index;\ + pkts_mask_out |= pkt_mask; \ + \ + a = (void *) &bucket->data[pos * f->entry_size]; \ + rte_prefetch0(a); \ + entries[pkt_index] = a; \ + \ + bucket_mask = (~pkt_mask) & (bucket->next_valid << pkt_index);\ + buckets_mask |= bucket_mask; \ + bucket_next = bucket->next; \ + rte_prefetch0(bucket_next); \ + buckets[pkt_index] = bucket_next; \ + keys[pkt_index] = key; \ +} + +#define lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01,\ + pkts, pkts_mask, f) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + uint32_t key_offset = f->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + \ + mbuf00 = pkts[pkt00_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset));\ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + \ + mbuf01 = pkts[pkt01_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset));\ +} + +#define lookup2_stage0_with_odd_support(pkt00_index, pkt01_index,\ + mbuf00, mbuf01, pkts, pkts_mask, f) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + uint32_t key_offset = f->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + \ + mbuf00 = pkts[pkt00_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset));\ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + if (pkts_mask == 0) \ + pkt01_index = pkt00_index; \ + \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + \ + mbuf01 = pkts[pkt01_index]; \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset));\ +} + +#define lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f)\ +{ \ + uint64_t *key10, *key11; \ + uint64_t signature10, signature11; \ + uint32_t bucket10_index, bucket11_index; \ + rte_table_hash_op_hash f_hash = f->f_hash; \ + uint64_t seed = f->seed; \ + uint32_t key_offset = f->key_offset; \ + \ + key10 = RTE_MBUF_METADATA_UINT64_PTR(mbuf10, key_offset);\ + key11 = RTE_MBUF_METADATA_UINT64_PTR(mbuf11, key_offset);\ + \ + signature10 = f_hash(key10, &f->key_mask, KEY_SIZE, seed); \ + 
bucket10_index = signature10 & (f->n_buckets - 1); \ + bucket10 = (struct rte_bucket_4_8 *) \ + &f->memory[bucket10_index * f->bucket_size]; \ + rte_prefetch0(bucket10); \ + \ + signature11 = f_hash(key11, &f->key_mask, KEY_SIZE, seed); \ + bucket11_index = signature11 & (f->n_buckets - 1); \ + bucket11 = (struct rte_bucket_4_8 *) \ + &f->memory[bucket11_index * f->bucket_size]; \ + rte_prefetch0(bucket11); \ +} + +#define lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21,\ + bucket20, bucket21, pkts_mask_out, entries, f) \ +{ \ + void *a20, *a21; \ + uint64_t pkt20_mask, pkt21_mask; \ + uint64_t *key20, *key21; \ + uint32_t pos20, pos21; \ + \ + key20 = RTE_MBUF_METADATA_UINT64_PTR(mbuf20, f->key_offset);\ + key21 = RTE_MBUF_METADATA_UINT64_PTR(mbuf21, f->key_offset);\ + \ + lookup_key8_cmp(key20, bucket20, pos20, f); \ + lookup_key8_cmp(key21, bucket21, pos21, f); \ + \ + pkt20_mask = ((bucket20->signature >> pos20) & 1LLU) << pkt20_index;\ + pkt21_mask = ((bucket21->signature >> pos21) & 1LLU) << pkt21_index;\ + pkts_mask_out |= pkt20_mask | pkt21_mask; \ + \ + a20 = (void *) &bucket20->data[pos20 * f->entry_size]; \ + a21 = (void *) &bucket21->data[pos21 * f->entry_size]; \ + rte_prefetch0(a20); \ + rte_prefetch0(a21); \ + entries[pkt20_index] = a20; \ + entries[pkt21_index] = a21; \ + lru_update(bucket20, pos20); \ + lru_update(bucket21, pos21); \ +} + +#define lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, bucket20, \ + bucket21, pkts_mask_out, entries, buckets_mask, buckets, keys, f)\ +{ \ + struct rte_bucket_4_8 *bucket20_next, *bucket21_next; \ + void *a20, *a21; \ + uint64_t pkt20_mask, pkt21_mask, bucket20_mask, bucket21_mask;\ + uint64_t *key20, *key21; \ + uint32_t pos20, pos21; \ + \ + key20 = RTE_MBUF_METADATA_UINT64_PTR(mbuf20, f->key_offset);\ + key21 = RTE_MBUF_METADATA_UINT64_PTR(mbuf21, f->key_offset);\ + \ + lookup_key8_cmp(key20, bucket20, pos20, f); \ + lookup_key8_cmp(key21, bucket21, pos21, f); \ + \ + pkt20_mask = ((bucket20->signature >> pos20) & 1LLU) << pkt20_index;\ + pkt21_mask = ((bucket21->signature >> pos21) & 1LLU) << pkt21_index;\ + pkts_mask_out |= pkt20_mask | pkt21_mask; \ + \ + a20 = (void *) &bucket20->data[pos20 * f->entry_size]; \ + a21 = (void *) &bucket21->data[pos21 * f->entry_size]; \ + rte_prefetch0(a20); \ + rte_prefetch0(a21); \ + entries[pkt20_index] = a20; \ + entries[pkt21_index] = a21; \ + \ + bucket20_mask = (~pkt20_mask) & (bucket20->next_valid << pkt20_index);\ + bucket21_mask = (~pkt21_mask) & (bucket21->next_valid << pkt21_index);\ + buckets_mask |= bucket20_mask | bucket21_mask; \ + bucket20_next = bucket20->next; \ + bucket21_next = bucket21->next; \ + buckets[pkt20_index] = bucket20_next; \ + buckets[pkt21_index] = bucket21_next; \ + keys[pkt20_index] = key20; \ + keys[pkt21_index] = key21; \ +} + +static int +rte_table_hash_lookup_key8_lru( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *f = (struct rte_table_hash *) table; + struct rte_bucket_4_8 *bucket10, *bucket11, *bucket20, *bucket21; + struct rte_mbuf *mbuf00, *mbuf01, *mbuf10, *mbuf11, *mbuf20, *mbuf21; + uint32_t pkt00_index, pkt01_index, pkt10_index; + uint32_t pkt11_index, pkt20_index, pkt21_index; + uint64_t pkts_mask_out = 0; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_HASH_KEY8_STATS_PKTS_IN_ADD(f, n_pkts_in); + + /* Cannot run the pipeline with less than 5 packets */ + if (__builtin_popcountll(pkts_mask) < 5) { + 
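+		/*
+		 * The two-packet pipeline below needs two stage-0 iterations
+		 * (4 packets) to fill, plus at least one more packet for its
+		 * run loop, so bursts smaller than 5 take this
+		 * packet-at-a-time path built from the single-packet stage
+		 * macros.
+		 */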
for ( ; pkts_mask; ) { + struct rte_bucket_4_8 *bucket; + struct rte_mbuf *mbuf; + uint32_t pkt_index; + + lookup1_stage0(pkt_index, mbuf, pkts, pkts_mask, f); + lookup1_stage1(mbuf, bucket, f); + lookup1_stage2_lru(pkt_index, mbuf, bucket, + pkts_mask_out, entries, f); + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY8_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return 0; + } + + /* + * Pipeline fill + * + */ + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline feed */ + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* + * Pipeline run + * + */ + for ( ; pkts_mask; ) { + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0_with_odd_support(pkt00_index, pkt01_index, + mbuf00, mbuf01, pkts, pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, f); + } + + /* + * Pipeline flush + * + */ + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, f); + + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + + /* Pipeline stage 2 */ + lookup2_stage2_lru(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, f); + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY8_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return 0; +} /* lookup LRU */ + +static int +rte_table_hash_lookup_key8_ext( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *f = (struct rte_table_hash *) table; + struct rte_bucket_4_8 *bucket10, *bucket11, *bucket20, *bucket21; + struct rte_mbuf *mbuf00, *mbuf01, *mbuf10, *mbuf11, *mbuf20, *mbuf21; + uint32_t pkt00_index, pkt01_index, pkt10_index; + uint32_t pkt11_index, pkt20_index, pkt21_index; + uint64_t pkts_mask_out = 0, buckets_mask = 0; + struct rte_bucket_4_8 *buckets[RTE_PORT_IN_BURST_SIZE_MAX]; + uint64_t *keys[RTE_PORT_IN_BURST_SIZE_MAX]; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_HASH_KEY8_STATS_PKTS_IN_ADD(f, n_pkts_in); + + /* Cannot run the pipeline with less than 5 packets */ + if (__builtin_popcountll(pkts_mask) < 5) { + for ( ; pkts_mask; ) { + struct rte_bucket_4_8 *bucket; + struct rte_mbuf *mbuf; + uint32_t pkt_index; + + lookup1_stage0(pkt_index, mbuf, pkts, pkts_mask, 
f); + lookup1_stage1(mbuf, bucket, f); + lookup1_stage2_ext(pkt_index, mbuf, bucket, + pkts_mask_out, entries, buckets_mask, + buckets, keys, f); + } + + goto grind_next_buckets; + } + + /* + * Pipeline fill + * + */ + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline feed */ + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(pkt00_index, pkt01_index, mbuf00, mbuf01, pkts, + pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* + * Pipeline run + * + */ + for ( ; pkts_mask; ) { + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0_with_odd_support(pkt00_index, pkt01_index, + mbuf00, mbuf01, pkts, pkts_mask, f); + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + } + + /* + * Pipeline flush + * + */ + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + mbuf10 = mbuf00; + mbuf11 = mbuf01; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 1 */ + lookup2_stage1(mbuf10, mbuf11, bucket10, bucket11, f); + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + + /* Pipeline feed */ + bucket20 = bucket10; + bucket21 = bucket11; + mbuf20 = mbuf10; + mbuf21 = mbuf11; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + + /* Pipeline stage 2 */ + lookup2_stage2_ext(pkt20_index, pkt21_index, mbuf20, mbuf21, + bucket20, bucket21, pkts_mask_out, entries, + buckets_mask, buckets, keys, f); + +grind_next_buckets: + /* Grind next buckets */ + for ( ; buckets_mask; ) { + uint64_t buckets_mask_next = 0; + + for ( ; buckets_mask; ) { + uint64_t pkt_mask; + uint32_t pkt_index; + + pkt_index = __builtin_ctzll(buckets_mask); + pkt_mask = 1LLU << pkt_index; + buckets_mask &= ~pkt_mask; + + lookup_grinder(pkt_index, buckets, keys, pkts_mask_out, + entries, buckets_mask_next, f); + } + + buckets_mask = buckets_mask_next; + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_KEY8_STATS_PKTS_LOOKUP_MISS(f, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return 0; +} /* lookup EXT */ + +static int +rte_table_hash_key8_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_hash *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_hash_key8_lru_ops = { + .f_create = rte_table_hash_create_key8_lru, + .f_free = rte_table_hash_free_key8_lru, + .f_add = rte_table_hash_entry_add_key8_lru, + .f_delete = rte_table_hash_entry_delete_key8_lru, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_lookup_key8_lru, + .f_stats = rte_table_hash_key8_stats_read, +}; + +struct rte_table_ops rte_table_hash_key8_ext_ops = { + 
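+	/*
+	 * Usage sketch (illustrative only; params, key, entry, pkts, etc. are
+	 * placeholder variables): these callbacks are typically driven by the
+	 * librte_pipeline layer, but can also be invoked directly, e.g.:
+	 *
+	 *   struct rte_table_hash_params params = { ... };
+	 *   void *t = rte_table_hash_key8_ext_ops.f_create(&params, 0, entry_size);
+	 *   rte_table_hash_key8_ext_ops.f_add(t, key, entry, &key_found, &entry_ptr);
+	 *   rte_table_hash_key8_ext_ops.f_lookup(t, pkts, pkts_mask, &hit_mask, entries);
+	 *   rte_table_hash_key8_ext_ops.f_free(t);
+	 */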
.f_create = rte_table_hash_create_key8_ext, + .f_free = rte_table_hash_free_key8_ext, + .f_add = rte_table_hash_entry_add_key8_ext, + .f_delete = rte_table_hash_entry_delete_key8_ext, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_lookup_key8_ext, + .f_stats = rte_table_hash_key8_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_hash_lru.c b/src/spdk/dpdk/lib/librte_table/rte_table_hash_lru.c new file mode 100644 index 00000000..5bcdb7ba --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_hash_lru.c @@ -0,0 +1,959 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "rte_table_hash.h" +#include "rte_lru.h" + +#define KEYS_PER_BUCKET 4 + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_HASH_LRU_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_HASH_LRU_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_HASH_LRU_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_HASH_LRU_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct bucket { + union { + struct bucket *next; + uint64_t lru_list; + }; + uint16_t sig[KEYS_PER_BUCKET]; + uint32_t key_pos[KEYS_PER_BUCKET]; +}; + +struct grinder { + struct bucket *bkt; + uint64_t sig; + uint64_t match; + uint64_t match_pos; + uint32_t key_index; +}; + +struct rte_table_hash { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t key_size; + uint32_t entry_size; + uint32_t n_keys; + uint32_t n_buckets; + rte_table_hash_op_hash f_hash; + uint64_t seed; + uint32_t key_offset; + + /* Internal */ + uint64_t bucket_mask; + uint32_t key_size_shl; + uint32_t data_size_shl; + uint32_t key_stack_tos; + + /* Grinder */ + struct grinder grinders[RTE_PORT_IN_BURST_SIZE_MAX]; + + /* Tables */ + uint64_t *key_mask; + struct bucket *buckets; + uint8_t *key_mem; + uint8_t *data_mem; + uint32_t *key_stack; + + /* Table memory */ + uint8_t memory[0] __rte_cache_aligned; +}; + +static int +keycmp(void *a, void *b, void *b_mask, uint32_t n_bytes) +{ + uint64_t *a64 = a, *b64 = b, *b_mask64 = b_mask; + uint32_t i; + + for (i = 0; i < n_bytes / sizeof(uint64_t); i++) + if (a64[i] != (b64[i] & b_mask64[i])) + return 1; + + return 0; +} + +static void +keycpy(void *dst, void *src, void *src_mask, uint32_t n_bytes) +{ + uint64_t *dst64 = dst, *src64 = src, *src_mask64 = src_mask; + uint32_t i; + + for (i = 0; i < n_bytes / sizeof(uint64_t); i++) + dst64[i] = src64[i] & src_mask64[i]; +} + +static int +check_params_create(struct rte_table_hash_params *params) +{ + /* name */ + if (params->name == NULL) { + RTE_LOG(ERR, TABLE, "%s: name invalid value\n", __func__); + return -EINVAL; + } + + /* key_size */ + if ((params->key_size < sizeof(uint64_t)) || + (!rte_is_power_of_2(params->key_size))) { + RTE_LOG(ERR, TABLE, "%s: key_size invalid value\n", __func__); + return -EINVAL; + } + + /* n_keys */ + if (params->n_keys == 0) { + RTE_LOG(ERR, TABLE, "%s: n_keys invalid value\n", __func__); + return -EINVAL; + } + + /* n_buckets */ + if ((params->n_buckets == 0) || + (!rte_is_power_of_2(params->n_buckets))) { + RTE_LOG(ERR, TABLE, "%s: n_buckets invalid value\n", __func__); + return -EINVAL; + } + + /* f_hash */ + if (params->f_hash == NULL) { + RTE_LOG(ERR, TABLE, "%s: f_hash invalid value\n", __func__); + return -EINVAL; + } + + return 0; +} + +static void * +rte_table_hash_lru_create(void 
*params, int socket_id, uint32_t entry_size) +{ + struct rte_table_hash_params *p = params; + struct rte_table_hash *t; + uint64_t table_meta_sz, key_mask_sz, bucket_sz, key_sz, key_stack_sz; + uint64_t data_sz, total_size; + uint64_t key_mask_offset, bucket_offset, key_offset, key_stack_offset; + uint64_t data_offset; + uint32_t n_buckets, i; + + /* Check input parameters */ + if ((check_params_create(p) != 0) || + (!rte_is_power_of_2(entry_size)) || + ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) || + (sizeof(struct bucket) != (RTE_CACHE_LINE_SIZE / 2))) { + return NULL; + } + + /* + * Table dimensioning + * + * Objective: Pick the number of buckets (n_buckets) so that there a chance + * to store n_keys keys in the table. + * + * Note: Since the buckets do not get extended, it is not possible to + * guarantee that n_keys keys can be stored in the table at any time. In the + * worst case scenario when all the n_keys fall into the same bucket, only + * a maximum of KEYS_PER_BUCKET keys will be stored in the table. This case + * defeats the purpose of the hash table. It indicates unsuitable f_hash or + * n_keys to n_buckets ratio. + * + * MIN(n_buckets) = (n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET + */ + n_buckets = rte_align32pow2( + (p->n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET); + n_buckets = RTE_MAX(n_buckets, p->n_buckets); + + /* Memory allocation */ + table_meta_sz = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_table_hash)); + key_mask_sz = RTE_CACHE_LINE_ROUNDUP(p->key_size); + bucket_sz = RTE_CACHE_LINE_ROUNDUP(n_buckets * sizeof(struct bucket)); + key_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * p->key_size); + key_stack_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * sizeof(uint32_t)); + data_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * entry_size); + total_size = table_meta_sz + key_mask_sz + bucket_sz + key_sz + + key_stack_sz + data_sz; + + if (total_size > SIZE_MAX) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %" PRIu64 " bytes for hash " + "table %s\n", + __func__, total_size, p->name); + return NULL; + } + + t = rte_zmalloc_socket(p->name, + (size_t)total_size, + RTE_CACHE_LINE_SIZE, + socket_id); + if (t == NULL) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %" PRIu64 " bytes for hash " + "table %s\n", + __func__, total_size, p->name); + return NULL; + } + RTE_LOG(INFO, TABLE, "%s (%u-byte key): Hash table %s memory footprint" + " is %" PRIu64 " bytes\n", + __func__, p->key_size, p->name, total_size); + + /* Memory initialization */ + t->key_size = p->key_size; + t->entry_size = entry_size; + t->n_keys = p->n_keys; + t->n_buckets = n_buckets; + t->f_hash = p->f_hash; + t->seed = p->seed; + t->key_offset = p->key_offset; + + /* Internal */ + t->bucket_mask = t->n_buckets - 1; + t->key_size_shl = __builtin_ctzl(p->key_size); + t->data_size_shl = __builtin_ctzl(entry_size); + + /* Tables */ + key_mask_offset = 0; + bucket_offset = key_mask_offset + key_mask_sz; + key_offset = bucket_offset + bucket_sz; + key_stack_offset = key_offset + key_sz; + data_offset = key_stack_offset + key_stack_sz; + + t->key_mask = (uint64_t *) &t->memory[key_mask_offset]; + t->buckets = (struct bucket *) &t->memory[bucket_offset]; + t->key_mem = &t->memory[key_offset]; + t->key_stack = (uint32_t *) &t->memory[key_stack_offset]; + t->data_mem = &t->memory[data_offset]; + + /* Key mask */ + if (p->key_mask == NULL) + memset(t->key_mask, 0xFF, p->key_size); + else + memcpy(t->key_mask, p->key_mask, p->key_size); + + /* Key stack */ + for (i = 0; i < t->n_keys; i++) + t->key_stack[i] = 
t->n_keys - 1 - i; + t->key_stack_tos = t->n_keys; + + /* LRU */ + for (i = 0; i < t->n_buckets; i++) { + struct bucket *bkt = &t->buckets[i]; + + lru_init(bkt); + } + + return t; +} + +static int +rte_table_hash_lru_free(void *table) +{ + struct rte_table_hash *t = table; + + /* Check input parameters */ + if (t == NULL) + return -EINVAL; + + rte_free(t); + return 0; +} + +static int +rte_table_hash_lru_entry_add(void *table, void *key, void *entry, + int *key_found, void **entry_ptr) +{ + struct rte_table_hash *t = table; + struct bucket *bkt; + uint64_t sig; + uint32_t bkt_index, i; + + sig = t->f_hash(key, t->key_mask, t->key_size, t->seed); + bkt_index = sig & t->bucket_mask; + bkt = &t->buckets[bkt_index]; + sig = (sig >> 16) | 1LLU; + + /* Key is present in the bucket */ + for (i = 0; i < KEYS_PER_BUCKET; i++) { + uint64_t bkt_sig = (uint64_t) bkt->sig[i]; + uint32_t bkt_key_index = bkt->key_pos[i]; + uint8_t *bkt_key = &t->key_mem[bkt_key_index << + t->key_size_shl]; + + if ((sig == bkt_sig) && (keycmp(bkt_key, key, t->key_mask, + t->key_size) == 0)) { + uint8_t *data = &t->data_mem[bkt_key_index << + t->data_size_shl]; + + memcpy(data, entry, t->entry_size); + lru_update(bkt, i); + *key_found = 1; + *entry_ptr = (void *) data; + return 0; + } + } + + /* Key is not present in the bucket */ + for (i = 0; i < KEYS_PER_BUCKET; i++) { + uint64_t bkt_sig = (uint64_t) bkt->sig[i]; + + if (bkt_sig == 0) { + uint32_t bkt_key_index; + uint8_t *bkt_key, *data; + + /* Allocate new key */ + if (t->key_stack_tos == 0) { + /* No keys available */ + return -ENOSPC; + } + bkt_key_index = t->key_stack[--t->key_stack_tos]; + + /* Install new key */ + bkt_key = &t->key_mem[bkt_key_index << t->key_size_shl]; + data = &t->data_mem[bkt_key_index << t->data_size_shl]; + + bkt->sig[i] = (uint16_t) sig; + bkt->key_pos[i] = bkt_key_index; + keycpy(bkt_key, key, t->key_mask, t->key_size); + memcpy(data, entry, t->entry_size); + lru_update(bkt, i); + + *key_found = 0; + *entry_ptr = (void *) data; + return 0; + } + } + + /* Bucket full */ + { + uint64_t pos = lru_pos(bkt); + uint32_t bkt_key_index = bkt->key_pos[pos]; + uint8_t *bkt_key = &t->key_mem[bkt_key_index << + t->key_size_shl]; + uint8_t *data = &t->data_mem[bkt_key_index << t->data_size_shl]; + + bkt->sig[pos] = (uint16_t) sig; + keycpy(bkt_key, key, t->key_mask, t->key_size); + memcpy(data, entry, t->entry_size); + lru_update(bkt, pos); + + *key_found = 0; + *entry_ptr = (void *) data; + return 0; + } +} + +static int +rte_table_hash_lru_entry_delete(void *table, void *key, int *key_found, + void *entry) +{ + struct rte_table_hash *t = table; + struct bucket *bkt; + uint64_t sig; + uint32_t bkt_index, i; + + sig = t->f_hash(key, t->key_mask, t->key_size, t->seed); + bkt_index = sig & t->bucket_mask; + bkt = &t->buckets[bkt_index]; + sig = (sig >> 16) | 1LLU; + + /* Key is present in the bucket */ + for (i = 0; i < KEYS_PER_BUCKET; i++) { + uint64_t bkt_sig = (uint64_t) bkt->sig[i]; + uint32_t bkt_key_index = bkt->key_pos[i]; + uint8_t *bkt_key = &t->key_mem[bkt_key_index << + t->key_size_shl]; + + if ((sig == bkt_sig) && + (keycmp(bkt_key, key, t->key_mask, t->key_size) == 0)) { + uint8_t *data = &t->data_mem[bkt_key_index << + t->data_size_shl]; + + bkt->sig[i] = 0; + t->key_stack[t->key_stack_tos++] = bkt_key_index; + *key_found = 1; + if (entry) + memcpy(entry, data, t->entry_size); + return 0; + } + } + + /* Key is not present in the bucket */ + *key_found = 0; + return 0; +} + +static int rte_table_hash_lru_lookup_unoptimized( + void *table, + 
struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *t = (struct rte_table_hash *) table; + uint64_t pkts_mask_out = 0; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_HASH_LRU_STATS_PKTS_IN_ADD(t, n_pkts_in); + + for ( ; pkts_mask; ) { + struct bucket *bkt; + struct rte_mbuf *pkt; + uint8_t *key; + uint64_t pkt_mask, sig; + uint32_t pkt_index, bkt_index, i; + + pkt_index = __builtin_ctzll(pkts_mask); + pkt_mask = 1LLU << pkt_index; + pkts_mask &= ~pkt_mask; + + pkt = pkts[pkt_index]; + key = RTE_MBUF_METADATA_UINT8_PTR(pkt, t->key_offset); + sig = (uint64_t) t->f_hash(key, t->key_mask, t->key_size, t->seed); + + bkt_index = sig & t->bucket_mask; + bkt = &t->buckets[bkt_index]; + sig = (sig >> 16) | 1LLU; + + /* Key is present in the bucket */ + for (i = 0; i < KEYS_PER_BUCKET; i++) { + uint64_t bkt_sig = (uint64_t) bkt->sig[i]; + uint32_t bkt_key_index = bkt->key_pos[i]; + uint8_t *bkt_key = &t->key_mem[bkt_key_index << + t->key_size_shl]; + + if ((sig == bkt_sig) && (keycmp(bkt_key, key, t->key_mask, + t->key_size) == 0)) { + uint8_t *data = &t->data_mem[bkt_key_index << + t->data_size_shl]; + + lru_update(bkt, i); + pkts_mask_out |= pkt_mask; + entries[pkt_index] = (void *) data; + break; + } + } + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_LRU_STATS_PKTS_LOOKUP_MISS(t, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return 0; +} + +/*** +* +* mask = match bitmask +* match = at least one match +* match_many = more than one match +* match_pos = position of first match +* +* ---------------------------------------- +* mask match match_many match_pos +* ---------------------------------------- +* 0000 0 0 00 +* 0001 1 0 00 +* 0010 1 0 01 +* 0011 1 1 00 +* ---------------------------------------- +* 0100 1 0 10 +* 0101 1 1 00 +* 0110 1 1 01 +* 0111 1 1 00 +* ---------------------------------------- +* 1000 1 0 11 +* 1001 1 1 00 +* 1010 1 1 01 +* 1011 1 1 00 +* ---------------------------------------- +* 1100 1 1 10 +* 1101 1 1 00 +* 1110 1 1 01 +* 1111 1 1 00 +* ---------------------------------------- +* +* match = 1111_1111_1111_1110 +* match_many = 1111_1110_1110_1000 +* match_pos = 0001_0010_0001_0011__0001_0010_0001_0000 +* +* match = 0xFFFELLU +* match_many = 0xFEE8LLU +* match_pos = 0x12131210LLU +* +***/ + +#define LUT_MATCH 0xFFFELLU +#define LUT_MATCH_MANY 0xFEE8LLU +#define LUT_MATCH_POS 0x12131210LLU + +#define lookup_cmp_sig(mbuf_sig, bucket, match, match_many, match_pos)\ +{ \ + uint64_t bucket_sig[4], mask[4], mask_all; \ + \ + bucket_sig[0] = bucket->sig[0]; \ + bucket_sig[1] = bucket->sig[1]; \ + bucket_sig[2] = bucket->sig[2]; \ + bucket_sig[3] = bucket->sig[3]; \ + \ + bucket_sig[0] ^= mbuf_sig; \ + bucket_sig[1] ^= mbuf_sig; \ + bucket_sig[2] ^= mbuf_sig; \ + bucket_sig[3] ^= mbuf_sig; \ + \ + mask[0] = 0; \ + mask[1] = 0; \ + mask[2] = 0; \ + mask[3] = 0; \ + \ + if (bucket_sig[0] == 0) \ + mask[0] = 1; \ + if (bucket_sig[1] == 0) \ + mask[1] = 2; \ + if (bucket_sig[2] == 0) \ + mask[2] = 4; \ + if (bucket_sig[3] == 0) \ + mask[3] = 8; \ + \ + mask_all = (mask[0] | mask[1]) | (mask[2] | mask[3]); \ + \ + match = (LUT_MATCH >> mask_all) & 1; \ + match_many = (LUT_MATCH_MANY >> mask_all) & 1; \ + match_pos = (LUT_MATCH_POS >> (mask_all << 1)) & 3; \ +} + +#define lookup_cmp_key(mbuf, key, match_key, f) \ +{ \ + uint64_t *pkt_key = RTE_MBUF_METADATA_UINT64_PTR(mbuf, f->key_offset);\ + uint64_t *bkt_key = (uint64_t *) key; \ + uint64_t *key_mask = 
f->key_mask; \ + \ + switch (f->key_size) { \ + case 8: \ + { \ + uint64_t xor = (pkt_key[0] & key_mask[0]) ^ bkt_key[0]; \ + match_key = 0; \ + if (xor == 0) \ + match_key = 1; \ + } \ + break; \ + \ + case 16: \ + { \ + uint64_t xor[2], or; \ + \ + xor[0] = (pkt_key[0] & key_mask[0]) ^ bkt_key[0]; \ + xor[1] = (pkt_key[1] & key_mask[1]) ^ bkt_key[1]; \ + or = xor[0] | xor[1]; \ + match_key = 0; \ + if (or == 0) \ + match_key = 1; \ + } \ + break; \ + \ + case 32: \ + { \ + uint64_t xor[4], or; \ + \ + xor[0] = (pkt_key[0] & key_mask[0]) ^ bkt_key[0]; \ + xor[1] = (pkt_key[1] & key_mask[1]) ^ bkt_key[1]; \ + xor[2] = (pkt_key[2] & key_mask[2]) ^ bkt_key[2]; \ + xor[3] = (pkt_key[3] & key_mask[3]) ^ bkt_key[3]; \ + or = xor[0] | xor[1] | xor[2] | xor[3]; \ + match_key = 0; \ + if (or == 0) \ + match_key = 1; \ + } \ + break; \ + \ + case 64: \ + { \ + uint64_t xor[8], or; \ + \ + xor[0] = (pkt_key[0] & key_mask[0]) ^ bkt_key[0]; \ + xor[1] = (pkt_key[1] & key_mask[1]) ^ bkt_key[1]; \ + xor[2] = (pkt_key[2] & key_mask[2]) ^ bkt_key[2]; \ + xor[3] = (pkt_key[3] & key_mask[3]) ^ bkt_key[3]; \ + xor[4] = (pkt_key[4] & key_mask[4]) ^ bkt_key[4]; \ + xor[5] = (pkt_key[5] & key_mask[5]) ^ bkt_key[5]; \ + xor[6] = (pkt_key[6] & key_mask[6]) ^ bkt_key[6]; \ + xor[7] = (pkt_key[7] & key_mask[7]) ^ bkt_key[7]; \ + or = xor[0] | xor[1] | xor[2] | xor[3] | \ + xor[4] | xor[5] | xor[6] | xor[7]; \ + match_key = 0; \ + if (or == 0) \ + match_key = 1; \ + } \ + break; \ + \ + default: \ + match_key = 0; \ + if (keycmp(bkt_key, pkt_key, key_mask, f->key_size) == 0) \ + match_key = 1; \ + } \ +} + +#define lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index)\ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + struct rte_mbuf *mbuf00, *mbuf01; \ + uint32_t key_offset = t->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + mbuf00 = pkts[pkt00_index]; \ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + mbuf01 = pkts[pkt01_index]; \ + \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset));\ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset));\ +} + +#define lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask, pkt00_index, \ + pkt01_index) \ +{ \ + uint64_t pkt00_mask, pkt01_mask; \ + struct rte_mbuf *mbuf00, *mbuf01; \ + uint32_t key_offset = t->key_offset; \ + \ + pkt00_index = __builtin_ctzll(pkts_mask); \ + pkt00_mask = 1LLU << pkt00_index; \ + pkts_mask &= ~pkt00_mask; \ + mbuf00 = pkts[pkt00_index]; \ + \ + pkt01_index = __builtin_ctzll(pkts_mask); \ + if (pkts_mask == 0) \ + pkt01_index = pkt00_index; \ + \ + pkt01_mask = 1LLU << pkt01_index; \ + pkts_mask &= ~pkt01_mask; \ + mbuf01 = pkts[pkt01_index]; \ + \ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, key_offset));\ + rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, key_offset));\ +} + +#define lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index)\ +{ \ + struct grinder *g10, *g11; \ + uint64_t sig10, sig11, bkt10_index, bkt11_index; \ + struct rte_mbuf *mbuf10, *mbuf11; \ + struct bucket *bkt10, *bkt11, *buckets = t->buckets; \ + uint8_t *key10, *key11; \ + uint64_t bucket_mask = t->bucket_mask; \ + rte_table_hash_op_hash f_hash = t->f_hash; \ + uint64_t seed = t->seed; \ + uint32_t key_size = t->key_size; \ + uint32_t key_offset = t->key_offset; \ + \ + mbuf10 = pkts[pkt10_index]; \ + key10 = RTE_MBUF_METADATA_UINT8_PTR(mbuf10, key_offset);\ + sig10 = (uint64_t) 
f_hash(key10, t->key_mask, key_size, seed);\ + bkt10_index = sig10 & bucket_mask; \ + bkt10 = &buckets[bkt10_index]; \ + \ + mbuf11 = pkts[pkt11_index]; \ + key11 = RTE_MBUF_METADATA_UINT8_PTR(mbuf11, key_offset);\ + sig11 = (uint64_t) f_hash(key11, t->key_mask, key_size, seed);\ + bkt11_index = sig11 & bucket_mask; \ + bkt11 = &buckets[bkt11_index]; \ + \ + rte_prefetch0(bkt10); \ + rte_prefetch0(bkt11); \ + \ + g10 = &g[pkt10_index]; \ + g10->sig = sig10; \ + g10->bkt = bkt10; \ + \ + g11 = &g[pkt11_index]; \ + g11->sig = sig11; \ + g11->bkt = bkt11; \ +} + +#define lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many)\ +{ \ + struct grinder *g20, *g21; \ + uint64_t sig20, sig21; \ + struct bucket *bkt20, *bkt21; \ + uint8_t *key20, *key21, *key_mem = t->key_mem; \ + uint64_t match20, match21, match_many20, match_many21; \ + uint64_t match_pos20, match_pos21; \ + uint32_t key20_index, key21_index, key_size_shl = t->key_size_shl;\ + \ + g20 = &g[pkt20_index]; \ + sig20 = g20->sig; \ + bkt20 = g20->bkt; \ + sig20 = (sig20 >> 16) | 1LLU; \ + lookup_cmp_sig(sig20, bkt20, match20, match_many20, match_pos20);\ + match20 <<= pkt20_index; \ + match_many20 <<= pkt20_index; \ + key20_index = bkt20->key_pos[match_pos20]; \ + key20 = &key_mem[key20_index << key_size_shl]; \ + \ + g21 = &g[pkt21_index]; \ + sig21 = g21->sig; \ + bkt21 = g21->bkt; \ + sig21 = (sig21 >> 16) | 1LLU; \ + lookup_cmp_sig(sig21, bkt21, match21, match_many21, match_pos21);\ + match21 <<= pkt21_index; \ + match_many21 <<= pkt21_index; \ + key21_index = bkt21->key_pos[match_pos21]; \ + key21 = &key_mem[key21_index << key_size_shl]; \ + \ + rte_prefetch0(key20); \ + rte_prefetch0(key21); \ + \ + pkts_mask_match_many |= match_many20 | match_many21; \ + \ + g20->match = match20; \ + g20->match_pos = match_pos20; \ + g20->key_index = key20_index; \ + \ + g21->match = match21; \ + g21->match_pos = match_pos21; \ + g21->key_index = key21_index; \ +} + +#define lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, \ + entries) \ +{ \ + struct grinder *g30, *g31; \ + struct rte_mbuf *mbuf30, *mbuf31; \ + struct bucket *bkt30, *bkt31; \ + uint8_t *key30, *key31, *key_mem = t->key_mem; \ + uint8_t *data30, *data31, *data_mem = t->data_mem; \ + uint64_t match30, match31, match_pos30, match_pos31; \ + uint64_t match_key30, match_key31, match_keys; \ + uint32_t key30_index, key31_index; \ + uint32_t key_size_shl = t->key_size_shl; \ + uint32_t data_size_shl = t->data_size_shl; \ + \ + mbuf30 = pkts[pkt30_index]; \ + g30 = &g[pkt30_index]; \ + bkt30 = g30->bkt; \ + match30 = g30->match; \ + match_pos30 = g30->match_pos; \ + key30_index = g30->key_index; \ + key30 = &key_mem[key30_index << key_size_shl]; \ + lookup_cmp_key(mbuf30, key30, match_key30, t); \ + match_key30 <<= pkt30_index; \ + match_key30 &= match30; \ + data30 = &data_mem[key30_index << data_size_shl]; \ + entries[pkt30_index] = data30; \ + \ + mbuf31 = pkts[pkt31_index]; \ + g31 = &g[pkt31_index]; \ + bkt31 = g31->bkt; \ + match31 = g31->match; \ + match_pos31 = g31->match_pos; \ + key31_index = g31->key_index; \ + key31 = &key_mem[key31_index << key_size_shl]; \ + lookup_cmp_key(mbuf31, key31, match_key31, t); \ + match_key31 <<= pkt31_index; \ + match_key31 &= match31; \ + data31 = &data_mem[key31_index << data_size_shl]; \ + entries[pkt31_index] = data31; \ + \ + rte_prefetch0(data30); \ + rte_prefetch0(data31); \ + \ + match_keys = match_key30 | match_key31; \ + pkts_mask_out |= match_keys; \ + \ + if (match_key30 == 0) \ + match_pos30 = 4; \ 
+ lru_update(bkt30, match_pos30); \ + \ + if (match_key31 == 0) \ + match_pos31 = 4; \ + lru_update(bkt31, match_pos31); \ +} + +/*** +* The lookup function implements a 4-stage pipeline, with each stage processing +* two different packets. The purpose of pipelined implementation is to hide the +* latency of prefetching the data structures and loosen the data dependency +* between instructions. +* +* p00 _______ p10 _______ p20 _______ p30 _______ +* ----->| |----->| |----->| |----->| |-----> +* | 0 | | 1 | | 2 | | 3 | +* ----->|_______|----->|_______|----->|_______|----->|_______|-----> +* p01 p11 p21 p31 +* +* The naming convention is: +* pXY = packet Y of stage X, X = 0 .. 3, Y = 0 .. 1 +* +***/ +static int rte_table_hash_lru_lookup( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_hash *t = (struct rte_table_hash *) table; + struct grinder *g = t->grinders; + uint64_t pkt00_index, pkt01_index, pkt10_index, pkt11_index; + uint64_t pkt20_index, pkt21_index, pkt30_index, pkt31_index; + uint64_t pkts_mask_out = 0, pkts_mask_match_many = 0; + int status = 0; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_HASH_LRU_STATS_PKTS_IN_ADD(t, n_pkts_in); + + /* Cannot run the pipeline with less than 7 packets */ + if (__builtin_popcountll(pkts_mask) < 7) + return rte_table_hash_lru_lookup_unoptimized(table, pkts, + pkts_mask, lookup_hit_mask, entries); + + /* Pipeline stage 0 */ + lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index); + + /* Pipeline feed */ + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index); + + /* Pipeline stage 1 */ + lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index); + + /* Pipeline feed */ + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index); + + /* Pipeline stage 1 */ + lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index); + + /* Pipeline stage 2 */ + lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many); + + /* + * Pipeline run + * + */ + for ( ; pkts_mask; ) { + /* Pipeline feed */ + pkt30_index = pkt20_index; + pkt31_index = pkt21_index; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 0 */ + lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask, + pkt00_index, pkt01_index); + + /* Pipeline stage 1 */ + lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index); + + /* Pipeline stage 2 */ + lookup2_stage2(t, g, pkt20_index, pkt21_index, + pkts_mask_match_many); + + /* Pipeline stage 3 */ + lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, + pkts_mask_out, entries); + } + + /* Pipeline feed */ + pkt30_index = pkt20_index; + pkt31_index = pkt21_index; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + pkt10_index = pkt00_index; + pkt11_index = pkt01_index; + + /* Pipeline stage 1 */ + lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index); + + /* Pipeline stage 2 */ + lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many); + + /* Pipeline stage 3 */ + lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, + entries); + + /* Pipeline feed */ + pkt30_index = pkt20_index; + pkt31_index = pkt21_index; + pkt20_index = pkt10_index; + pkt21_index = pkt11_index; + + /* 
Pipeline stage 2 */ + lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many); + + /* Pipeline stage 3 */ + lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, + entries); + + /* Pipeline feed */ + pkt30_index = pkt20_index; + pkt31_index = pkt21_index; + + /* Pipeline stage 3 */ + lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, + entries); + + /* Slow path */ + pkts_mask_match_many &= ~pkts_mask_out; + if (pkts_mask_match_many) { + uint64_t pkts_mask_out_slow = 0; + + status = rte_table_hash_lru_lookup_unoptimized(table, pkts, + pkts_mask_match_many, &pkts_mask_out_slow, entries); + pkts_mask_out |= pkts_mask_out_slow; + } + + *lookup_hit_mask = pkts_mask_out; + RTE_TABLE_HASH_LRU_STATS_PKTS_LOOKUP_MISS(t, n_pkts_in - __builtin_popcountll(pkts_mask_out)); + return status; +} + +static int +rte_table_hash_lru_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_hash *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_hash_lru_ops = { + .f_create = rte_table_hash_lru_create, + .f_free = rte_table_hash_lru_free, + .f_add = rte_table_hash_lru_entry_add, + .f_delete = rte_table_hash_lru_entry_delete, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_hash_lru_lookup, + .f_stats = rte_table_hash_lru_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_lpm.c b/src/spdk/dpdk/lib/librte_table/rte_table_lpm.c new file mode 100644 index 00000000..4dd8289c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_lpm.c @@ -0,0 +1,366 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "rte_table_lpm.h" + +#ifndef RTE_TABLE_LPM_MAX_NEXT_HOPS +#define RTE_TABLE_LPM_MAX_NEXT_HOPS 65536 +#endif + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_LPM_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_LPM_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_LPM_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_LPM_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct rte_table_lpm { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t entry_size; + uint32_t entry_unique_size; + uint32_t n_rules; + uint32_t offset; + + /* Handle to low-level LPM table */ + struct rte_lpm *lpm; + + /* Next Hop Table (NHT) */ + uint32_t nht_users[RTE_TABLE_LPM_MAX_NEXT_HOPS]; + uint8_t nht[0] __rte_cache_aligned; +}; + +static void * +rte_table_lpm_create(void *params, int socket_id, uint32_t entry_size) +{ + struct rte_table_lpm_params *p = params; + struct rte_table_lpm *lpm; + struct rte_lpm_config lpm_config; + + uint32_t total_size, nht_size; + + /* Check input parameters */ + if (p == NULL) { + RTE_LOG(ERR, TABLE, "%s: NULL input parameters\n", __func__); + return NULL; + } + if (p->n_rules == 0) { + RTE_LOG(ERR, TABLE, "%s: Invalid n_rules\n", __func__); + return NULL; + } + if (p->number_tbl8s == 0) { + RTE_LOG(ERR, TABLE, "%s: Invalid number_tbl8s\n", __func__); + return NULL; + } + if (p->entry_unique_size == 0) { + RTE_LOG(ERR, TABLE, "%s: Invalid entry_unique_size\n", + __func__); + return NULL; + } + if (p->entry_unique_size > entry_size) { + RTE_LOG(ERR, TABLE, "%s: Invalid entry_unique_size\n", + 
__func__); + return NULL; + } + if (p->name == NULL) { + RTE_LOG(ERR, TABLE, "%s: Table name is NULL\n", + __func__); + return NULL; + } + entry_size = RTE_ALIGN(entry_size, sizeof(uint64_t)); + + /* Memory allocation */ + nht_size = RTE_TABLE_LPM_MAX_NEXT_HOPS * entry_size; + total_size = sizeof(struct rte_table_lpm) + nht_size; + lpm = rte_zmalloc_socket("TABLE", total_size, RTE_CACHE_LINE_SIZE, + socket_id); + if (lpm == NULL) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %u bytes for LPM table\n", + __func__, total_size); + return NULL; + } + + /* LPM low-level table creation */ + lpm_config.max_rules = p->n_rules; + lpm_config.number_tbl8s = p->number_tbl8s; + lpm_config.flags = p->flags; + lpm->lpm = rte_lpm_create(p->name, socket_id, &lpm_config); + + if (lpm->lpm == NULL) { + rte_free(lpm); + RTE_LOG(ERR, TABLE, "Unable to create low-level LPM table\n"); + return NULL; + } + + /* Memory initialization */ + lpm->entry_size = entry_size; + lpm->entry_unique_size = p->entry_unique_size; + lpm->n_rules = p->n_rules; + lpm->offset = p->offset; + + return lpm; +} + +static int +rte_table_lpm_free(void *table) +{ + struct rte_table_lpm *lpm = table; + + /* Check input parameters */ + if (lpm == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + + /* Free previously allocated resources */ + rte_lpm_free(lpm->lpm); + rte_free(lpm); + + return 0; +} + +static int +nht_find_free(struct rte_table_lpm *lpm, uint32_t *pos) +{ + uint32_t i; + + for (i = 0; i < RTE_TABLE_LPM_MAX_NEXT_HOPS; i++) { + if (lpm->nht_users[i] == 0) { + *pos = i; + return 1; + } + } + + return 0; +} + +static int +nht_find_existing(struct rte_table_lpm *lpm, void *entry, uint32_t *pos) +{ + uint32_t i; + + for (i = 0; i < RTE_TABLE_LPM_MAX_NEXT_HOPS; i++) { + uint8_t *nht_entry = &lpm->nht[i * lpm->entry_size]; + + if ((lpm->nht_users[i] > 0) && (memcmp(nht_entry, entry, + lpm->entry_unique_size) == 0)) { + *pos = i; + return 1; + } + } + + return 0; +} + +static int +rte_table_lpm_entry_add( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_lpm *lpm = table; + struct rte_table_lpm_key *ip_prefix = key; + uint32_t nht_pos, nht_pos0_valid; + int status; + uint32_t nht_pos0 = 0; + + /* Check input parameters */ + if (lpm == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + if (ip_prefix == NULL) { + RTE_LOG(ERR, TABLE, "%s: ip_prefix parameter is NULL\n", + __func__); + return -EINVAL; + } + if (entry == NULL) { + RTE_LOG(ERR, TABLE, "%s: entry parameter is NULL\n", __func__); + return -EINVAL; + } + + if ((ip_prefix->depth == 0) || (ip_prefix->depth > 32)) { + RTE_LOG(ERR, TABLE, "%s: invalid depth (%d)\n", + __func__, ip_prefix->depth); + return -EINVAL; + } + + /* Check if rule is already present in the table */ + status = rte_lpm_is_rule_present(lpm->lpm, ip_prefix->ip, + ip_prefix->depth, &nht_pos0); + nht_pos0_valid = status > 0; + + /* Find existing or free NHT entry */ + if (nht_find_existing(lpm, entry, &nht_pos) == 0) { + uint8_t *nht_entry; + + if (nht_find_free(lpm, &nht_pos) == 0) { + RTE_LOG(ERR, TABLE, "%s: NHT full\n", __func__); + return -1; + } + + nht_entry = &lpm->nht[nht_pos * lpm->entry_size]; + memcpy(nht_entry, entry, lpm->entry_size); + } + + /* Add rule to low level LPM table */ + if (rte_lpm_add(lpm->lpm, ip_prefix->ip, ip_prefix->depth, nht_pos) < 0) { + RTE_LOG(ERR, TABLE, "%s: LPM rule add failed\n", __func__); + return -1; + } + + /* Commit NHT 
changes */ + lpm->nht_users[nht_pos]++; + lpm->nht_users[nht_pos0] -= nht_pos0_valid; + + *key_found = nht_pos0_valid; + *entry_ptr = (void *) &lpm->nht[nht_pos * lpm->entry_size]; + return 0; +} + +static int +rte_table_lpm_entry_delete( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_lpm *lpm = table; + struct rte_table_lpm_key *ip_prefix = key; + uint32_t nht_pos; + int status; + + /* Check input parameters */ + if (lpm == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + if (ip_prefix == NULL) { + RTE_LOG(ERR, TABLE, "%s: ip_prefix parameter is NULL\n", + __func__); + return -EINVAL; + } + if ((ip_prefix->depth == 0) || (ip_prefix->depth > 32)) { + RTE_LOG(ERR, TABLE, "%s: invalid depth (%d)\n", __func__, + ip_prefix->depth); + return -EINVAL; + } + + /* Return if rule is not present in the table */ + status = rte_lpm_is_rule_present(lpm->lpm, ip_prefix->ip, + ip_prefix->depth, &nht_pos); + if (status < 0) { + RTE_LOG(ERR, TABLE, "%s: LPM algorithmic error\n", __func__); + return -1; + } + if (status == 0) { + *key_found = 0; + return 0; + } + + /* Delete rule from the low-level LPM table */ + status = rte_lpm_delete(lpm->lpm, ip_prefix->ip, ip_prefix->depth); + if (status) { + RTE_LOG(ERR, TABLE, "%s: LPM rule delete failed\n", __func__); + return -1; + } + + /* Commit NHT changes */ + lpm->nht_users[nht_pos]--; + + *key_found = 1; + if (entry) + memcpy(entry, &lpm->nht[nht_pos * lpm->entry_size], + lpm->entry_size); + + return 0; +} + +static int +rte_table_lpm_lookup( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_lpm *lpm = (struct rte_table_lpm *) table; + uint64_t pkts_out_mask = 0; + uint32_t i; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_LPM_STATS_PKTS_IN_ADD(lpm, n_pkts_in); + + pkts_out_mask = 0; + for (i = 0; i < (uint32_t)(RTE_PORT_IN_BURST_SIZE_MAX - + __builtin_clzll(pkts_mask)); i++) { + uint64_t pkt_mask = 1LLU << i; + + if (pkt_mask & pkts_mask) { + struct rte_mbuf *pkt = pkts[i]; + uint32_t ip = rte_bswap32( + RTE_MBUF_METADATA_UINT32(pkt, lpm->offset)); + int status; + uint32_t nht_pos; + + status = rte_lpm_lookup(lpm->lpm, ip, &nht_pos); + if (status == 0) { + pkts_out_mask |= pkt_mask; + entries[i] = (void *) &lpm->nht[nht_pos * + lpm->entry_size]; + } + } + } + + *lookup_hit_mask = pkts_out_mask; + RTE_TABLE_LPM_STATS_PKTS_LOOKUP_MISS(lpm, n_pkts_in - __builtin_popcountll(pkts_out_mask)); + return 0; +} + +static int +rte_table_lpm_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_lpm *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_lpm_ops = { + .f_create = rte_table_lpm_create, + .f_free = rte_table_lpm_free, + .f_add = rte_table_lpm_entry_add, + .f_delete = rte_table_lpm_entry_delete, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_lpm_lookup, + .f_stats = rte_table_lpm_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_lpm.h b/src/spdk/dpdk/lib/librte_table/rte_table_lpm.h new file mode 100644 index 00000000..571ff2f0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_lpm.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_LPM_H__ +#define 
__INCLUDE_RTE_TABLE_LPM_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Table LPM for IPv4 + * + * This table uses the Longest Prefix Match (LPM) algorithm to uniquely + * associate data to lookup keys. + * + * Use-case: IP routing table. Routes that are added to the table associate a + * next hop to an IP prefix. The IP prefix is specified as IP address and depth + * and cover for a multitude of lookup keys (i.e. destination IP addresses) + * that all share the same data (i.e. next hop). The next hop information + * typically contains the output interface ID, the IP address of the next hop + * station (which is part of the same IP network the output interface is + * connected to) and other flags and counters. + * + * The LPM primitive only allows associating an 8-bit number (next hop ID) to + * an IP prefix, while a routing table can potentially contain thousands of + * routes or even more. This means that the same next hop ID (and next hop + * information) has to be shared by multiple routes, which makes sense, as + * multiple remote networks could be reached through the same next hop. + * Therefore, when a route is added or updated, the LPM table has to check + * whether the same next hop is already in use before using a new next hop ID + * for this route. + * + * The comparison between different next hops is done for the first + * “entry_unique_size” bytes of the next hop information (configurable + * parameter), which have to uniquely identify the next hop, therefore the user + * has to carefully manage the format of the LPM table entry (i.e. the next + * hop information) so that any next hop data that changes value during + * run-time (e.g. counters) is placed outside of this area. + * + ***/ + +#include + +#include "rte_table.h" + +/** LPM table parameters */ +struct rte_table_lpm_params { + /** Table name */ + const char *name; + + /** Maximum number of LPM rules (i.e. IP routes) */ + uint32_t n_rules; + + /**< Number of tbl8s to allocate. */ + uint32_t number_tbl8s; + + /**< This field is currently unused. */ + int flags; + + /** Number of bytes at the start of the table entry that uniquely + identify the entry. Cannot be bigger than table entry size. */ + uint32_t entry_unique_size; + + /** Byte offset within input packet meta-data where lookup key (i.e. + the destination IP address) is located. */ + uint32_t offset; +}; + +/** LPM table rule (i.e. route), specified as IP prefix. While the key used by +the lookup operation is the destination IP address (read from the input packet +meta-data), the entry add and entry delete operations work with LPM rules, with +each rule covering for a multitude of lookup keys (destination IP addresses) +that share the same data (next hop). */ +struct rte_table_lpm_key { + /** IP address */ + uint32_t ip; + + /** IP address depth. The most significant "depth" bits of the IP + address specify the network part of the IP address, while the rest of + the bits specify the host part of the address and are ignored for the + purpose of route specification. 
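+	For example (illustrative values): ip = 0x0A000000 (10.0.0.0) together
+	with depth = 8 specifies the 10.0.0.0/8 prefix, so the rule covers
+	every destination address of the form 10.x.x.x.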
*/ + uint8_t depth; +}; + +/** LPM table operations */ +extern struct rte_table_ops rte_table_lpm_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_lpm_ipv6.c b/src/spdk/dpdk/lib/librte_table/rte_table_lpm_ipv6.c new file mode 100644 index 00000000..a55f808a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_lpm_ipv6.c @@ -0,0 +1,368 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "rte_table_lpm_ipv6.h" + +#define RTE_TABLE_LPM_MAX_NEXT_HOPS 256 + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_LPM_IPV6_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_LPM_IPV6_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_LPM_IPV6_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_LPM_IPV6_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct rte_table_lpm_ipv6 { + struct rte_table_stats stats; + + /* Input parameters */ + uint32_t entry_size; + uint32_t entry_unique_size; + uint32_t n_rules; + uint32_t offset; + + /* Handle to low-level LPM table */ + struct rte_lpm6 *lpm; + + /* Next Hop Table (NHT) */ + uint32_t nht_users[RTE_TABLE_LPM_MAX_NEXT_HOPS]; + uint8_t nht[0] __rte_cache_aligned; +}; + +static void * +rte_table_lpm_ipv6_create(void *params, int socket_id, uint32_t entry_size) +{ + struct rte_table_lpm_ipv6_params *p = + params; + struct rte_table_lpm_ipv6 *lpm; + struct rte_lpm6_config lpm6_config; + uint32_t total_size, nht_size; + + /* Check input parameters */ + if (p == NULL) { + RTE_LOG(ERR, TABLE, "%s: NULL input parameters\n", __func__); + return NULL; + } + if (p->n_rules == 0) { + RTE_LOG(ERR, TABLE, "%s: Invalid n_rules\n", __func__); + return NULL; + } + if (p->number_tbl8s == 0) { + RTE_LOG(ERR, TABLE, "%s: Invalid n_rules\n", __func__); + return NULL; + } + if (p->entry_unique_size == 0) { + RTE_LOG(ERR, TABLE, "%s: Invalid entry_unique_size\n", + __func__); + return NULL; + } + if (p->entry_unique_size > entry_size) { + RTE_LOG(ERR, TABLE, "%s: Invalid entry_unique_size\n", + __func__); + return NULL; + } + if (p->name == NULL) { + RTE_LOG(ERR, TABLE, "%s: Table name is NULL\n", + __func__); + return NULL; + } + entry_size = RTE_ALIGN(entry_size, sizeof(uint64_t)); + + /* Memory allocation */ + nht_size = RTE_TABLE_LPM_MAX_NEXT_HOPS * entry_size; + total_size = sizeof(struct rte_table_lpm_ipv6) + nht_size; + lpm = rte_zmalloc_socket("TABLE", total_size, RTE_CACHE_LINE_SIZE, + socket_id); + if (lpm == NULL) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %u bytes for LPM IPv6 table\n", + __func__, total_size); + return NULL; + } + + /* LPM low-level table creation */ + lpm6_config.max_rules = p->n_rules; + lpm6_config.number_tbl8s = p->number_tbl8s; + lpm6_config.flags = 0; + lpm->lpm = rte_lpm6_create(p->name, socket_id, &lpm6_config); + if (lpm->lpm == NULL) { + rte_free(lpm); + RTE_LOG(ERR, TABLE, + "Unable to create low-level LPM IPv6 table\n"); + return NULL; + } + + /* Memory initialization */ + lpm->entry_size = entry_size; + lpm->entry_unique_size = p->entry_unique_size; + lpm->n_rules = p->n_rules; + lpm->offset = p->offset; + + return lpm; +} + +static int +rte_table_lpm_ipv6_free(void *table) +{ + struct rte_table_lpm_ipv6 *lpm = table; + + /* Check input parameters */ + if (lpm == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return 
-EINVAL; + } + + /* Free previously allocated resources */ + rte_lpm6_free(lpm->lpm); + rte_free(lpm); + + return 0; +} + +static int +nht_find_free(struct rte_table_lpm_ipv6 *lpm, uint32_t *pos) +{ + uint32_t i; + + for (i = 0; i < RTE_TABLE_LPM_MAX_NEXT_HOPS; i++) { + if (lpm->nht_users[i] == 0) { + *pos = i; + return 1; + } + } + + return 0; +} + +static int +nht_find_existing(struct rte_table_lpm_ipv6 *lpm, void *entry, uint32_t *pos) +{ + uint32_t i; + + for (i = 0; i < RTE_TABLE_LPM_MAX_NEXT_HOPS; i++) { + uint8_t *nht_entry = &lpm->nht[i * lpm->entry_size]; + + if ((lpm->nht_users[i] > 0) && (memcmp(nht_entry, entry, + lpm->entry_unique_size) == 0)) { + *pos = i; + return 1; + } + } + + return 0; +} + +static int +rte_table_lpm_ipv6_entry_add( + void *table, + void *key, + void *entry, + int *key_found, + void **entry_ptr) +{ + struct rte_table_lpm_ipv6 *lpm = table; + struct rte_table_lpm_ipv6_key *ip_prefix = + key; + uint32_t nht_pos, nht_pos0, nht_pos0_valid; + int status; + + /* Check input parameters */ + if (lpm == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + if (ip_prefix == NULL) { + RTE_LOG(ERR, TABLE, "%s: ip_prefix parameter is NULL\n", + __func__); + return -EINVAL; + } + if (entry == NULL) { + RTE_LOG(ERR, TABLE, "%s: entry parameter is NULL\n", __func__); + return -EINVAL; + } + + if ((ip_prefix->depth == 0) || (ip_prefix->depth > 128)) { + RTE_LOG(ERR, TABLE, "%s: invalid depth (%d)\n", __func__, + ip_prefix->depth); + return -EINVAL; + } + + /* Check if rule is already present in the table */ + status = rte_lpm6_is_rule_present(lpm->lpm, ip_prefix->ip, + ip_prefix->depth, &nht_pos0); + nht_pos0_valid = status > 0; + + /* Find existing or free NHT entry */ + if (nht_find_existing(lpm, entry, &nht_pos) == 0) { + uint8_t *nht_entry; + + if (nht_find_free(lpm, &nht_pos) == 0) { + RTE_LOG(ERR, TABLE, "%s: NHT full\n", __func__); + return -1; + } + + nht_entry = &lpm->nht[nht_pos * lpm->entry_size]; + memcpy(nht_entry, entry, lpm->entry_size); + } + + /* Add rule to low level LPM table */ + if (rte_lpm6_add(lpm->lpm, ip_prefix->ip, ip_prefix->depth, + nht_pos) < 0) { + RTE_LOG(ERR, TABLE, "%s: LPM IPv6 rule add failed\n", __func__); + return -1; + } + + /* Commit NHT changes */ + lpm->nht_users[nht_pos]++; + lpm->nht_users[nht_pos0] -= nht_pos0_valid; + + *key_found = nht_pos0_valid; + *entry_ptr = (void *) &lpm->nht[nht_pos * lpm->entry_size]; + return 0; +} + +static int +rte_table_lpm_ipv6_entry_delete( + void *table, + void *key, + int *key_found, + void *entry) +{ + struct rte_table_lpm_ipv6 *lpm = table; + struct rte_table_lpm_ipv6_key *ip_prefix = + key; + uint32_t nht_pos; + int status; + + /* Check input parameters */ + if (lpm == NULL) { + RTE_LOG(ERR, TABLE, "%s: table parameter is NULL\n", __func__); + return -EINVAL; + } + if (ip_prefix == NULL) { + RTE_LOG(ERR, TABLE, "%s: ip_prefix parameter is NULL\n", + __func__); + return -EINVAL; + } + if ((ip_prefix->depth == 0) || (ip_prefix->depth > 128)) { + RTE_LOG(ERR, TABLE, "%s: invalid depth (%d)\n", __func__, + ip_prefix->depth); + return -EINVAL; + } + + /* Return if rule is not present in the table */ + status = rte_lpm6_is_rule_present(lpm->lpm, ip_prefix->ip, + ip_prefix->depth, &nht_pos); + if (status < 0) { + RTE_LOG(ERR, TABLE, "%s: LPM IPv6 algorithmic error\n", + __func__); + return -1; + } + if (status == 0) { + *key_found = 0; + return 0; + } + + /* Delete rule from the low-level LPM table */ + status = rte_lpm6_delete(lpm->lpm, ip_prefix->ip, 
ip_prefix->depth); + if (status) { + RTE_LOG(ERR, TABLE, "%s: LPM IPv6 rule delete failed\n", + __func__); + return -1; + } + + /* Commit NHT changes */ + lpm->nht_users[nht_pos]--; + + *key_found = 1; + if (entry) + memcpy(entry, &lpm->nht[nht_pos * lpm->entry_size], + lpm->entry_size); + + return 0; +} + +static int +rte_table_lpm_ipv6_lookup( + void *table, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + void **entries) +{ + struct rte_table_lpm_ipv6 *lpm = (struct rte_table_lpm_ipv6 *) table; + uint64_t pkts_out_mask = 0; + uint32_t i; + + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + RTE_TABLE_LPM_IPV6_STATS_PKTS_IN_ADD(lpm, n_pkts_in); + + pkts_out_mask = 0; + for (i = 0; i < (uint32_t)(RTE_PORT_IN_BURST_SIZE_MAX - + __builtin_clzll(pkts_mask)); i++) { + uint64_t pkt_mask = 1LLU << i; + + if (pkt_mask & pkts_mask) { + struct rte_mbuf *pkt = pkts[i]; + uint8_t *ip = RTE_MBUF_METADATA_UINT8_PTR(pkt, + lpm->offset); + int status; + uint32_t nht_pos; + + status = rte_lpm6_lookup(lpm->lpm, ip, &nht_pos); + if (status == 0) { + pkts_out_mask |= pkt_mask; + entries[i] = (void *) &lpm->nht[nht_pos * + lpm->entry_size]; + } + } + } + + *lookup_hit_mask = pkts_out_mask; + RTE_TABLE_LPM_IPV6_STATS_PKTS_LOOKUP_MISS(lpm, n_pkts_in - __builtin_popcountll(pkts_out_mask)); + return 0; +} + +static int +rte_table_lpm_ipv6_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_lpm_ipv6 *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_lpm_ipv6_ops = { + .f_create = rte_table_lpm_ipv6_create, + .f_free = rte_table_lpm_ipv6_free, + .f_add = rte_table_lpm_ipv6_entry_add, + .f_delete = rte_table_lpm_ipv6_entry_delete, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_lpm_ipv6_lookup, + .f_stats = rte_table_lpm_ipv6_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_lpm_ipv6.h b/src/spdk/dpdk/lib/librte_table/rte_table_lpm_ipv6.h new file mode 100644 index 00000000..eadc316e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_lpm_ipv6.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_LPM_IPV6_H__ +#define __INCLUDE_RTE_TABLE_LPM_IPV6_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Table LPM for IPv6 + * + * This table uses the Longest Prefix Match (LPM) algorithm to uniquely + * associate data to lookup keys. + * + * Use-case: IP routing table. Routes that are added to the table associate a + * next hop to an IP prefix. The IP prefix is specified as IP address and depth + * and cover for a multitude of lookup keys (i.e. destination IP addresses) + * that all share the same data (i.e. next hop). The next hop information + * typically contains the output interface ID, the IP address of the next hop + * station (which is part of the same IP network the output interface is + * connected to) and other flags and counters. + * + * The LPM primitive only allows associating an 8-bit number (next hop ID) to + * an IP prefix, while a routing table can potentially contain thousands of + * routes or even more. This means that the same next hop ID (and next hop + * information) has to be shared by multiple routes, which makes sense, as + * multiple remote networks could be reached through the same next hop. 
+ * Therefore, when a route is added or updated, the LPM table has to check + * whether the same next hop is already in use before using a new next hop ID + * for this route. + * + * The comparison between different next hops is done for the first + * “entry_unique_size” bytes of the next hop information (configurable + * parameter), which have to uniquely identify the next hop, therefore the user + * has to carefully manage the format of the LPM table entry (i.e. the next + * hop information) so that any next hop data that changes value during + * run-time (e.g. counters) is placed outside of this area. + * + ***/ + +#include + +#include "rte_table.h" + +#define RTE_LPM_IPV6_ADDR_SIZE 16 + +/** LPM table parameters */ +struct rte_table_lpm_ipv6_params { + /** Table name */ + const char *name; + + /** Maximum number of LPM rules (i.e. IP routes) */ + uint32_t n_rules; + + uint32_t number_tbl8s; + + /** Number of bytes at the start of the table entry that uniquely + identify the entry. Cannot be bigger than table entry size. */ + uint32_t entry_unique_size; + + /** Byte offset within input packet meta-data where lookup key (i.e. + the destination IP address) is located. */ + uint32_t offset; +}; + +/** LPM table rule (i.e. route), specified as IP prefix. While the key used by +the lookup operation is the destination IP address (read from the input packet +meta-data), the entry add and entry delete operations work with LPM rules, with +each rule covering for a multitude of lookup keys (destination IP addresses) +that share the same data (next hop). */ +struct rte_table_lpm_ipv6_key { + /** IP address */ + uint8_t ip[RTE_LPM_IPV6_ADDR_SIZE]; + + /** IP address depth. The most significant "depth" bits of the IP + address specify the network part of the IP address, while the rest of + the bits specify the host part of the address and are ignored for the + purpose of route specification. 
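[Editor's note, not part of the upstream patch] A hedged example of filling the IPv6 rule key described below; the prefix value is illustrative.

/* 2001:db8::/32; the remaining address bytes default to zero */
struct rte_table_lpm_ipv6_key k6 = {
	.ip = { 0x20, 0x01, 0x0d, 0xb8 },
	.depth = 32,
};
/* the rule is then added exactly like the IPv4 variant, via rte_table_lpm_ipv6_ops.f_add() */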
*/ + uint8_t depth; +}; + +/** LPM table operations */ +extern struct rte_table_ops rte_table_lpm_ipv6_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_stub.c b/src/spdk/dpdk/lib/librte_table/rte_table_stub.c new file mode 100644 index 00000000..9ce7be9c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_stub.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include + +#include +#include + +#include "rte_table_stub.h" + +#ifdef RTE_TABLE_STATS_COLLECT + +#define RTE_TABLE_LPM_STATS_PKTS_IN_ADD(table, val) \ + table->stats.n_pkts_in += val +#define RTE_TABLE_LPM_STATS_PKTS_LOOKUP_MISS(table, val) \ + table->stats.n_pkts_lookup_miss += val + +#else + +#define RTE_TABLE_LPM_STATS_PKTS_IN_ADD(table, val) +#define RTE_TABLE_LPM_STATS_PKTS_LOOKUP_MISS(table, val) + +#endif + +struct rte_table_stub { + struct rte_table_stats stats; +}; + +static void * +rte_table_stub_create(__rte_unused void *params, + __rte_unused int socket_id, + __rte_unused uint32_t entry_size) +{ + struct rte_table_stub *stub; + uint32_t size; + + size = sizeof(struct rte_table_stub); + stub = rte_zmalloc_socket("TABLE", size, RTE_CACHE_LINE_SIZE, + socket_id); + if (stub == NULL) { + RTE_LOG(ERR, TABLE, + "%s: Cannot allocate %u bytes for stub table\n", + __func__, size); + return NULL; + } + + return stub; +} + +static int +rte_table_stub_lookup( + __rte_unused void *table, + __rte_unused struct rte_mbuf **pkts, + __rte_unused uint64_t pkts_mask, + uint64_t *lookup_hit_mask, + __rte_unused void **entries) +{ + __rte_unused struct rte_table_stub *stub = (struct rte_table_stub *) table; + __rte_unused uint32_t n_pkts_in = __builtin_popcountll(pkts_mask); + + RTE_TABLE_LPM_STATS_PKTS_IN_ADD(stub, n_pkts_in); + *lookup_hit_mask = 0; + RTE_TABLE_LPM_STATS_PKTS_LOOKUP_MISS(stub, n_pkts_in); + + return 0; +} + +static int +rte_table_stub_stats_read(void *table, struct rte_table_stats *stats, int clear) +{ + struct rte_table_stub *t = table; + + if (stats != NULL) + memcpy(stats, &t->stats, sizeof(t->stats)); + + if (clear) + memset(&t->stats, 0, sizeof(t->stats)); + + return 0; +} + +struct rte_table_ops rte_table_stub_ops = { + .f_create = rte_table_stub_create, + .f_free = NULL, + .f_add = NULL, + .f_delete = NULL, + .f_add_bulk = NULL, + .f_delete_bulk = NULL, + .f_lookup = rte_table_stub_lookup, + .f_stats = rte_table_stub_stats_read, +}; diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_stub.h b/src/spdk/dpdk/lib/librte_table/rte_table_stub.h new file mode 100644 index 00000000..2b407397 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_stub.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef __INCLUDE_RTE_TABLE_STUB_H__ +#define __INCLUDE_RTE_TABLE_STUB_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file + * RTE Table Stub + * + * The stub table lookup operation produces lookup miss for all input packets. 
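[Editor's note, not part of the upstream patch] A small caller-side sketch of what the stub table guarantees; pkts, pkts_mask and entries are assumed to come from the surrounding pipeline, and the usual rte_mbuf.h / rte_table_stub.h includes are implied.

static void
stub_example(struct rte_mbuf **pkts, uint64_t pkts_mask, void **entries)
{
	/* create ignores the parameter and entry-size arguments */
	void *t = rte_table_stub_ops.f_create(NULL, 0 /* socket */, 0);
	uint64_t hit_mask;

	rte_table_stub_ops.f_lookup(t, pkts, pkts_mask, &hit_mask, entries);
	/* hit_mask is always 0: every packet is reported as a lookup miss */
}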
+ * + ***/ + +#include + +#include "rte_table.h" + +/** Stub table parameters: NONE */ + +/** Stub table operations */ +extern struct rte_table_ops rte_table_stub_ops; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_table/rte_table_version.map b/src/spdk/dpdk/lib/librte_table/rte_table_version.map new file mode 100644 index 00000000..6237252b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_table/rte_table_version.map @@ -0,0 +1,20 @@ +DPDK_17.11 { + global: + + rte_table_acl_ops; + rte_table_array_ops; + rte_table_hash_cuckoo_ops; + rte_table_hash_ext_ops; + rte_table_hash_key16_ext_ops; + rte_table_hash_key16_lru_ops; + rte_table_hash_key32_ext_ops; + rte_table_hash_key32_lru_ops; + rte_table_hash_key8_ext_ops; + rte_table_hash_key8_lru_ops; + rte_table_hash_lru_ops; + rte_table_lpm_ipv6_ops; + rte_table_lpm_ops; + rte_table_stub_ops; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_timer/Makefile b/src/spdk/dpdk/lib/librte_timer/Makefile new file mode 100644 index 00000000..4ebd5289 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_timer/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_timer.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 +LDLIBS += -lrte_eal + +EXPORT_MAP := rte_timer_version.map + +LIBABIVER := 1 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_TIMER) := rte_timer.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_TIMER)-include := rte_timer.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_timer/meson.build b/src/spdk/dpdk/lib/librte_timer/meson.build new file mode 100644 index 00000000..89b17e03 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_timer/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +sources = files('rte_timer.c') +headers = files('rte_timer.h') diff --git a/src/spdk/dpdk/lib/librte_timer/rte_timer.c b/src/spdk/dpdk/lib/librte_timer/rte_timer.c new file mode 100644 index 00000000..590488c7 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_timer/rte_timer.c @@ -0,0 +1,617 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rte_timer.h" + +LIST_HEAD(rte_timer_list, rte_timer); + +struct priv_timer { + struct rte_timer pending_head; /**< dummy timer instance to head up list */ + rte_spinlock_t list_lock; /**< lock to protect list access */ + + /** per-core variable that true if a timer was updated on this + * core since last reset of the variable */ + int updated; + + /** track the current depth of the skiplist */ + unsigned curr_skiplist_depth; + + unsigned prev_lcore; /**< used for lcore round robin */ + + /** running timer on this lcore now */ + struct rte_timer *running_tim; + +#ifdef RTE_LIBRTE_TIMER_DEBUG + /** per-lcore statistics */ + struct rte_timer_debug_stats stats; +#endif +} __rte_cache_aligned; + +/** per-lcore private info for timers */ +static struct priv_timer priv_timer[RTE_MAX_LCORE]; + +/* when debug is enabled, store some statistics */ +#ifdef RTE_LIBRTE_TIMER_DEBUG +#define __TIMER_STAT_ADD(name, n) do { \ + unsigned __lcore_id = rte_lcore_id(); \ + if (__lcore_id < RTE_MAX_LCORE) \ + priv_timer[__lcore_id].stats.name += (n); \ + } 
while(0) +#else +#define __TIMER_STAT_ADD(name, n) do {} while(0) +#endif + +/* Init the timer library. */ +void +rte_timer_subsystem_init(void) +{ + unsigned lcore_id; + + /* since priv_timer is static, it's zeroed by default, so only init some + * fields. + */ + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) { + rte_spinlock_init(&priv_timer[lcore_id].list_lock); + priv_timer[lcore_id].prev_lcore = lcore_id; + } +} + +/* Initialize the timer handle tim for use */ +void +rte_timer_init(struct rte_timer *tim) +{ + union rte_timer_status status; + + status.state = RTE_TIMER_STOP; + status.owner = RTE_TIMER_NO_OWNER; + tim->status.u32 = status.u32; +} + +/* + * if timer is pending or stopped (or running on the same core than + * us), mark timer as configuring, and on success return the previous + * status of the timer + */ +static int +timer_set_config_state(struct rte_timer *tim, + union rte_timer_status *ret_prev_status) +{ + union rte_timer_status prev_status, status; + int success = 0; + unsigned lcore_id; + + lcore_id = rte_lcore_id(); + + /* wait that the timer is in correct status before update, + * and mark it as being configured */ + while (success == 0) { + prev_status.u32 = tim->status.u32; + + /* timer is running on another core + * or ready to run on local core, exit + */ + if (prev_status.state == RTE_TIMER_RUNNING && + (prev_status.owner != (uint16_t)lcore_id || + tim != priv_timer[lcore_id].running_tim)) + return -1; + + /* timer is being configured on another core */ + if (prev_status.state == RTE_TIMER_CONFIG) + return -1; + + /* here, we know that timer is stopped or pending, + * mark it atomically as being configured */ + status.state = RTE_TIMER_CONFIG; + status.owner = (int16_t)lcore_id; + success = rte_atomic32_cmpset(&tim->status.u32, + prev_status.u32, + status.u32); + } + + ret_prev_status->u32 = prev_status.u32; + return 0; +} + +/* + * if timer is pending, mark timer as running + */ +static int +timer_set_running_state(struct rte_timer *tim) +{ + union rte_timer_status prev_status, status; + unsigned lcore_id = rte_lcore_id(); + int success = 0; + + /* wait that the timer is in correct status before update, + * and mark it as running */ + while (success == 0) { + prev_status.u32 = tim->status.u32; + + /* timer is not pending anymore */ + if (prev_status.state != RTE_TIMER_PENDING) + return -1; + + /* here, we know that timer is stopped or pending, + * mark it atomically as being configured */ + status.state = RTE_TIMER_RUNNING; + status.owner = (int16_t)lcore_id; + success = rte_atomic32_cmpset(&tim->status.u32, + prev_status.u32, + status.u32); + } + + return 0; +} + +/* + * Return a skiplist level for a new entry. + * This probabilistically gives a level with p=1/4 that an entry at level n + * will also appear at level n+1. + */ +static uint32_t +timer_get_skiplist_level(unsigned curr_depth) +{ +#ifdef RTE_LIBRTE_TIMER_DEBUG + static uint32_t i, count = 0; + static uint32_t levels[MAX_SKIPLIST_DEPTH] = {0}; +#endif + + /* probability value is 1/4, i.e. all at level 0, 1 in 4 is at level 1, + * 1 in 16 at level 2, 1 in 64 at level 3, etc. Calculated using lowest + * bit position of a (pseudo)random number. + */ + uint32_t rand = rte_rand() & (UINT32_MAX - 1); + uint32_t level = rand == 0 ? 
MAX_SKIPLIST_DEPTH : (rte_bsf32(rand)-1) / 2; + + /* limit the levels used to one above our current level, so we don't, + * for instance, have a level 0 and a level 7 without anything between + */ + if (level > curr_depth) + level = curr_depth; + if (level >= MAX_SKIPLIST_DEPTH) + level = MAX_SKIPLIST_DEPTH-1; +#ifdef RTE_LIBRTE_TIMER_DEBUG + count ++; + levels[level]++; + if (count % 10000 == 0) + for (i = 0; i < MAX_SKIPLIST_DEPTH; i++) + printf("Level %u: %u\n", (unsigned)i, (unsigned)levels[i]); +#endif + return level; +} + +/* + * For a given time value, get the entries at each level which + * are <= that time value. + */ +static void +timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore, + struct rte_timer **prev) +{ + unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth; + prev[lvl] = &priv_timer[tim_lcore].pending_head; + while(lvl != 0) { + lvl--; + prev[lvl] = prev[lvl+1]; + while (prev[lvl]->sl_next[lvl] && + prev[lvl]->sl_next[lvl]->expire <= time_val) + prev[lvl] = prev[lvl]->sl_next[lvl]; + } +} + +/* + * Given a timer node in the skiplist, find the previous entries for it at + * all skiplist levels. + */ +static void +timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore, + struct rte_timer **prev) +{ + int i; + /* to get a specific entry in the list, look for just lower than the time + * values, and then increment on each level individually if necessary + */ + timer_get_prev_entries(tim->expire - 1, tim_lcore, prev); + for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) { + while (prev[i]->sl_next[i] != NULL && + prev[i]->sl_next[i] != tim && + prev[i]->sl_next[i]->expire <= tim->expire) + prev[i] = prev[i]->sl_next[i]; + } +} + +/* + * add in list, lock if needed + * timer must be in config state + * timer must not be in a list + */ +static void +timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked) +{ + unsigned lcore_id = rte_lcore_id(); + unsigned lvl; + struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + + /* if timer needs to be scheduled on another core, we need to + * lock the list; if it is on local core, we need to lock if + * we are not called from rte_timer_manage() */ + if (tim_lcore != lcore_id || !local_is_locked) + rte_spinlock_lock(&priv_timer[tim_lcore].list_lock); + + /* find where exactly this element goes in the list of elements + * for each depth. 
*/ + timer_get_prev_entries(tim->expire, tim_lcore, prev); + + /* now assign it a new level and add at that level */ + const unsigned tim_level = timer_get_skiplist_level( + priv_timer[tim_lcore].curr_skiplist_depth); + if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth) + priv_timer[tim_lcore].curr_skiplist_depth++; + + lvl = tim_level; + while (lvl > 0) { + tim->sl_next[lvl] = prev[lvl]->sl_next[lvl]; + prev[lvl]->sl_next[lvl] = tim; + lvl--; + } + tim->sl_next[0] = prev[0]->sl_next[0]; + prev[0]->sl_next[0] = tim; + + /* save the lowest list entry into the expire field of the dummy hdr + * NOTE: this is not atomic on 32-bit*/ + priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\ + pending_head.sl_next[0]->expire; + + if (tim_lcore != lcore_id || !local_is_locked) + rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock); +} + +/* + * del from list, lock if needed + * timer must be in config state + * timer must be in a list + */ +static void +timer_del(struct rte_timer *tim, union rte_timer_status prev_status, + int local_is_locked) +{ + unsigned lcore_id = rte_lcore_id(); + unsigned prev_owner = prev_status.owner; + int i; + struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1]; + + /* if timer needs is pending another core, we need to lock the + * list; if it is on local core, we need to lock if we are not + * called from rte_timer_manage() */ + if (prev_owner != lcore_id || !local_is_locked) + rte_spinlock_lock(&priv_timer[prev_owner].list_lock); + + /* save the lowest list entry into the expire field of the dummy hdr. + * NOTE: this is not atomic on 32-bit */ + if (tim == priv_timer[prev_owner].pending_head.sl_next[0]) + priv_timer[prev_owner].pending_head.expire = + ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire); + + /* adjust pointers from previous entries to point past this */ + timer_get_prev_entries_for_node(tim, prev_owner, prev); + for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) { + if (prev[i]->sl_next[i] == tim) + prev[i]->sl_next[i] = tim->sl_next[i]; + } + + /* in case we deleted last entry at a level, adjust down max level */ + for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) + if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL) + priv_timer[prev_owner].curr_skiplist_depth --; + else + break; + + if (prev_owner != lcore_id || !local_is_locked) + rte_spinlock_unlock(&priv_timer[prev_owner].list_lock); +} + +/* Reset and start the timer associated with the timer handle (private func) */ +static int +__rte_timer_reset(struct rte_timer *tim, uint64_t expire, + uint64_t period, unsigned tim_lcore, + rte_timer_cb_t fct, void *arg, + int local_is_locked) +{ + union rte_timer_status prev_status, status; + int ret; + unsigned lcore_id = rte_lcore_id(); + + /* round robin for tim_lcore */ + if (tim_lcore == (unsigned)LCORE_ID_ANY) { + if (lcore_id < RTE_MAX_LCORE) { + /* EAL thread with valid lcore_id */ + tim_lcore = rte_get_next_lcore( + priv_timer[lcore_id].prev_lcore, + 0, 1); + priv_timer[lcore_id].prev_lcore = tim_lcore; + } else + /* non-EAL thread do not run rte_timer_manage(), + * so schedule the timer on the first enabled lcore. 
*/ + tim_lcore = rte_get_next_lcore(LCORE_ID_ANY, 0, 1); + } + + /* wait that the timer is in correct status before update, + * and mark it as being configured */ + ret = timer_set_config_state(tim, &prev_status); + if (ret < 0) + return -1; + + __TIMER_STAT_ADD(reset, 1); + if (prev_status.state == RTE_TIMER_RUNNING && + lcore_id < RTE_MAX_LCORE) { + priv_timer[lcore_id].updated = 1; + } + + /* remove it from list */ + if (prev_status.state == RTE_TIMER_PENDING) { + timer_del(tim, prev_status, local_is_locked); + __TIMER_STAT_ADD(pending, -1); + } + + tim->period = period; + tim->expire = expire; + tim->f = fct; + tim->arg = arg; + + __TIMER_STAT_ADD(pending, 1); + timer_add(tim, tim_lcore, local_is_locked); + + /* update state: as we are in CONFIG state, only us can modify + * the state so we don't need to use cmpset() here */ + rte_wmb(); + status.state = RTE_TIMER_PENDING; + status.owner = (int16_t)tim_lcore; + tim->status.u32 = status.u32; + + return 0; +} + +/* Reset and start the timer associated with the timer handle tim */ +int +rte_timer_reset(struct rte_timer *tim, uint64_t ticks, + enum rte_timer_type type, unsigned tim_lcore, + rte_timer_cb_t fct, void *arg) +{ + uint64_t cur_time = rte_get_timer_cycles(); + uint64_t period; + + if (unlikely((tim_lcore != (unsigned)LCORE_ID_ANY) && + !(rte_lcore_is_enabled(tim_lcore) || + rte_lcore_has_role(tim_lcore, ROLE_SERVICE)))) + return -1; + + if (type == PERIODICAL) + period = ticks; + else + period = 0; + + return __rte_timer_reset(tim, cur_time + ticks, period, tim_lcore, + fct, arg, 0); +} + +/* loop until rte_timer_reset() succeed */ +void +rte_timer_reset_sync(struct rte_timer *tim, uint64_t ticks, + enum rte_timer_type type, unsigned tim_lcore, + rte_timer_cb_t fct, void *arg) +{ + while (rte_timer_reset(tim, ticks, type, tim_lcore, + fct, arg) != 0) + rte_pause(); +} + +/* Stop the timer associated with the timer handle tim */ +int +rte_timer_stop(struct rte_timer *tim) +{ + union rte_timer_status prev_status, status; + unsigned lcore_id = rte_lcore_id(); + int ret; + + /* wait that the timer is in correct status before update, + * and mark it as being configured */ + ret = timer_set_config_state(tim, &prev_status); + if (ret < 0) + return -1; + + __TIMER_STAT_ADD(stop, 1); + if (prev_status.state == RTE_TIMER_RUNNING && + lcore_id < RTE_MAX_LCORE) { + priv_timer[lcore_id].updated = 1; + } + + /* remove it from list */ + if (prev_status.state == RTE_TIMER_PENDING) { + timer_del(tim, prev_status, 0); + __TIMER_STAT_ADD(pending, -1); + } + + /* mark timer as stopped */ + rte_wmb(); + status.state = RTE_TIMER_STOP; + status.owner = RTE_TIMER_NO_OWNER; + tim->status.u32 = status.u32; + + return 0; +} + +/* loop until rte_timer_stop() succeed */ +void +rte_timer_stop_sync(struct rte_timer *tim) +{ + while (rte_timer_stop(tim) != 0) + rte_pause(); +} + +/* Test the PENDING status of the timer handle tim */ +int +rte_timer_pending(struct rte_timer *tim) +{ + return tim->status.state == RTE_TIMER_PENDING; +} + +/* must be called periodically, run all timer that expired */ +void rte_timer_manage(void) +{ + union rte_timer_status status; + struct rte_timer *tim, *next_tim; + struct rte_timer *run_first_tim, **pprev; + unsigned lcore_id = rte_lcore_id(); + struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1]; + uint64_t cur_time; + int i, ret; + + /* timer manager only runs on EAL thread with valid lcore_id */ + assert(lcore_id < RTE_MAX_LCORE); + + __TIMER_STAT_ADD(manage, 1); + /* optimize for the case where per-cpu list is empty */ + if 
(priv_timer[lcore_id].pending_head.sl_next[0] == NULL) + return; + cur_time = rte_get_timer_cycles(); + +#ifdef RTE_ARCH_64 + /* on 64-bit the value cached in the pending_head.expired will be + * updated atomically, so we can consult that for a quick check here + * outside the lock */ + if (likely(priv_timer[lcore_id].pending_head.expire > cur_time)) + return; +#endif + + /* browse ordered list, add expired timers in 'expired' list */ + rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + + /* if nothing to do just unlock and return */ + if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL || + priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) { + rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + return; + } + + /* save start of list of expired timers */ + tim = priv_timer[lcore_id].pending_head.sl_next[0]; + + /* break the existing list at current time point */ + timer_get_prev_entries(cur_time, lcore_id, prev); + for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) { + if (prev[i] == &priv_timer[lcore_id].pending_head) + continue; + priv_timer[lcore_id].pending_head.sl_next[i] = + prev[i]->sl_next[i]; + if (prev[i]->sl_next[i] == NULL) + priv_timer[lcore_id].curr_skiplist_depth--; + prev[i] ->sl_next[i] = NULL; + } + + /* transition run-list from PENDING to RUNNING */ + run_first_tim = tim; + pprev = &run_first_tim; + + for ( ; tim != NULL; tim = next_tim) { + next_tim = tim->sl_next[0]; + + ret = timer_set_running_state(tim); + if (likely(ret == 0)) { + pprev = &tim->sl_next[0]; + } else { + /* another core is trying to re-config this one, + * remove it from local expired list + */ + *pprev = next_tim; + } + } + + /* update the next to expire timer value */ + priv_timer[lcore_id].pending_head.expire = + (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 
0 : + priv_timer[lcore_id].pending_head.sl_next[0]->expire; + + rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + + /* now scan expired list and call callbacks */ + for (tim = run_first_tim; tim != NULL; tim = next_tim) { + next_tim = tim->sl_next[0]; + priv_timer[lcore_id].updated = 0; + priv_timer[lcore_id].running_tim = tim; + + /* execute callback function with list unlocked */ + tim->f(tim, tim->arg); + + __TIMER_STAT_ADD(pending, -1); + /* the timer was stopped or reloaded by the callback + * function, we have nothing to do here */ + if (priv_timer[lcore_id].updated == 1) + continue; + + if (tim->period == 0) { + /* remove from done list and mark timer as stopped */ + status.state = RTE_TIMER_STOP; + status.owner = RTE_TIMER_NO_OWNER; + rte_wmb(); + tim->status.u32 = status.u32; + } + else { + /* keep it in list and mark timer as pending */ + rte_spinlock_lock(&priv_timer[lcore_id].list_lock); + status.state = RTE_TIMER_PENDING; + __TIMER_STAT_ADD(pending, 1); + status.owner = (int16_t)lcore_id; + rte_wmb(); + tim->status.u32 = status.u32; + __rte_timer_reset(tim, tim->expire + tim->period, + tim->period, lcore_id, tim->f, tim->arg, 1); + rte_spinlock_unlock(&priv_timer[lcore_id].list_lock); + } + } + priv_timer[lcore_id].running_tim = NULL; +} + +/* dump statistics about timers */ +void rte_timer_dump_stats(FILE *f) +{ +#ifdef RTE_LIBRTE_TIMER_DEBUG + struct rte_timer_debug_stats sum; + unsigned lcore_id; + + memset(&sum, 0, sizeof(sum)); + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + sum.reset += priv_timer[lcore_id].stats.reset; + sum.stop += priv_timer[lcore_id].stats.stop; + sum.manage += priv_timer[lcore_id].stats.manage; + sum.pending += priv_timer[lcore_id].stats.pending; + } + fprintf(f, "Timer statistics:\n"); + fprintf(f, " reset = %"PRIu64"\n", sum.reset); + fprintf(f, " stop = %"PRIu64"\n", sum.stop); + fprintf(f, " manage = %"PRIu64"\n", sum.manage); + fprintf(f, " pending = %"PRIu64"\n", sum.pending); +#else + fprintf(f, "No timer statistics, RTE_LIBRTE_TIMER_DEBUG is disabled\n"); +#endif +} diff --git a/src/spdk/dpdk/lib/librte_timer/rte_timer.h b/src/spdk/dpdk/lib/librte_timer/rte_timer.h new file mode 100644 index 00000000..9b95cd2c --- /dev/null +++ b/src/spdk/dpdk/lib/librte_timer/rte_timer.h @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_TIMER_H_ +#define _RTE_TIMER_H_ + +/** + * @file + RTE Timer + * + * This library provides a timer service to RTE Data Plane execution + * units that allows the execution of callback functions asynchronously. + * + * - Timers can be periodic or single (one-shot). + * - The timers can be loaded from one core and executed on another. This has + * to be specified in the call to rte_timer_reset(). + * - High precision is possible. NOTE: this depends on the call frequency to + * rte_timer_manage() that check the timer expiration for the local core. + * - If not used in an application, for improved performance, it can be + * disabled at compilation time by not calling the rte_timer_manage() + * to improve performance. + * + * The timer library uses the rte_get_hpet_cycles() function that + * uses the HPET, when available, to provide a reliable time reference. [HPET + * routines are provided by EAL, which falls back to using the chip TSC (time- + * stamp counter) as fallback when HPET is not available] + * + * This library provides an interface to add, delete and restart a + * timer. 
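[Editor's note, not part of the upstream patch] A minimal usage sketch of the timer API documented below, assuming a single EAL lcore loop; my_cb and keep_running are hypothetical names, and rte_timer_subsystem_init() is assumed to be called once after rte_eal_init().

static volatile int keep_running = 1;

static void
my_cb(struct rte_timer *tim __rte_unused, void *arg __rte_unused)
{
	/* work to run on every expiry */
}

static int
my_lcore_main(__rte_unused void *arg)
{
	static struct rte_timer tim;

	rte_timer_init(&tim);
	/* periodic timer: fire my_cb roughly once per second on the calling lcore */
	rte_timer_reset(&tim, rte_get_timer_hz(), PERIODICAL,
			rte_lcore_id(), my_cb, NULL);

	/* expiries are only detected when this lcore calls rte_timer_manage() */
	while (keep_running)
		rte_timer_manage();
	return 0;
}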
The API is based on the BSD callout(9) API with a few + * differences. + * + * See the RTE architecture documentation for more information about the + * design of this library. + */ + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_TIMER_STOP 0 /**< State: timer is stopped. */ +#define RTE_TIMER_PENDING 1 /**< State: timer is scheduled. */ +#define RTE_TIMER_RUNNING 2 /**< State: timer function is running. */ +#define RTE_TIMER_CONFIG 3 /**< State: timer is being configured. */ + +#define RTE_TIMER_NO_OWNER -2 /**< Timer has no owner. */ + +/** + * Timer type: Periodic or single (one-shot). + */ +enum rte_timer_type { + SINGLE, + PERIODICAL +}; + +/** + * Timer status: A union of the state (stopped, pending, running, + * config) and an owner (the id of the lcore that owns the timer). + */ +union rte_timer_status { + RTE_STD_C11 + struct { + uint16_t state; /**< Stop, pending, running, config. */ + int16_t owner; /**< The lcore that owns the timer. */ + }; + uint32_t u32; /**< To atomic-set status + owner. */ +}; + +#ifdef RTE_LIBRTE_TIMER_DEBUG +/** + * A structure that stores the timer statistics (per-lcore). + */ +struct rte_timer_debug_stats { + uint64_t reset; /**< Number of success calls to rte_timer_reset(). */ + uint64_t stop; /**< Number of success calls to rte_timer_stop(). */ + uint64_t manage; /**< Number of calls to rte_timer_manage(). */ + uint64_t pending; /**< Number of pending/running timers. */ +}; +#endif + +struct rte_timer; + +/** + * Callback function type for timer expiry. + */ +typedef void (*rte_timer_cb_t)(struct rte_timer *, void *); + +#define MAX_SKIPLIST_DEPTH 10 + +/** + * A structure describing a timer in RTE. + */ +struct rte_timer +{ + uint64_t expire; /**< Time when timer expire. */ + struct rte_timer *sl_next[MAX_SKIPLIST_DEPTH]; + volatile union rte_timer_status status; /**< Status of timer. */ + uint64_t period; /**< Period of timer (0 if not periodic). */ + rte_timer_cb_t f; /**< Callback function. */ + void *arg; /**< Argument to callback function. */ +}; + + +#ifdef __cplusplus +/** + * A C++ static initializer for a timer structure. + */ +#define RTE_TIMER_INITIALIZER { \ + 0, \ + {NULL}, \ + {{RTE_TIMER_STOP, RTE_TIMER_NO_OWNER}}, \ + 0, \ + NULL, \ + NULL, \ + } +#else +/** + * A static initializer for a timer structure. + */ +#define RTE_TIMER_INITIALIZER { \ + .status = {{ \ + .state = RTE_TIMER_STOP, \ + .owner = RTE_TIMER_NO_OWNER, \ + }}, \ + } +#endif + +/** + * Initialize the timer library. + * + * Initializes internal variables (list, locks and so on) for the RTE + * timer library. + */ +void rte_timer_subsystem_init(void); + +/** + * Initialize a timer handle. + * + * The rte_timer_init() function initializes the timer handle *tim* + * for use. No operations can be performed on a timer before it is + * initialized. + * + * @param tim + * The timer to initialize. + */ +void rte_timer_init(struct rte_timer *tim); + +/** + * Reset and start the timer associated with the timer handle. + * + * The rte_timer_reset() function resets and starts the timer + * associated with the timer handle *tim*. When the timer expires after + * *ticks* HPET cycles, the function specified by *fct* will be called + * with the argument *arg* on core *tim_lcore*. + * + * If the timer associated with the timer handle is already running + * (in the RUNNING state), the function will fail. 
The user has to check + * the return value of the function to see if there is a chance that the + * timer is in the RUNNING state. + * + * If the timer is being configured on another core (the CONFIG state), + * it will also fail. + * + * If the timer is pending or stopped, it will be rescheduled with the + * new parameters. + * + * @param tim + * The timer handle. + * @param ticks + * The number of cycles (see rte_get_hpet_hz()) before the callback + * function is called. + * @param type + * The type can be either: + * - PERIODICAL: The timer is automatically reloaded after execution + * (returns to the PENDING state) + * - SINGLE: The timer is one-shot, that is, the timer goes to a + * STOPPED state after execution. + * @param tim_lcore + * The ID of the lcore where the timer callback function has to be + * executed. If tim_lcore is LCORE_ID_ANY, the timer library will + * launch it on a different core for each call (round-robin). + * @param fct + * The callback function of the timer. + * @param arg + * The user argument of the callback function. + * @return + * - 0: Success; the timer is scheduled. + * - (-1): Timer is in the RUNNING or CONFIG state. + */ +int rte_timer_reset(struct rte_timer *tim, uint64_t ticks, + enum rte_timer_type type, unsigned tim_lcore, + rte_timer_cb_t fct, void *arg); + + +/** + * Loop until rte_timer_reset() succeeds. + * + * Reset and start the timer associated with the timer handle. Always + * succeed. See rte_timer_reset() for details. + * + * @param tim + * The timer handle. + * @param ticks + * The number of cycles (see rte_get_hpet_hz()) before the callback + * function is called. + * @param type + * The type can be either: + * - PERIODICAL: The timer is automatically reloaded after execution + * (returns to the PENDING state) + * - SINGLE: The timer is one-shot, that is, the timer goes to a + * STOPPED state after execution. + * @param tim_lcore + * The ID of the lcore where the timer callback function has to be + * executed. If tim_lcore is LCORE_ID_ANY, the timer library will + * launch it on a different core for each call (round-robin). + * @param fct + * The callback function of the timer. + * @param arg + * The user argument of the callback function. + */ +void +rte_timer_reset_sync(struct rte_timer *tim, uint64_t ticks, + enum rte_timer_type type, unsigned tim_lcore, + rte_timer_cb_t fct, void *arg); + +/** + * Stop a timer. + * + * The rte_timer_stop() function stops the timer associated with the + * timer handle *tim*. It may fail if the timer is currently running or + * being configured. + * + * If the timer is pending or stopped (for instance, already expired), + * the function will succeed. The timer handle tim must have been + * initialized using rte_timer_init(), otherwise, undefined behavior + * will occur. + * + * This function can be called safely from a timer callback. If it + * succeeds, the timer is not referenced anymore by the timer library + * and the timer structure can be freed (even in the callback + * function). + * + * @param tim + * The timer handle. + * @return + * - 0: Success; the timer is stopped. + * - (-1): The timer is in the RUNNING or CONFIG state. + */ +int rte_timer_stop(struct rte_timer *tim); + + +/** + * Loop until rte_timer_stop() succeeds. + * + * After a call to this function, the timer identified by *tim* is + * stopped. See rte_timer_stop() for details. + * + * @param tim + * The timer handle. + */ +void rte_timer_stop_sync(struct rte_timer *tim); + +/** + * Test if a timer is pending. 
+ * + * The rte_timer_pending() function tests the PENDING status + * of the timer handle *tim*. A PENDING timer is one that has been + * scheduled and whose function has not yet been called. + * + * @param tim + * The timer handle. + * @return + * - 0: The timer is not pending. + * - 1: The timer is pending. + */ +int rte_timer_pending(struct rte_timer *tim); + +/** + * Manage the timer list and execute callback functions. + * + * This function must be called periodically from EAL lcores + * main_loop(). It browses the list of pending timers and runs all + * timers that are expired. + * + * The precision of the timer depends on the call frequency of this + * function. However, the more often the function is called, the more + * CPU resources it will use. + */ +void rte_timer_manage(void); + +/** + * Dump statistics about timers. + * + * @param f + * A pointer to a file for output + */ +void rte_timer_dump_stats(FILE *f); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_TIMER_H_ */ diff --git a/src/spdk/dpdk/lib/librte_timer/rte_timer_version.map b/src/spdk/dpdk/lib/librte_timer/rte_timer_version.map new file mode 100644 index 00000000..9b2e4b86 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_timer/rte_timer_version.map @@ -0,0 +1,15 @@ +DPDK_2.0 { + global: + + rte_timer_dump_stats; + rte_timer_init; + rte_timer_manage; + rte_timer_pending; + rte_timer_reset; + rte_timer_reset_sync; + rte_timer_stop; + rte_timer_stop_sync; + rte_timer_subsystem_init; + + local: *; +}; diff --git a/src/spdk/dpdk/lib/librte_vhost/Makefile b/src/spdk/dpdk/lib/librte_vhost/Makefile new file mode 100644 index 00000000..de431fbb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/Makefile @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_vhost.a + +EXPORT_MAP := rte_vhost_version.map + +LIBABIVER := 4 + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64 +CFLAGS += -I vhost_user +LDLIBS += -lpthread + +ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) +LDLIBS += -lnuma +endif +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net \ + -lrte_cryptodev -lrte_hash + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \ + vhost_user.c virtio_net.c vdpa.c + +# install includes +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h + +# only compile vhost crypto when cryptodev is enabled +ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y) +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_crypto.c +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost_crypto.h +endif + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_vhost/fd_man.c b/src/spdk/dpdk/lib/librte_vhost/fd_man.c new file mode 100644 index 00000000..38347ab1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/fd_man.c @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fd_man.h" + + +#define RTE_LOGTYPE_VHOST_FDMAN RTE_LOGTYPE_USER1 + +#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) + +static int +get_last_valid_idx(struct fdset *pfdset, int last_valid_idx) +{ + int i; + + for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--) + ; + + return i; +} + +static void +fdset_move(struct fdset *pfdset, int dst, int src) +{ + pfdset->fd[dst] = 
pfdset->fd[src]; + pfdset->rwfds[dst] = pfdset->rwfds[src]; +} + +static void +fdset_shrink_nolock(struct fdset *pfdset) +{ + int i; + int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); + + for (i = 0; i < last_valid_idx; i++) { + if (pfdset->fd[i].fd != -1) + continue; + + fdset_move(pfdset, i, last_valid_idx); + last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); + } + pfdset->num = last_valid_idx + 1; +} + +/* + * Find deleted fd entries and remove them + */ +static void +fdset_shrink(struct fdset *pfdset) +{ + pthread_mutex_lock(&pfdset->fd_mutex); + fdset_shrink_nolock(pfdset); + pthread_mutex_unlock(&pfdset->fd_mutex); +} + +/** + * Returns the index in the fdset for a given fd. + * @return + * index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++) + ; + + return i == pfdset->num ? -1 : i; +} + +static void +fdset_add_fd(struct fdset *pfdset, int idx, int fd, + fd_cb rcb, fd_cb wcb, void *dat) +{ + struct fdentry *pfdentry = &pfdset->fd[idx]; + struct pollfd *pfd = &pfdset->rwfds[idx]; + + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + pfd->fd = fd; + pfd->events = rcb ? POLLIN : 0; + pfd->events |= wcb ? POLLOUT : 0; + pfd->revents = 0; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + if (pfdset == NULL) + return; + + for (i = 0; i < MAX_FDS; i++) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].dat = NULL; + } + pfdset->num = 0; +} + +/** + * Register the fd in the fdset with read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -1; + + pthread_mutex_lock(&pfdset->fd_mutex); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + fdset_shrink_nolock(pfdset); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } + } + + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); + pthread_mutex_unlock(&pfdset->fd_mutex); + + return 0; +} + +/** + * Unregister the fd from the fdset. + * Returns context of a given fd or NULL. + */ +void * +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + void *dat = NULL; + + if (pfdset == NULL || fd == -1) + return NULL; + + do { + pthread_mutex_lock(&pfdset->fd_mutex); + + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy == 0) { + /* busy indicates r/wcb is executing! */ + dat = pfdset->fd[i].dat; + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + i = -1; + } + pthread_mutex_unlock(&pfdset->fd_mutex); + } while (i != -1); + + return dat; +} + +/** + * Unregister the fd from the fdset. + * + * If parameters are invalid, return directly -2. + * And check whether fd is busy, if yes, return -1. + * Otherwise, try to delete the fd from fdset and + * return true. 
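[Editor's note, not part of the upstream patch] A sketch of how a caller (for instance the vhost socket layer) is expected to drive the fdset API declared in fd_man.h; my_fdset, my_read_cb and listen_fd are assumed names. The mutex is statically initialized because fdset_init() only clears the fd slots.

static struct fdset my_fdset = {
	.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
};

static void
my_read_cb(int fd, void *dat, int *remove)
{
	/* consume data on fd; set *remove = 1 to drop the entry from the set */
}

static void
start_dispatch(int listen_fd)	/* listen_fd: an already-open fd, assumed */
{
	pthread_t tid;

	fdset_init(&my_fdset);				/* mark all slots empty */
	fdset_add(&my_fdset, listen_fd, my_read_cb, NULL, NULL);
	pthread_create(&tid, NULL, fdset_event_dispatch, &my_fdset);	/* poll loop */
}

The dispatch thread marks an entry busy while its callback runs, which is why fdset_del() spins until the callback finishes and fdset_try_del() instead returns -1.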
+ */ +int +fdset_try_del(struct fdset *pfdset, int fd) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -2; + + pthread_mutex_lock(&pfdset->fd_mutex); + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -1; + } + + if (i != -1) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + } + + pthread_mutex_unlock(&pfdset->fd_mutex); + return 0; +} + +/** + * This functions runs in infinite blocking loop until there is no fd in + * pfdset. It calls corresponding r/w handler if there is event on the fd. + * + * Before the callback is called, we set the flag to busy status; If other + * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it + * will wait until the flag is reset to zero(which indicates the callback is + * finished), then it could free the context after fdset_del. + */ +void * +fdset_event_dispatch(void *arg) +{ + int i; + struct pollfd *pfd; + struct fdentry *pfdentry; + fd_cb rcb, wcb; + void *dat; + int fd, numfds; + int remove1, remove2; + int need_shrink; + struct fdset *pfdset = arg; + int val; + + if (pfdset == NULL) + return NULL; + + while (1) { + + /* + * When poll is blocked, other threads might unregister + * listenfds from and register new listenfds into fdset. + * When poll returns, the entries for listenfds in the fdset + * might have been updated. It is ok if there is unwanted call + * for new listenfds. + */ + pthread_mutex_lock(&pfdset->fd_mutex); + numfds = pfdset->num; + pthread_mutex_unlock(&pfdset->fd_mutex); + + val = poll(pfdset->rwfds, numfds, 1000 /* millisecs */); + if (val < 0) + continue; + + need_shrink = 0; + for (i = 0; i < numfds; i++) { + pthread_mutex_lock(&pfdset->fd_mutex); + + pfdentry = &pfdset->fd[i]; + fd = pfdentry->fd; + pfd = &pfdset->rwfds[i]; + + if (fd < 0) { + need_shrink = 1; + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + if (!pfd->revents) { + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + remove1 = remove2 = 0; + + rcb = pfdentry->rcb; + wcb = pfdentry->wcb; + dat = pfdentry->dat; + pfdentry->busy = 1; + + pthread_mutex_unlock(&pfdset->fd_mutex); + + if (rcb && pfd->revents & (POLLIN | FDPOLLERR)) + rcb(fd, dat, &remove1); + if (wcb && pfd->revents & (POLLOUT | FDPOLLERR)) + wcb(fd, dat, &remove2); + pfdentry->busy = 0; + /* + * fdset_del needs to check busy flag. + * We don't allow fdset_del to be called in callback + * directly. + */ + /* + * When we are to clean up the fd from fdset, + * because the fd is closed in the cb, + * the old fd val could be reused by when creates new + * listen fd in another thread, we couldn't call + * fdset_del. 
+ */ + if (remove1 || remove2) { + pfdentry->fd = -1; + need_shrink = 1; + } + } + + if (need_shrink) + fdset_shrink(pfdset); + } + + return NULL; +} + +static void +fdset_pipe_read_cb(int readfd, void *dat __rte_unused, + int *remove __rte_unused) +{ + char charbuf[16]; + int r = read(readfd, charbuf, sizeof(charbuf)); + /* + * Just an optimization, we don't care if read() failed + * so ignore explicitly its return value to make the + * compiler happy + */ + RTE_SET_USED(r); +} + +void +fdset_pipe_uninit(struct fdset *fdset) +{ + fdset_del(fdset, fdset->u.readfd); + close(fdset->u.readfd); + close(fdset->u.writefd); +} + +int +fdset_pipe_init(struct fdset *fdset) +{ + int ret; + + if (pipe(fdset->u.pipefd) < 0) { + RTE_LOG(ERR, VHOST_FDMAN, + "failed to create pipe for vhost fdset\n"); + return -1; + } + + ret = fdset_add(fdset, fdset->u.readfd, + fdset_pipe_read_cb, NULL, NULL); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_FDMAN, + "failed to add pipe readfd %d into vhost server fdset\n", + fdset->u.readfd); + + fdset_pipe_uninit(fdset); + return -1; + } + + return 0; +} + +void +fdset_pipe_notify(struct fdset *fdset) +{ + int r = write(fdset->u.writefd, "1", 1); + /* + * Just an optimization, we don't care if write() failed + * so ignore explicitly its return value to make the + * compiler happy + */ + RTE_SET_USED(r); + +} diff --git a/src/spdk/dpdk/lib/librte_vhost/fd_man.h b/src/spdk/dpdk/lib/librte_vhost/fd_man.h new file mode 100644 index 00000000..3331bcd9 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/fd_man.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include +#include +#include + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, void *dat, int *remove); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable.*/ + void *dat; /* fd context */ + int busy; /* whether this entry is being used in cb. */ +}; + +struct fdset { + struct pollfd rwfds[MAX_FDS]; + struct fdentry fd[MAX_FDS]; + pthread_mutex_t fd_mutex; + int num; /* current fd number of this fdset */ + + union pipefds { + struct { + int pipefd[2]; + }; + struct { + int readfd; + int writefd; + }; + } u; +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, + fd_cb rcb, fd_cb wcb, void *dat); + +void *fdset_del(struct fdset *pfdset, int fd); +int fdset_try_del(struct fdset *pfdset, int fd); + +void *fdset_event_dispatch(void *arg); + +int fdset_pipe_init(struct fdset *fdset); + +void fdset_pipe_uninit(struct fdset *fdset); + +void fdset_pipe_notify(struct fdset *fdset); + +#endif diff --git a/src/spdk/dpdk/lib/librte_vhost/iotlb.c b/src/spdk/dpdk/lib/librte_vhost/iotlb.c new file mode 100644 index 00000000..c6354fef --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/iotlb.c @@ -0,0 +1,364 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2017 Red Hat, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include + +#include "iotlb.h" +#include "vhost.h" + +struct vhost_iotlb_entry { + TAILQ_ENTRY(vhost_iotlb_entry) next; + + uint64_t iova; + uint64_t uaddr; + uint64_t size; + uint8_t perm; +}; + +#define IOTLB_CACHE_SIZE 2048 + +static void +vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq); + +static void +vhost_user_iotlb_pending_remove_all(struct vhost_virtqueue *vq) +{ + struct vhost_iotlb_entry *node, *temp_node; + + rte_rwlock_write_lock(&vq->iotlb_pending_lock); + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_pending_list, next, temp_node) { + TAILQ_REMOVE(&vq->iotlb_pending_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + } + + rte_rwlock_write_unlock(&vq->iotlb_pending_lock); +} + +bool +vhost_user_iotlb_pending_miss(struct vhost_virtqueue *vq, uint64_t iova, + uint8_t perm) +{ + struct vhost_iotlb_entry *node; + bool found = false; + + rte_rwlock_read_lock(&vq->iotlb_pending_lock); + + TAILQ_FOREACH(node, &vq->iotlb_pending_list, next) { + if ((node->iova == iova) && (node->perm == perm)) { + found = true; + break; + } + } + + rte_rwlock_read_unlock(&vq->iotlb_pending_lock); + + return found; +} + +void +vhost_user_iotlb_pending_insert(struct vhost_virtqueue *vq, + uint64_t iova, uint8_t perm) +{ + struct vhost_iotlb_entry *node; + int ret; + + ret = rte_mempool_get(vq->iotlb_pool, (void **)&node); + if (ret) { + RTE_LOG(DEBUG, VHOST_CONFIG, "IOTLB pool empty, clear entries\n"); + if (!TAILQ_EMPTY(&vq->iotlb_pending_list)) + vhost_user_iotlb_pending_remove_all(vq); + else + vhost_user_iotlb_cache_random_evict(vq); + ret = rte_mempool_get(vq->iotlb_pool, (void **)&node); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, "IOTLB pool still empty, failure\n"); + return; + } + } + + node->iova = iova; + node->perm = perm; + + rte_rwlock_write_lock(&vq->iotlb_pending_lock); + + TAILQ_INSERT_TAIL(&vq->iotlb_pending_list, node, next); + + rte_rwlock_write_unlock(&vq->iotlb_pending_lock); +} + +void +vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size, uint8_t perm) +{ + struct vhost_iotlb_entry *node, *temp_node; + + rte_rwlock_write_lock(&vq->iotlb_pending_lock); + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_pending_list, next, temp_node) { + if (node->iova < iova) + continue; + if 
(node->iova >= iova + size) + continue; + if ((node->perm & perm) != node->perm) + continue; + TAILQ_REMOVE(&vq->iotlb_pending_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + } + + rte_rwlock_write_unlock(&vq->iotlb_pending_lock); +} + +static void +vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) +{ + struct vhost_iotlb_entry *node, *temp_node; + + rte_rwlock_write_lock(&vq->iotlb_lock); + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { + TAILQ_REMOVE(&vq->iotlb_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + } + + vq->iotlb_cache_nr = 0; + + rte_rwlock_write_unlock(&vq->iotlb_lock); +} + +static void +vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) +{ + struct vhost_iotlb_entry *node, *temp_node; + int entry_idx; + + rte_rwlock_write_lock(&vq->iotlb_lock); + + entry_idx = rte_rand() % vq->iotlb_cache_nr; + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { + if (!entry_idx) { + TAILQ_REMOVE(&vq->iotlb_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + vq->iotlb_cache_nr--; + break; + } + entry_idx--; + } + + rte_rwlock_write_unlock(&vq->iotlb_lock); +} + +void +vhost_user_iotlb_cache_insert(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t uaddr, uint64_t size, uint8_t perm) +{ + struct vhost_iotlb_entry *node, *new_node; + int ret; + + ret = rte_mempool_get(vq->iotlb_pool, (void **)&new_node); + if (ret) { + RTE_LOG(DEBUG, VHOST_CONFIG, "IOTLB pool empty, clear entries\n"); + if (!TAILQ_EMPTY(&vq->iotlb_list)) + vhost_user_iotlb_cache_random_evict(vq); + else + vhost_user_iotlb_pending_remove_all(vq); + ret = rte_mempool_get(vq->iotlb_pool, (void **)&new_node); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, "IOTLB pool still empty, failure\n"); + return; + } + } + + new_node->iova = iova; + new_node->uaddr = uaddr; + new_node->size = size; + new_node->perm = perm; + + rte_rwlock_write_lock(&vq->iotlb_lock); + + TAILQ_FOREACH(node, &vq->iotlb_list, next) { + /* + * Entries must be invalidated before being updated. + * So if iova already in list, assume identical. 
+ */ + if (node->iova == new_node->iova) { + rte_mempool_put(vq->iotlb_pool, new_node); + goto unlock; + } else if (node->iova > new_node->iova) { + TAILQ_INSERT_BEFORE(node, new_node, next); + vq->iotlb_cache_nr++; + goto unlock; + } + } + + TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); + vq->iotlb_cache_nr++; + +unlock: + vhost_user_iotlb_pending_remove(vq, iova, size, perm); + + rte_rwlock_write_unlock(&vq->iotlb_lock); + +} + +void +vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size) +{ + struct vhost_iotlb_entry *node, *temp_node; + + if (unlikely(!size)) + return; + + rte_rwlock_write_lock(&vq->iotlb_lock); + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { + /* Sorted list */ + if (unlikely(iova + size < node->iova)) + break; + + if (iova < node->iova + node->size) { + TAILQ_REMOVE(&vq->iotlb_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + vq->iotlb_cache_nr--; + } + } + + rte_rwlock_write_unlock(&vq->iotlb_lock); +} + +uint64_t +vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t *size, uint8_t perm) +{ + struct vhost_iotlb_entry *node; + uint64_t offset, vva = 0, mapped = 0; + + if (unlikely(!*size)) + goto out; + + TAILQ_FOREACH(node, &vq->iotlb_list, next) { + /* List sorted by iova */ + if (unlikely(iova < node->iova)) + break; + + if (iova >= node->iova + node->size) + continue; + + if (unlikely((perm & node->perm) != perm)) { + vva = 0; + break; + } + + offset = iova - node->iova; + if (!vva) + vva = node->uaddr + offset; + + mapped += node->size - offset; + iova = node->iova + node->size; + + if (mapped >= *size) + break; + } + +out: + /* Only part of the requested chunk is mapped */ + if (unlikely(mapped < *size)) + *size = mapped; + + return vva; +} + +void +vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq) +{ + vhost_user_iotlb_cache_remove_all(vq); + vhost_user_iotlb_pending_remove_all(vq); +} + +int +vhost_user_iotlb_init(struct virtio_net *dev, int vq_index) +{ + char pool_name[RTE_MEMPOOL_NAMESIZE]; + struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; + int socket = 0; + + if (vq->iotlb_pool) { + /* + * The cache has already been initialized, + * just drop all cached and pending entries. + */ + vhost_user_iotlb_flush_all(vq); + } + +#ifdef RTE_LIBRTE_VHOST_NUMA + if (get_mempolicy(&socket, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR) != 0) + socket = 0; +#endif + + rte_rwlock_init(&vq->iotlb_lock); + rte_rwlock_init(&vq->iotlb_pending_lock); + + TAILQ_INIT(&vq->iotlb_list); + TAILQ_INIT(&vq->iotlb_pending_list); + + snprintf(pool_name, sizeof(pool_name), "iotlb_cache_%d_%d", + dev->vid, vq_index); + + /* If already created, free it and recreate */ + vq->iotlb_pool = rte_mempool_lookup(pool_name); + if (vq->iotlb_pool) + rte_mempool_free(vq->iotlb_pool); + + vq->iotlb_pool = rte_mempool_create(pool_name, + IOTLB_CACHE_SIZE, sizeof(struct vhost_iotlb_entry), 0, + 0, 0, NULL, NULL, NULL, socket, + MEMPOOL_F_NO_CACHE_ALIGN | + MEMPOOL_F_SP_PUT | + MEMPOOL_F_SC_GET); + if (!vq->iotlb_pool) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to create IOTLB cache pool (%s)\n", + pool_name); + return -1; + } + + vq->iotlb_cache_nr = 0; + + return 0; +} + diff --git a/src/spdk/dpdk/lib/librte_vhost/iotlb.h b/src/spdk/dpdk/lib/librte_vhost/iotlb.h new file mode 100644 index 00000000..60b9e4c5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/iotlb.h @@ -0,0 +1,79 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2017 Red Hat, Inc. 
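The cache lookup above shrinks *size to the length that is contiguously mapped, so callers are expected to retry and to record misses. A hypothetical sketch of that pattern (request_iotlb_miss() stands in for the IOTLB-miss message the backend would send to the frontend; it is not defined in this patch):

#include "iotlb.h"
#include "vhost.h"

static uint64_t
translate_iova(struct vhost_virtqueue *vq, uint64_t iova,
		uint64_t size, uint8_t perm)
{
	uint64_t vva, tmp_size = size;

	vva = vhost_user_iotlb_cache_find(vq, iova, &tmp_size, perm);
	if (tmp_size == size)
		return vva;	/* whole range is contiguously mapped */

	iova += tmp_size;

	/* Record the miss only once, then ask the frontend for an update. */
	if (!vhost_user_iotlb_pending_miss(vq, iova, perm)) {
		vhost_user_iotlb_pending_insert(vq, iova, perm);
		request_iotlb_miss(vq, iova, perm);	/* hypothetical helper */
	}

	return 0;
}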
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _VHOST_IOTLB_H_ +#define _VHOST_IOTLB_H_ + +#include + +#include "vhost.h" + +static __rte_always_inline void +vhost_user_iotlb_rd_lock(struct vhost_virtqueue *vq) +{ + rte_rwlock_read_lock(&vq->iotlb_lock); +} + +static __rte_always_inline void +vhost_user_iotlb_rd_unlock(struct vhost_virtqueue *vq) +{ + rte_rwlock_read_unlock(&vq->iotlb_lock); +} + +static __rte_always_inline void +vhost_user_iotlb_wr_lock(struct vhost_virtqueue *vq) +{ + rte_rwlock_write_lock(&vq->iotlb_lock); +} + +static __rte_always_inline void +vhost_user_iotlb_wr_unlock(struct vhost_virtqueue *vq) +{ + rte_rwlock_write_unlock(&vq->iotlb_lock); +} + +void vhost_user_iotlb_cache_insert(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t uaddr, uint64_t size, + uint8_t perm); +void vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size); +uint64_t vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t *size, uint8_t perm); +bool vhost_user_iotlb_pending_miss(struct vhost_virtqueue *vq, uint64_t iova, + uint8_t perm); +void vhost_user_iotlb_pending_insert(struct vhost_virtqueue *vq, uint64_t iova, + uint8_t perm); +void vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t size, uint8_t perm); +void vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq); +int vhost_user_iotlb_init(struct virtio_net *dev, int vq_index); + +#endif /* _VHOST_IOTLB_H_ */ diff --git a/src/spdk/dpdk/lib/librte_vhost/meson.build b/src/spdk/dpdk/lib/librte_vhost/meson.build new file mode 100644 index 00000000..bd62e0e3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/meson.build @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017-2018 Intel Corporation + +if host_machine.system() != 'linux' + build = false +endif +if has_libnuma == 1 + dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true) +endif +version = 4 +allow_experimental_apis = true +sources = files('fd_man.c', 'iotlb.c', 'socket.c', 'vdpa.c', + 'vhost.c', 
'vhost_user.c', + 'virtio_net.c', 'vhost_crypto.c') +headers = files('rte_vhost.h', 'rte_vdpa.h', 'rte_vhost_crypto.h') +deps += ['ethdev', 'cryptodev', 'hash', 'pci'] diff --git a/src/spdk/dpdk/lib/librte_vhost/rte_vdpa.h b/src/spdk/dpdk/lib/librte_vhost/rte_vdpa.h new file mode 100644 index 00000000..90465ca2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/rte_vdpa.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _RTE_VDPA_H_ +#define _RTE_VDPA_H_ + +/** + * @file + * + * Device specific vhost lib + */ + +#include +#include "rte_vhost.h" + +#define MAX_VDPA_NAME_LEN 128 + +enum vdpa_addr_type { + PCI_ADDR, + VDPA_ADDR_MAX +}; + +struct rte_vdpa_dev_addr { + enum vdpa_addr_type type; + union { + uint8_t __dummy[64]; + struct rte_pci_addr pci_addr; + }; +}; + +struct rte_vdpa_dev_ops { + /* Get capabilities of this device */ + int (*get_queue_num)(int did, uint32_t *queue_num); + int (*get_features)(int did, uint64_t *features); + int (*get_protocol_features)(int did, uint64_t *protocol_features); + + /* Driver configure/close the device */ + int (*dev_conf)(int vid); + int (*dev_close)(int vid); + + /* Enable/disable this vring */ + int (*set_vring_state)(int vid, int vring, int state); + + /* Set features when changed */ + int (*set_features)(int vid); + + /* Destination operations when migration done */ + int (*migration_done)(int vid); + + /* Get the vfio group fd */ + int (*get_vfio_group_fd)(int vid); + + /* Get the vfio device fd */ + int (*get_vfio_device_fd)(int vid); + + /* Get the notify area info of the queue */ + int (*get_notify_area)(int vid, int qid, + uint64_t *offset, uint64_t *size); + + /* Reserved for future extension */ + void *reserved[5]; +}; + +struct rte_vdpa_device { + struct rte_vdpa_dev_addr addr; + struct rte_vdpa_dev_ops *ops; +} __rte_cache_aligned; + +/* Register a vdpa device, return did if successful, -1 on failure */ +int __rte_experimental +rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr, + struct rte_vdpa_dev_ops *ops); + +/* Unregister a vdpa device, return -1 on failure */ +int __rte_experimental +rte_vdpa_unregister_device(int did); + +/* Find did of a vdpa device, return -1 on failure */ +int __rte_experimental +rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr); + +/* Find a vdpa device based on did */ +struct rte_vdpa_device * __rte_experimental +rte_vdpa_get_device(int did); + +#endif /* _RTE_VDPA_H_ */ diff --git a/src/spdk/dpdk/lib/librte_vhost/rte_vhost.h b/src/spdk/dpdk/lib/librte_vhost/rte_vhost.h new file mode 100644 index 00000000..b02673d4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/rte_vhost.h @@ -0,0 +1,653 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#ifndef _RTE_VHOST_H_ +#define _RTE_VHOST_H_ + +/** + * @file + * Interface to vhost-user + */ + +#include +#include + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* These are not C++-aware. */ +#include +#include + +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) +#define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3) + +/** Protocol features. 
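A hedged sketch of how a vDPA driver could plug into the framework declared above; my_get_features, my_vdpa_ops and my_vdpa_probe are placeholder names, and a real driver implements the full rte_vdpa_dev_ops set:

#include <rte_pci.h>

#include "rte_vdpa.h"

static int
my_get_features(int did, uint64_t *features)
{
	(void)did;
	*features = 0;	/* a real driver reports its virtio feature bits */
	return 0;
}

static struct rte_vdpa_dev_ops my_vdpa_ops = {
	.get_features = my_get_features,
	/* get_queue_num, get_protocol_features, dev_conf, dev_close,
	 * set_vring_state, ... are also expected by the framework. */
};

static int
my_vdpa_probe(const struct rte_pci_addr *pci)
{
	struct rte_vdpa_dev_addr addr = {
		.type = PCI_ADDR,
		.pci_addr = *pci,
	};

	/* Returns the assigned device id (did) on success, -1 on failure. */
	return rte_vdpa_register_device(&addr, &my_vdpa_ops);
}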
*/ +#ifndef VHOST_USER_PROTOCOL_F_MQ +#define VHOST_USER_PROTOCOL_F_MQ 0 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_RARP +#define VHOST_USER_PROTOCOL_F_RARP 2 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_NET_MTU +#define VHOST_USER_PROTOCOL_F_NET_MTU 4 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ +#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION +#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD +#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 +#endif + +/** Indicate whether protocol features negotiation is supported. */ +#ifndef VHOST_USER_F_PROTOCOL_FEATURES +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +#endif + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct rte_vhost_mem_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + +/** + * Memory structure includes region and mapping information. + */ +struct rte_vhost_memory { + uint32_t nregions; + struct rte_vhost_mem_region regions[]; +}; + +struct rte_vhost_vring { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + + /** Deprecated, use rte_vhost_vring_call() instead. */ + int callfd; + + int kickfd; + uint16_t size; +}; + +/** + * Device and vring operations. + */ +struct vhost_device_ops { + int (*new_device)(int vid); /**< Add device. */ + void (*destroy_device)(int vid); /**< Remove device. */ + + int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ + + /** + * Features could be changed after the feature negotiation. + * For example, VHOST_F_LOG_ALL will be set/cleared at the + * start/end of live migration, respectively. This callback + * is used to inform the application on such change. + */ + int (*features_changed)(int vid, uint64_t features); + + int (*new_connection)(int vid); + void (*destroy_connection)(int vid); + + void *reserved[2]; /**< Reserved for future extension */ +}; + +/** + * Convert guest physical address to host virtual address + * + * This function is deprecated because unsafe. + * New rte_vhost_va_from_guest_pa() should be used instead to ensure + * guest physical ranges are fully and contiguously mapped into + * process virtual address space. 
+ * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @return + * the host virtual address on success, 0 on failure + */ +__rte_deprecated +static __rte_always_inline uint64_t +rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + if (gpa >= reg->guest_phys_addr && + gpa < reg->guest_phys_addr + reg->size) { + return gpa - reg->guest_phys_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/** + * Convert guest physical address to host virtual address safely + * + * This variant of rte_vhost_gpa_to_vva() takes care all the + * requested length is mapped and contiguous in process address + * space. + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @param len + * the size of the requested area to map, updated with actual size mapped + * @return + * the host virtual address on success, 0 on failure + */ +static __rte_always_inline uint64_t +rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, + uint64_t gpa, uint64_t *len) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (gpa >= r->guest_phys_addr && + gpa < r->guest_phys_addr + r->size) { + + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) + *len = r->guest_phys_addr + r->size - gpa; + + return gpa - r->guest_phys_addr + + r->host_user_addr; + } + } + *len = 0; + + return 0; +} + +#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL)) + +/** + * Log the memory write start with given address. + * + * This function only need be invoked when the live migration starts. + * Therefore, we won't need call it at all in the most of time. For + * making the performance impact be minimum, it's suggested to do a + * check before calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_write(vid, addr, len); + * + * @param vid + * vhost device ID + * @param addr + * the starting address for write + * @param len + * the length to write + */ +void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len); + +/** + * Log the used ring update start at given offset. + * + * Same as rte_vhost_log_write, it's suggested to do a check before + * calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_used_vring(vid, vring_idx, offset, len); + * + * @param vid + * vhost device ID + * @param vring_idx + * the vring index + * @param offset + * the offset inside the used ring + * @param len + * the length to write + */ +void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len); + +int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); + +/** + * Register vhost driver. path could be different for multiple + * instance support. + */ +int rte_vhost_driver_register(const char *path, uint64_t flags); + +/* Unregister vhost driver. This is only meaningful to vhost user. 
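For illustration (not part of the header), a sketch of how an application walks a guest-physical range with rte_vhost_va_from_guest_pa() above, which may return less than the requested length when the range spans memory regions; copy_from_guest is a placeholder name:

#include <stdint.h>
#include <string.h>

#include <rte_vhost.h>

static int
copy_from_guest(struct rte_vhost_memory *mem, void *dst,
		uint64_t gpa, uint64_t len)
{
	while (len) {
		uint64_t chunk = len;
		uint64_t vva = rte_vhost_va_from_guest_pa(mem, gpa, &chunk);

		if (vva == 0 || chunk == 0)
			return -1;	/* range not (fully) mapped */

		/* chunk now holds the contiguously mapped length at gpa */
		memcpy(dst, (void *)(uintptr_t)vva, chunk);
		dst = (char *)dst + chunk;
		gpa += chunk;
		len -= chunk;
	}

	return 0;
}

The same chunking applies when writing into guest memory during live migration: per the comments above, each written chunk should be followed by rte_vhost_log_write() when RTE_VHOST_NEED_LOG() is true for the negotiated features.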
*/ +int rte_vhost_driver_unregister(const char *path); + +/** + * Set the vdpa device id, enforce single connection per socket + * + * @param path + * The vhost-user socket file path + * @param did + * Device id + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_attach_vdpa_device(const char *path, int did); + +/** + * Unset the vdpa device id + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_detach_vdpa_device(const char *path); + +/** + * Get the device id + * + * @param path + * The vhost-user socket file path + * @return + * Device id, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_get_vdpa_device_id(const char *path); + +/** + * Set the feature bits the vhost-user driver supports. + * + * @param path + * The vhost-user socket file path + * @param features + * Supported features + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_set_features(const char *path, uint64_t features); + +/** + * Enable vhost-user driver features. + * + * Note that + * - the param features should be a subset of the feature bits provided + * by rte_vhost_driver_set_features(). + * - it must be invoked before vhost-user negotiation starts. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to enable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_enable_features(const char *path, uint64_t features); + +/** + * Disable vhost-user driver features. + * + * The two notes at rte_vhost_driver_enable_features() also apply here. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to disable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_disable_features(const char *path, uint64_t features); + +/** + * Get the feature bits before feature negotiation. + * + * @param path + * The vhost-user socket file path + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_get_features(const char *path, uint64_t *features); + +/** + * Get the protocol feature bits before feature negotiation. + * + * @param path + * The vhost-user socket file path + * @param protocol_features + * A pointer to store the queried protocol feature bits + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_get_protocol_features(const char *path, + uint64_t *protocol_features); + +/** + * Get the queue number bits before feature negotiation. + * + * @param path + * The vhost-user socket file path + * @param queue_num + * A pointer to store the queried queue number bits + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num); + +/** + * Get the feature bits after negotiation + * + * @param vid + * Vhost device ID + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_negotiated_features(int vid, uint64_t *features); + +/* Register callbacks. */ +int rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops); + +/** + * + * Start the vhost-user driver. + * + * This function triggers the vhost-user negotiation. 
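A hedged bring-up sketch tying the registration calls above together; the ops callbacks and the socket path are application-supplied placeholders, and rte_vhost_driver_start() is declared just below:

#include <linux/virtio_net.h>

#include <rte_vhost.h>

static int my_new_device(int vid) { (void)vid; return 0; }	/* placeholder */
static void my_destroy_device(int vid) { (void)vid; }		/* placeholder */

static const struct vhost_device_ops my_ops = {
	.new_device = my_new_device,
	.destroy_device = my_destroy_device,
};

static int
start_vhost_backend(const char *path)
{
	if (rte_vhost_driver_register(path, 0) < 0)	/* 0: server mode */
		return -1;

	/* Optional: trim features before negotiation starts. */
	if (rte_vhost_driver_disable_features(path,
			1ULL << VIRTIO_NET_F_MRG_RXBUF) < 0)
		goto err;

	if (rte_vhost_driver_callback_register(path, &my_ops) < 0)
		goto err;

	if (rte_vhost_driver_start(path) < 0)
		goto err;

	return 0;
err:
	rte_vhost_driver_unregister(path);
	return -1;
}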
+ * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_start(const char *path); + +/** + * Get the MTU value of the device if set in QEMU. + * + * @param vid + * virtio-net device ID + * @param mtu + * The variable to store the MTU value + * + * @return + * 0: success + * -EAGAIN: device not yet started + * -ENOTSUP: device does not support MTU feature + */ +int rte_vhost_get_mtu(int vid, uint16_t *mtu); + +/** + * Get the numa node from which the virtio net device's memory + * is allocated. + * + * @param vid + * vhost device ID + * + * @return + * The numa node, -1 on failure + */ +int rte_vhost_get_numa_node(int vid); + +/** + * @deprecated + * Get the number of queues the device supports. + * + * Note this function is deprecated, as it returns a queue pair number, + * which is vhost specific. Instead, rte_vhost_get_vring_num should + * be used. + * + * @param vid + * vhost device ID + * + * @return + * The number of queues, 0 on failure + */ +__rte_deprecated +uint32_t rte_vhost_get_queue_num(int vid); + +/** + * Get the number of vrings the device supports. + * + * @param vid + * vhost device ID + * + * @return + * The number of vrings, 0 on failure + */ +uint16_t rte_vhost_get_vring_num(int vid); + +/** + * Get the virtio net device's ifname, which is the vhost-user socket + * file path. + * + * @param vid + * vhost device ID + * @param buf + * The buffer to stored the queried ifname + * @param len + * The length of buf + * + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_ifname(int vid, char *buf, size_t len); + +/** + * Get how many avail entries are left in the queue + * + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index + * + * @return + * num of avail entires left + */ +uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); + +struct rte_mbuf; +struct rte_mempool; +/** + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtual device. A packet + * count is returned to indicate the number of packets that were successfully + * added to the RX queue. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param pkts + * array to contain packets to be enqueued + * @param count + * packets num to be enqueued + * @return + * num of packets enqueued + */ +uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count); + +/** + * This function gets guest buffers from the virtio device TX virtqueue, + * construct host mbufs, copies guest buffer content to host mbufs and + * store them in pkts to be processed. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param mbuf_pool + * mbuf_pool where host mbuf is allocated. + * @param pkts + * array to contain packets to be dequeued + * @param count + * packets num to be dequeued + * @return + * num of packets dequeued + */ +uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); + +/** + * Get guest mem table: a list of memory regions. + * + * An rte_vhost_vhost_memory object will be allocated internaly, to hold the + * guest memory regions. Application should free it at destroy_device() + * callback. 
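A minimal datapath sketch using the burst APIs above (assumptions: vid is a running device with a single queue pair and mbuf_pool is an existing mempool); it simply echoes guest TX traffic back into the guest RX queue:

#include <stdint.h>

#include <rte_mbuf.h>
#include <rte_vhost.h>

#define BURST_SZ 32

static void
echo_burst(int vid, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[BURST_SZ];
	uint16_t nb_rx, i;

	/* Queue 1 is the guest TX ring and queue 0 the guest RX ring in
	 * the single queue pair case. */
	nb_rx = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, BURST_SZ);
	if (nb_rx == 0)
		return;

	rte_vhost_enqueue_burst(vid, 0, pkts, nb_rx);

	/* enqueue copies into guest buffers; the mbufs stay ours to free */
	for (i = 0; i < nb_rx; i++)
		rte_pktmbuf_free(pkts[i]);
}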
+ * + * @param vid + * vhost device ID + * @param mem + * To store the returned mem regions + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); + +/** + * Get guest vring info, including the vring address, vring size, etc. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested vring info + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring); + +/** + * Notify the guest that used descriptors have been added to the vring. This + * function acts as a memory barrier. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_vring_call(int vid, uint16_t vring_idx); + +/** + * Get vhost RX queue avail count. + * + * @param vid + * vhost device ID + * @param qid + * virtio queue index in mq case + * @return + * num of desc available + */ +uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid); + +/** + * Get log base and log size of the vhost device + * + * @param vid + * vhost device ID + * @param log_base + * vhost log base + * @param log_size + * vhost log size + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size); + +/** + * Get last_avail/used_idx of the vhost virtqueue + * + * @param vid + * vhost device ID + * @param queue_id + * vhost queue index + * @param last_avail_idx + * vhost last_avail_idx to get + * @param last_used_idx + * vhost last_used_idx to get + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_get_vring_base(int vid, uint16_t queue_id, + uint16_t *last_avail_idx, uint16_t *last_used_idx); + +/** + * Set last_avail/used_idx of the vhost virtqueue + * + * @param vid + * vhost device ID + * @param queue_id + * vhost queue index + * @param last_avail_idx + * last_avail_idx to set + * @param last_used_idx + * last_used_idx to set + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_set_vring_base(int vid, uint16_t queue_id, + uint16_t last_avail_idx, uint16_t last_used_idx); + +/** + * Get vdpa device id for vhost device. + * + * @param vid + * vhost device id + * @return + * device id + */ +int __rte_experimental +rte_vhost_get_vdpa_device_id(int vid); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_VHOST_H_ */ diff --git a/src/spdk/dpdk/lib/librte_vhost/rte_vhost_crypto.h b/src/spdk/dpdk/lib/librte_vhost/rte_vhost_crypto.h new file mode 100644 index 00000000..f9fbc054 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/rte_vhost_crypto.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef _VHOST_CRYPTO_H_ +#define _VHOST_CRYPTO_H_ + +#define VHOST_CRYPTO_MBUF_POOL_SIZE (8192) +#define VHOST_CRYPTO_MAX_BURST_SIZE (64) +#define VHOST_CRYPTO_SESSION_MAP_ENTRIES (1024) /**< Max nb sessions */ +/** max nb virtual queues in a burst for finalizing*/ +#define VIRTIO_CRYPTO_MAX_NUM_BURST_VQS (64) + +enum rte_vhost_crypto_zero_copy { + RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE = 0, + RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE = 1, + RTE_VHOST_CRYPTO_MAX_ZERO_COPY_OPTIONS +}; + +/** + * Create Vhost-crypto instance + * + * @param vid + * The identifier of the vhost device. 
+ * @param cryptodev_id + * The identifier of DPDK Cryptodev, the same cryptodev_id can be assigned to + * multiple Vhost-crypto devices. + * @param sess_pool + * The pointer to the created cryptodev session pool with the private data size + * matches the target DPDK Cryptodev. + * @param socket_id + * NUMA Socket ID to allocate resources on. * + * @return + * 0 if the Vhost Crypto Instance is created successfully. + * Negative integer if otherwise + */ +int __rte_experimental +rte_vhost_crypto_create(int vid, uint8_t cryptodev_id, + struct rte_mempool *sess_pool, int socket_id); + +/** + * Free the Vhost-crypto instance + * + * @param vid + * The identifier of the vhost device. + * @return + * 0 if the Vhost Crypto Instance is created successfully. + * Negative integer if otherwise. + */ +int __rte_experimental +rte_vhost_crypto_free(int vid); + +/** + * Enable or disable zero copy feature + * + * @param vid + * The identifier of the vhost device. + * @param option + * Flag of zero copy feature. + * @return + * 0 if completed successfully. + * Negative integer if otherwise. + */ +int __rte_experimental +rte_vhost_crypto_set_zero_copy(int vid, enum rte_vhost_crypto_zero_copy option); + +/** + * Fetch a number of vring descriptors from virt-queue and translate to DPDK + * crypto operations. After this function is executed, the user can enqueue + * the processed ops to the target cryptodev. + * + * @param vid + * The identifier of the vhost device. + * @param qid + * Virtio queue index. + * @param ops + * The address of an array of pointers to *rte_crypto_op* structures that must + * be large enough to store *nb_ops* pointers in it. + * @param nb_ops + * The maximum number of operations to be fetched and translated. + * @return + * The number of fetched and processed vhost crypto request operations. + */ +uint16_t __rte_experimental +rte_vhost_crypto_fetch_requests(int vid, uint32_t qid, + struct rte_crypto_op **ops, uint16_t nb_ops); +/** + * Finalize the dequeued crypto ops. After the translated crypto ops are + * dequeued from the cryptodev, this function shall be called to write the + * processed data back to the vring descriptor (if no-copy is turned off). + * + * @param ops + * The address of an array of *rte_crypto_op* structure that was dequeued + * from cryptodev. + * @param nb_ops + * The number of operations contained in the array. + * @callfds + * The callfd number(s) contained in this burst, this shall be an array with + * no less than VIRTIO_CRYPTO_MAX_NUM_BURST_VQS elements. + * @nb_callfds + * The number of call_fd numbers exist in the callfds. + * @return + * The number of ops processed. 
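A hedged sketch of the fetch/enqueue/dequeue/finalize cycle described above; cdev_id, qp and cop_pool are application-owned placeholders, and in-flight tracking plus error handling are omitted for brevity:

#include <sys/eventfd.h>

#include <rte_cryptodev.h>
#include <rte_mempool.h>
#include <rte_vhost_crypto.h>

#define CRYPTO_BURST 32

static void
vhost_crypto_poll(int vid, uint32_t vqid, uint8_t cdev_id, uint16_t qp,
		struct rte_mempool *cop_pool)
{
	struct rte_crypto_op *ops[CRYPTO_BURST];
	struct rte_crypto_op *done[CRYPTO_BURST];
	int callfds[VIRTIO_CRYPTO_MAX_NUM_BURST_VQS];
	uint16_t nb, nb_done, nb_callfds, i;

	if (rte_crypto_op_bulk_alloc(cop_pool, RTE_CRYPTO_OP_TYPE_SYMMETRIC,
			ops, CRYPTO_BURST) != CRYPTO_BURST)
		return;

	/* Translate virtio-crypto descriptors into crypto ops ... */
	nb = rte_vhost_crypto_fetch_requests(vid, vqid, ops, CRYPTO_BURST);
	if (nb < CRYPTO_BURST)	/* return the ops we did not use */
		rte_mempool_put_bulk(cop_pool, (void **)&ops[nb],
				CRYPTO_BURST - nb);

	/* ... run them through the cryptodev ... */
	nb = rte_cryptodev_enqueue_burst(cdev_id, qp, ops, nb);
	nb_done = rte_cryptodev_dequeue_burst(cdev_id, qp, done, CRYPTO_BURST);

	/* ... then write the results back and kick the collected callfds. */
	nb_done = rte_vhost_crypto_finalize_requests(done, nb_done,
			callfds, &nb_callfds);

	for (i = 0; i < nb_callfds; i++)
		eventfd_write(callfds[i], (eventfd_t)1);

	rte_mempool_put_bulk(cop_pool, (void **)done, nb_done);
}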
+ */ +uint16_t __rte_experimental +rte_vhost_crypto_finalize_requests(struct rte_crypto_op **ops, + uint16_t nb_ops, int *callfds, uint16_t *nb_callfds); + +#endif /**< _VHOST_CRYPTO_H_ */ diff --git a/src/spdk/dpdk/lib/librte_vhost/rte_vhost_version.map b/src/spdk/dpdk/lib/librte_vhost/rte_vhost_version.map new file mode 100644 index 00000000..da220dd0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/rte_vhost_version.map @@ -0,0 +1,85 @@ +DPDK_2.0 { + global: + + rte_vhost_dequeue_burst; + rte_vhost_driver_callback_register; + rte_vhost_driver_register; + rte_vhost_enable_guest_notification; + rte_vhost_enqueue_burst; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_vhost_driver_unregister; + +} DPDK_2.0; + +DPDK_16.07 { + global: + + rte_vhost_avail_entries; + rte_vhost_get_ifname; + rte_vhost_get_numa_node; + rte_vhost_get_queue_num; + +} DPDK_2.1; + +DPDK_17.05 { + global: + + rte_vhost_driver_disable_features; + rte_vhost_driver_enable_features; + rte_vhost_driver_get_features; + rte_vhost_driver_set_features; + rte_vhost_driver_start; + rte_vhost_get_mem_table; + rte_vhost_get_mtu; + rte_vhost_get_negotiated_features; + rte_vhost_get_vhost_vring; + rte_vhost_get_vring_num; + rte_vhost_gpa_to_vva; + rte_vhost_log_used_vring; + rte_vhost_log_write; + +} DPDK_16.07; + +DPDK_17.08 { + global: + + rte_vhost_rx_queue_count; + +} DPDK_17.05; + +DPDK_18.02 { + global: + + rte_vhost_vring_call; + +} DPDK_17.08; + +EXPERIMENTAL { + global: + + rte_vdpa_register_device; + rte_vdpa_unregister_device; + rte_vdpa_find_device_id; + rte_vdpa_get_device; + rte_vhost_driver_attach_vdpa_device; + rte_vhost_driver_detach_vdpa_device; + rte_vhost_driver_get_vdpa_device_id; + rte_vhost_get_vdpa_device_id; + rte_vhost_driver_get_protocol_features; + rte_vhost_driver_get_queue_num; + rte_vhost_get_log_base; + rte_vhost_get_vring_base; + rte_vhost_set_vring_base; + rte_vhost_crypto_create; + rte_vhost_crypto_free; + rte_vhost_crypto_fetch_requests; + rte_vhost_crypto_finalize_requests; + rte_vhost_crypto_set_zero_copy; + rte_vhost_va_from_guest_pa; +}; diff --git a/src/spdk/dpdk/lib/librte_vhost/socket.c b/src/spdk/dpdk/lib/librte_vhost/socket.c new file mode 100644 index 00000000..d6303174 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/socket.c @@ -0,0 +1,1063 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2016 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "fd_man.h" +#include "vhost.h" +#include "vhost_user.h" + + +TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); + +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. + */ +struct vhost_user_socket { + struct vhost_user_connection_list conn_list; + pthread_mutex_t conn_mutex; + char *path; + int socket_fd; + struct sockaddr_un un; + bool is_server; + bool reconnect; + bool dequeue_zero_copy; + bool iommu_support; + bool use_builtin_virtio_net; + + /* + * The "supported_features" indicates the feature bits the + * vhost driver supports. The "features" indicates the feature + * bits after the rte_vhost_driver_features_disable/enable(). + * It is also the final feature bits used for vhost-user + * features negotiation. + */ + uint64_t supported_features; + uint64_t features; + + /* + * Device id to identify a specific backend device. + * It's set to -1 for the default software implementation. 
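Side note, not part of the patch: the symbols in the EXPERIMENTAL block above are tagged __rte_experimental, so an application has to opt in before calling them, for example:

/* Define before any DPDK include (or pass -DALLOW_EXPERIMENTAL_API in
 * CFLAGS); otherwise calls to rte_vdpa_*, rte_vhost_crypto_*, etc.
 * produce build warnings (errors with -Werror). */
#define ALLOW_EXPERIMENTAL_API 1

#include <rte_vhost.h>
#include <rte_vhost_crypto.h>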
+ * If valid, one socket can have 1 connection only. + */ + int vdpa_dev_id; + + struct vhost_device_ops const *notify_ops; +}; + +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int connfd; + int vid; + + TAILQ_ENTRY(vhost_user_connection) next; +}; + +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; + struct fdset fdset; + int vsocket_cnt; + pthread_mutex_t mutex; +}; + +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_read_cb(int fd, void *dat, int *remove); +static int create_unix_socket(struct vhost_user_socket *vsocket); +static int vhost_user_start_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { + .fdset = { + .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, + .fd_mutex = PTHREAD_MUTEX_INITIALIZER, + .num = 0 + }, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +/* return bytes# of read on success or negative val on failure. */ +int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int got_fds = 0; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int)); + break; + } + } + + /* Clear out unused file descriptors */ + while (got_fds < fd_num) + fds[got_fds++] = -1; + + return ret; +} + +int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + if (cmsg == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n"); + errno = EINVAL; + return -1; + } + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return ret; + } + + return ret; +} + +static void +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) +{ + int vid; + size_t size; + struct vhost_user_connection *conn; + int ret; + + if (vsocket == NULL) + return; + + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); + return; + } + + vid = vhost_new_device(); + if (vid == -1) { + goto err; + } + + 
size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, vsocket->path, size); + + vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net); + + vhost_attach_vdpa_device(vid, vsocket->vdpa_dev_id); + + if (vsocket->dequeue_zero_copy) + vhost_enable_dequeue_zero_copy(vid); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + + if (vsocket->notify_ops->new_connection) { + ret = vsocket->notify_ops->new_connection(vid); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add vhost user connection with fd %d\n", + fd); + goto err; + } + } + + conn->connfd = fd; + conn->vsocket = vsocket; + conn->vid = vid; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, + NULL, conn); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add fd %d into vhost server fdset\n", + fd); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + goto err; + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + + fdset_pipe_notify(&vhost_user.fdset); + return; + +err: + free(conn); + close(fd); +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + struct vhost_user_socket *vsocket = dat; + + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); +} + +static void +vhost_user_read_cb(int connfd, void *dat, int *remove) +{ + struct vhost_user_connection *conn = dat; + struct vhost_user_socket *vsocket = conn->vsocket; + int ret; + + ret = vhost_user_msg_handler(conn->vid, connfd); + if (ret < 0) { + close(connfd); + *remove = 1; + vhost_destroy_device(conn->vid); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + + free(conn); + + if (vsocket->reconnect) { + create_unix_socket(vsocket); + vhost_user_start_client(vsocket); + } + } +} + +static int +create_unix_socket(struct vhost_user_socket *vsocket) +{ + int fd; + struct sockaddr_un *un = &vsocket->un; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + vsocket->is_server ? "server" : "client", fd); + + if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-user: can't set nonblocking mode for socket, fd: " + "%d (%s)\n", fd, strerror(errno)); + close(fd); + return -1; + } + + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); + un->sun_path[sizeof(un->sun_path) - 1] = '\0'; + + vsocket->socket_fd = fd; + return 0; +} + +static int +vhost_user_start_server(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + + /* + * bind () may fail if the socket file with the same name already + * exists. But the library obviously should not delete the file + * provided by the user, since we can not be sure that it is not + * being used by other applications. Moreover, many applications form + * socket names based on user input, which is prone to errors. 
+ * + * The user must ensure that the socket does not exist before + * registering the vhost driver in server mode. + */ + ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; + } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add listen fd %d to vhost server fdset\n", + fd); + goto err; + } + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static int +vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) +{ + int ret, flags; + + ret = connect(fd, un, sz); + if (ret < 0 && errno != EISCONN) + return -1; + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't get flags for connfd %d\n", fd); + return -2; + } + if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't disable nonblocking on fd %d\n", fd); + return -2; + } + return 0; +} + +static void * +vhost_user_client_reconnect(void *arg __rte_unused) +{ + int ret; + struct vhost_user_reconnect *reconn, *next; + + while (1) { + pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. 
+ */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + ret = vhost_user_connect_nonblock(reconn->fd, + (struct sockaddr *)&reconn->un, + sizeof(reconn->un)); + if (ret == -2) { + close(reconn->fd); + RTE_LOG(ERR, VHOST_CONFIG, + "reconnection for fd %d failed\n", + reconn->fd); + goto remove_fd; + } + if (ret == -1) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); +remove_fd: + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); + } + + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + ret = pthread_mutex_init(&reconn_list.mutex, NULL); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "failed to initialize mutex"); + return ret; + } + TAILQ_INIT(&reconn_list.head); + + ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL, + vhost_user_client_reconnect, NULL); + if (ret != 0) { + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + if (pthread_mutex_destroy(&reconn_list.mutex)) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to destroy reconnect mutex"); + } + } + + return ret; +} + +static int +vhost_user_start_client(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, + sizeof(vsocket->un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; + } + + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); + + if (ret == -2 || !vsocket->reconnect) { + close(fd); + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + if (reconn == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for reconnect\n"); + close(fd); + return -1; + } + reconn->un = vsocket->un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); + + return 0; +} + +static struct vhost_user_socket * +find_vhost_user_socket(const char *path) +{ + int i; + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) + return vsocket; + } + + return NULL; +} + +int +rte_vhost_driver_attach_vdpa_device(const char *path, int did) +{ + struct vhost_user_socket *vsocket; + + if (rte_vdpa_get_device(did) == NULL) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->vdpa_dev_id = did; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_detach_vdpa_device(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->vdpa_dev_id = -1; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_get_vdpa_device_id(const char *path) +{ + struct vhost_user_socket *vsocket; + int did = -1; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + did = vsocket->vdpa_dev_id; + pthread_mutex_unlock(&vhost_user.mutex); + + return did; +} + +int +rte_vhost_driver_disable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + + /* Note that use_builtin_virtio_net is not affected by this function + * since callers may want to selectively disable features of the + * built-in vhost net device backend. + */ + + if (vsocket) + vsocket->features &= ~features; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_enable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + if ((vsocket->supported_features & features) != features) { + /* + * trying to enable features the driver doesn't + * support. + */ + pthread_mutex_unlock(&vhost_user.mutex); + return -1; + } + vsocket->features |= features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_set_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + vsocket->supported_features = features; + vsocket->features = features; + + /* Anyone setting feature bits is implementing their own vhost + * device backend. + */ + vsocket->use_builtin_virtio_net = false; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_get_features(const char *path, uint64_t *features) +{ + struct vhost_user_socket *vsocket; + uint64_t vdpa_features; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + int ret = 0; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + ret = -1; + goto unlock_exit; + } + + did = vsocket->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev || !vdpa_dev->ops->get_features) { + *features = vsocket->features; + goto unlock_exit; + } + + if (vdpa_dev->ops->get_features(did, &vdpa_features) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get vdpa features " + "for socket file %s.\n", path); + ret = -1; + goto unlock_exit; + } + + *features = vsocket->features & vdpa_features; + +unlock_exit: + pthread_mutex_unlock(&vhost_user.mutex); + return ret; +} + +int +rte_vhost_driver_get_protocol_features(const char *path, + uint64_t *protocol_features) +{ + struct vhost_user_socket *vsocket; + uint64_t vdpa_protocol_features; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + int ret = 0; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + ret = -1; + goto unlock_exit; + } + + did = vsocket->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) { + *protocol_features = VHOST_USER_PROTOCOL_FEATURES; + goto unlock_exit; + } + + if (vdpa_dev->ops->get_protocol_features(did, + &vdpa_protocol_features) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get vdpa protocol features " + "for socket file %s.\n", path); + ret = -1; + goto unlock_exit; + } + + *protocol_features = VHOST_USER_PROTOCOL_FEATURES + & vdpa_protocol_features; + +unlock_exit: + pthread_mutex_unlock(&vhost_user.mutex); + return ret; +} + +int +rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num) +{ + struct vhost_user_socket *vsocket; + uint32_t vdpa_queue_num; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + int ret = 0; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + ret = -1; + goto unlock_exit; + } + + did = vsocket->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev || !vdpa_dev->ops->get_queue_num) { + *queue_num = VHOST_MAX_QUEUE_PAIRS; + goto unlock_exit; + } + + if (vdpa_dev->ops->get_queue_num(did, &vdpa_queue_num) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get vdpa queue number " + "for socket file %s.\n", path); + ret = -1; + goto unlock_exit; + } + + *queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num); + +unlock_exit: + pthread_mutex_unlock(&vhost_user.mutex); + return ret; +} + +static void +vhost_user_socket_mem_free(struct vhost_user_socket *vsocket) +{ + if (vsocket && vsocket->path) { + free(vsocket->path); + vsocket->path = NULL; + } + + if (vsocket) { + free(vsocket); + vsocket = NULL; + } +} + +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. 
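As a brief illustration of the client case mentioned in the comment above (the socket path comes from the application):

#include <rte_vhost.h>

static int
register_as_client(const char *path)
{
	/* Connect to an existing socket (e.g. one created by QEMU) and do
	 * not spawn the reconnect thread if the first attempt fails. */
	return rte_vhost_driver_register(path,
			RTE_VHOST_USER_CLIENT | RTE_VHOST_USER_NO_RECONNECT);
}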
+ */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + if (vsocket->path == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: failed to copy socket path string\n"); + vhost_user_socket_mem_free(vsocket); + goto out; + } + TAILQ_INIT(&vsocket->conn_list); + ret = pthread_mutex_init(&vsocket->conn_mutex, NULL); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: failed to init connection mutex\n"); + goto out_free; + } + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + + /* + * Set the supported features correctly for the builtin vhost-user + * net driver. + * + * Applications know nothing about features the builtin virtio net + * driver (virtio_net.c) supports, thus it's not possible for them + * to invoke rte_vhost_driver_set_features(). To workaround it, here + * we set it unconditionally. If the application want to implement + * another vhost-user driver (say SCSI), it should call the + * rte_vhost_driver_set_features(), which will overwrite following + * two values. + */ + vsocket->use_builtin_virtio_net = true; + vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + + /* Dequeue zero copy can't assure descriptors returned in order */ + if (vsocket->dequeue_zero_copy) { + vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER); + vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER); + } + + if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) { + vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); + vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); + } + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() != 0) + goto out_mutex; + } + } else { + vsocket->is_server = true; + } + ret = create_unix_socket(vsocket); + if (ret < 0) { + goto out_mutex; + } + + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + + pthread_mutex_unlock(&vhost_user.mutex); + return ret; + +out_mutex: + if (pthread_mutex_destroy(&vsocket->conn_mutex)) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: failed to destroy connection mutex\n"); + } +out_free: + vhost_user_socket_mem_free(vsocket); +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} + +static bool +vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) +{ + int found = false; + struct vhost_user_reconnect *reconn, *next; + + pthread_mutex_lock(&reconn_list.mutex); + + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (reconn->vsocket == vsocket) { + TAILQ_REMOVE(&reconn_list.head, reconn, next); + close(reconn->fd); + free(reconn); + found = true; + break; + } + } + pthread_mutex_unlock(&reconn_list.mutex); + return found; +} + +/** + * Unregister the specified vhost socket + */ +int +rte_vhost_driver_unregister(const char *path) +{ + int i; + int count; + struct vhost_user_connection *conn, *next; + + pthread_mutex_lock(&vhost_user.mutex); + + for (i = 0; i < 
vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) { +again: + pthread_mutex_lock(&vsocket->conn_mutex); + for (conn = TAILQ_FIRST(&vsocket->conn_list); + conn != NULL; + conn = next) { + next = TAILQ_NEXT(conn, next); + + /* + * If r/wcb is executing, release the + * conn_mutex lock, and try again since + * the r/wcb may use the conn_mutex lock. + */ + if (fdset_try_del(&vhost_user.fdset, + conn->connfd) == -1) { + pthread_mutex_unlock( + &vsocket->conn_mutex); + goto again; + } + + RTE_LOG(INFO, VHOST_CONFIG, + "free connfd = %d for device '%s'\n", + conn->connfd, path); + close(conn->connfd); + vhost_destroy_device(conn->vid); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + free(conn); + } + pthread_mutex_unlock(&vsocket->conn_mutex); + + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, + vsocket->socket_fd); + close(vsocket->socket_fd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + + pthread_mutex_destroy(&vsocket->conn_mutex); + vhost_user_socket_mem_free(vsocket); + + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); + + return 0; + } + } + pthread_mutex_unlock(&vhost_user.mutex); + + return -1; +} + +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->notify_ops = ops; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +struct vhost_device_ops const * +vhost_driver_callback_get(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? vsocket->notify_ops : NULL; +} + +int +rte_vhost_driver_start(const char *path) +{ + struct vhost_user_socket *vsocket; + static pthread_t fdset_tid; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) + return -1; + + if (fdset_tid == 0) { + /** + * create a pipe which will be waited by poll and notified to + * rebuild the wait list of poll. 
+ */ + if (fdset_pipe_init(&vhost_user.fdset) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create pipe for vhost fdset\n"); + return -1; + } + + int ret = rte_ctrl_thread_create(&fdset_tid, + "vhost-events", NULL, fdset_event_dispatch, + &vhost_user.fdset); + if (ret != 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create fdset handling thread"); + + fdset_pipe_uninit(&vhost_user.fdset); + return -1; + } + } + + if (vsocket->is_server) + return vhost_user_start_server(vsocket); + else + return vhost_user_start_client(vsocket); +} diff --git a/src/spdk/dpdk/lib/librte_vhost/vdpa.c b/src/spdk/dpdk/lib/librte_vhost/vdpa.c new file mode 100644 index 00000000..c82fd437 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/vdpa.c @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +/** + * @file + * + * Device specific vhost lib + */ + +#include + +#include +#include "rte_vdpa.h" +#include "vhost.h" + +static struct rte_vdpa_device *vdpa_devices[MAX_VHOST_DEVICE]; +static uint32_t vdpa_device_num; + +static bool +is_same_vdpa_device(struct rte_vdpa_dev_addr *a, + struct rte_vdpa_dev_addr *b) +{ + bool ret = true; + + if (a->type != b->type) + return false; + + switch (a->type) { + case PCI_ADDR: + if (a->pci_addr.domain != b->pci_addr.domain || + a->pci_addr.bus != b->pci_addr.bus || + a->pci_addr.devid != b->pci_addr.devid || + a->pci_addr.function != b->pci_addr.function) + ret = false; + break; + default: + break; + } + + return ret; +} + +int +rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr, + struct rte_vdpa_dev_ops *ops) +{ + struct rte_vdpa_device *dev; + char device_name[MAX_VDPA_NAME_LEN]; + int i; + + if (vdpa_device_num >= MAX_VHOST_DEVICE) + return -1; + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + dev = vdpa_devices[i]; + if (dev && is_same_vdpa_device(&dev->addr, addr)) + return -1; + } + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vdpa_devices[i] == NULL) + break; + } + + sprintf(device_name, "vdpa-dev-%d", i); + dev = rte_zmalloc(device_name, sizeof(struct rte_vdpa_device), + RTE_CACHE_LINE_SIZE); + if (!dev) + return -1; + + memcpy(&dev->addr, addr, sizeof(struct rte_vdpa_dev_addr)); + dev->ops = ops; + vdpa_devices[i] = dev; + vdpa_device_num++; + + return i; +} + +int +rte_vdpa_unregister_device(int did) +{ + if (did < 0 || did >= MAX_VHOST_DEVICE || vdpa_devices[did] == NULL) + return -1; + + rte_free(vdpa_devices[did]); + vdpa_devices[did] = NULL; + vdpa_device_num--; + + return did; +} + +int +rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr) +{ + struct rte_vdpa_device *dev; + int i; + + for (i = 0; i < MAX_VHOST_DEVICE; ++i) { + dev = vdpa_devices[i]; + if (dev && is_same_vdpa_device(&dev->addr, addr)) + return i; + } + + return -1; +} + +struct rte_vdpa_device * +rte_vdpa_get_device(int did) +{ + if (did < 0 || did >= MAX_VHOST_DEVICE) + return NULL; + + return vdpa_devices[did]; +} diff --git a/src/spdk/dpdk/lib/librte_vhost/vhost.c b/src/spdk/dpdk/lib/librte_vhost/vhost.c new file mode 100644 index 00000000..3c9be10a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/vhost.c @@ -0,0 +1,825 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2017 Intel Corporation + */ + +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iotlb.h" +#include "vhost.h" +#include "vhost_user.h" + +struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* Called 
with iotlb_lock read-locked */ +uint64_t +__vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t *size, uint8_t perm) +{ + uint64_t vva, tmp_size; + + if (unlikely(!*size)) + return 0; + + tmp_size = *size; + + vva = vhost_user_iotlb_cache_find(vq, iova, &tmp_size, perm); + if (tmp_size == *size) + return vva; + + iova += tmp_size; + + if (!vhost_user_iotlb_pending_miss(vq, iova, perm)) { + /* + * iotlb_lock is read-locked for a full burst, + * but it only protects the iotlb cache. + * In case of IOTLB miss, we might block on the socket, + * which could cause a deadlock with QEMU if an IOTLB update + * is being handled. We can safely unlock here to avoid it. + */ + vhost_user_iotlb_rd_unlock(vq); + + vhost_user_iotlb_pending_insert(vq, iova, perm); + if (vhost_user_iotlb_miss(dev, iova, perm)) { + RTE_LOG(ERR, VHOST_CONFIG, + "IOTLB miss req failed for IOVA 0x%" PRIx64 "\n", + iova); + vhost_user_iotlb_pending_remove(vq, iova, 1, perm); + } + + vhost_user_iotlb_rd_lock(vq); + } + + return 0; +} + +void +cleanup_vq(struct vhost_virtqueue *vq, int destroy) +{ + if ((vq->callfd >= 0) && (destroy != 0)) + close(vq->callfd); + if (vq->kickfd >= 0) + close(vq->kickfd); +} + +/* + * Unmap any memory, close any file descriptors and + * free any memory owned by a device. + */ +void +cleanup_device(struct virtio_net *dev, int destroy) +{ + uint32_t i; + + vhost_backend_cleanup(dev); + + for (i = 0; i < dev->nr_vring; i++) + cleanup_vq(dev->virtqueue[i], destroy); +} + +void +free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (vq_is_packed(dev)) + rte_free(vq->shadow_used_packed); + else + rte_free(vq->shadow_used_split); + rte_free(vq->batch_copy_elems); + rte_mempool_free(vq->iotlb_pool); + rte_free(vq); +} + +/* + * Release virtqueues and device memory. 
+ */ +static void +free_device(struct virtio_net *dev) +{ + uint32_t i; + + for (i = 0; i < dev->nr_vring; i++) + free_vq(dev, dev->virtqueue[i]); + + rte_free(dev); +} + +static int +vring_translate_split(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint64_t req_size, size; + + req_size = sizeof(struct vring_desc) * vq->size; + size = req_size; + vq->desc = (struct vring_desc *)(uintptr_t)vhost_iova_to_vva(dev, vq, + vq->ring_addrs.desc_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->desc || size != req_size) + return -1; + + req_size = sizeof(struct vring_avail); + req_size += sizeof(uint16_t) * vq->size; + if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) + req_size += sizeof(uint16_t); + size = req_size; + vq->avail = (struct vring_avail *)(uintptr_t)vhost_iova_to_vva(dev, vq, + vq->ring_addrs.avail_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->avail || size != req_size) + return -1; + + req_size = sizeof(struct vring_used); + req_size += sizeof(struct vring_used_elem) * vq->size; + if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) + req_size += sizeof(uint16_t); + size = req_size; + vq->used = (struct vring_used *)(uintptr_t)vhost_iova_to_vva(dev, vq, + vq->ring_addrs.used_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->used || size != req_size) + return -1; + + return 0; +} + +static int +vring_translate_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint64_t req_size, size; + + req_size = sizeof(struct vring_packed_desc) * vq->size; + size = req_size; + vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.desc_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->desc_packed || size != req_size) + return -1; + + req_size = sizeof(struct vring_packed_desc_event); + size = req_size; + vq->driver_event = (struct vring_packed_desc_event *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.avail_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->driver_event || size != req_size) + return -1; + + req_size = sizeof(struct vring_packed_desc_event); + size = req_size; + vq->device_event = (struct vring_packed_desc_event *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.used_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->device_event || size != req_size) + return -1; + + return 0; +} + +int +vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + + if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + goto out; + + if (vq_is_packed(dev)) { + if (vring_translate_packed(dev, vq) < 0) + return -1; + } else { + if (vring_translate_split(dev, vq) < 0) + return -1; + } +out: + vq->access_ok = 1; + + return 0; +} + +void +vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_wr_lock(vq); + + vq->access_ok = 0; + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_wr_unlock(vq); +} + +static void +init_vring_queue(struct virtio_net *dev, uint32_t vring_idx) +{ + struct vhost_virtqueue *vq; + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed not init vring, out of bound (%d)\n", + vring_idx); + return; + } + + vq = dev->virtqueue[vring_idx]; + + memset(vq, 0, sizeof(struct vhost_virtqueue)); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + vhost_user_iotlb_init(dev, vring_idx); + /* Backends are set to -1 indicating an inactive 
device. */ + vq->backend = -1; + + TAILQ_INIT(&vq->zmbuf_list); +} + +static void +reset_vring_queue(struct virtio_net *dev, uint32_t vring_idx) +{ + struct vhost_virtqueue *vq; + int callfd; + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed not init vring, out of bound (%d)\n", + vring_idx); + return; + } + + vq = dev->virtqueue[vring_idx]; + callfd = vq->callfd; + init_vring_queue(dev, vring_idx); + vq->callfd = callfd; +} + +int +alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) +{ + struct vhost_virtqueue *vq; + + vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); + if (vq == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for vring:%u.\n", vring_idx); + return -1; + } + + dev->virtqueue[vring_idx] = vq; + init_vring_queue(dev, vring_idx); + rte_spinlock_init(&vq->access_lock); + vq->avail_wrap_counter = 1; + vq->used_wrap_counter = 1; + vq->signalled_used_valid = false; + + dev->nr_vring += 1; + + return 0; +} + +/* + * Reset some variables in device structure, while keeping few + * others untouched, such as vid, ifname, nr_vring: they + * should be same unless the device is removed. + */ +void +reset_device(struct virtio_net *dev) +{ + uint32_t i; + + dev->features = 0; + dev->protocol_features = 0; + dev->flags &= VIRTIO_DEV_BUILTIN_VIRTIO_NET; + + for (i = 0; i < dev->nr_vring; i++) + reset_vring_queue(dev, i); +} + +/* + * Invoked when there is a new vhost-user connection established (when + * there is a new virtio device being attached). + */ +int +vhost_new_device(void) +{ + struct virtio_net *dev; + int i; + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vhost_devices[i] == NULL) + break; + } + + if (i == MAX_VHOST_DEVICE) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find a free slot for new device.\n"); + return -1; + } + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); + return -1; + } + + vhost_devices[i] = dev; + dev->vid = i; + dev->flags = VIRTIO_DEV_BUILTIN_VIRTIO_NET; + dev->slave_req_fd = -1; + dev->vdpa_dev_id = -1; + rte_spinlock_init(&dev->slave_req_lock); + + return i; +} + +void +vhost_destroy_device_notify(struct virtio_net *dev) +{ + struct rte_vdpa_device *vdpa_dev; + int did; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->dev_close) + vdpa_dev->ops->dev_close(dev->vid); + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } +} + +/* + * Invoked when there is the vhost-user connection is broken (when + * the virtio device is being detached). 
+ */ +void +vhost_destroy_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_destroy_device_notify(dev); + + cleanup_device(dev, 1); + free_device(dev); + + vhost_devices[vid] = NULL; +} + +void +vhost_attach_vdpa_device(int vid, int did) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (rte_vdpa_get_device(did) == NULL) + return; + + dev->vdpa_dev_id = did; +} + +void +vhost_detach_vdpa_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_user_host_notifier_ctrl(vid, false); + + dev->vdpa_dev_id = -1; +} + +void +vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) +{ + struct virtio_net *dev; + unsigned int len; + + dev = get_device(vid); + if (dev == NULL) + return; + + len = if_len > sizeof(dev->ifname) ? + sizeof(dev->ifname) : if_len; + + strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; +} + +void +vhost_enable_dequeue_zero_copy(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + dev->dequeue_zero_copy = 1; +} + +void +vhost_set_builtin_virtio_net(int vid, bool enable) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (enable) + dev->flags |= VIRTIO_DEV_BUILTIN_VIRTIO_NET; + else + dev->flags &= ~VIRTIO_DEV_BUILTIN_VIRTIO_NET; +} + +int +rte_vhost_get_mtu(int vid, uint16_t *mtu) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -ENODEV; + + if (!(dev->flags & VIRTIO_DEV_READY)) + return -EAGAIN; + + if (!(dev->features & (1ULL << VIRTIO_NET_F_MTU))) + return -ENOTSUP; + + *mtu = dev->mtu; + + return 0; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %s\n", + vid, rte_strerror(errno)); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +uint32_t +rte_vhost_get_queue_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->nr_vring / 2; +} + +uint16_t +rte_vhost_get_vring_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->nr_vring; +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + + return 0; +} + +int +rte_vhost_get_negotiated_features(int vid, uint64_t *features) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (!dev) + return -1; + + *features = dev->features; + return 0; +} + +int +rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + struct virtio_net *dev; + struct rte_vhost_memory *m; + size_t size; + + dev = get_device(vid); + if (!dev) + return -1; + + size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); + m = malloc(sizeof(struct rte_vhost_memory) + size); + if (!m) + return -1; + + m->nregions = dev->mem->nregions; + memcpy(m->regions, dev->mem->regions, size); + *mem = m; + + return 0; +} + +int +rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring) +{ + struct virtio_net *dev; + struct 
vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + vring->log_guest_addr = vq->log_guest_addr; + + vring->callfd = vq->callfd; + vring->kickfd = vq->kickfd; + vring->size = vq->size; + + return 0; +} + +int +rte_vhost_vring_call(int vid, uint16_t vring_idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + if (vq_is_packed(dev)) + vhost_vring_call_packed(dev, vq); + else + vhost_vring_call_split(dev, vq); + + return 0; +} + +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +static inline void +vhost_enable_notify_split(struct vhost_virtqueue *vq, int enable) +{ + if (enable) + vq->used->flags &= ~VRING_USED_F_NO_NOTIFY; + else + vq->used->flags |= VRING_USED_F_NO_NOTIFY; +} + +static inline void +vhost_enable_notify_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, int enable) +{ + uint16_t flags; + + if (!enable) + vq->device_event->flags = VRING_EVENT_F_DISABLE; + + flags = VRING_EVENT_F_ENABLE; + if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + flags = VRING_EVENT_F_DESC; + vq->device_event->off_wrap = vq->last_avail_idx | + vq->avail_wrap_counter << 15; + } + + rte_smp_wmb(); + + vq->device_event->flags = flags; +} + +int +rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) +{ + struct virtio_net *dev = get_device(vid); + struct vhost_virtqueue *vq; + + if (!dev) + return -1; + + vq = dev->virtqueue[queue_id]; + + if (vq_is_packed(dev)) + vhost_enable_notify_packed(dev, vq, enable); + else + vhost_enable_notify_split(vq, enable); + + return 0; +} + +void +rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_log_write(dev, addr, len); +} + +void +rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL) + return; + + if (vring_idx >= VHOST_MAX_VRING) + return; + vq = dev->virtqueue[vring_idx]; + if (!vq) + return; + + vhost_log_used_vring(dev, vq, offset, len); +} + +uint32_t +rte_vhost_rx_queue_count(int vid, uint16_t qid) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL) + return 0; + + if (unlikely(qid >= dev->nr_vring || (qid & 1) == 0)) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, qid); + return 0; + } + + vq = dev->virtqueue[qid]; + if (vq == NULL) + return 0; + + if (unlikely(vq->enabled == 0 || vq->avail == NULL)) + return 0; + + return *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx; +} + +int rte_vhost_get_vdpa_device_id(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + return dev->vdpa_dev_id; +} + +int rte_vhost_get_log_base(int vid, uint64_t *log_base, + uint64_t *log_size) +{ + struct virtio_net *dev = 
get_device(vid); + + if (!dev) + return -1; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return -1; + } + + *log_base = dev->log_base; + *log_size = dev->log_size; + + return 0; +} + +int rte_vhost_get_vring_base(int vid, uint16_t queue_id, + uint16_t *last_avail_idx, uint16_t *last_used_idx) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -1; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return -1; + } + + *last_avail_idx = dev->virtqueue[queue_id]->last_avail_idx; + *last_used_idx = dev->virtqueue[queue_id]->last_used_idx; + + return 0; +} + +int rte_vhost_set_vring_base(int vid, uint16_t queue_id, + uint16_t last_avail_idx, uint16_t last_used_idx) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -1; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return -1; + } + + dev->virtqueue[queue_id]->last_avail_idx = last_avail_idx; + dev->virtqueue[queue_id]->last_used_idx = last_used_idx; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_vhost/vhost.h b/src/spdk/dpdk/lib/librte_vhost/vhost.h new file mode 100644 index 00000000..760a09c0 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/vhost.h @@ -0,0 +1,745 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _VHOST_NET_CDEV_H_ +#define _VHOST_NET_CDEV_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "rte_vhost.h" +#include "rte_vdpa.h" + +/* Used to indicate that the device is running on a data core */ +#define VIRTIO_DEV_RUNNING 1 +/* Used to indicate that the device is ready to operate */ +#define VIRTIO_DEV_READY 2 +/* Used to indicate that the built-in vhost net device backend is enabled */ +#define VIRTIO_DEV_BUILTIN_VIRTIO_NET 4 +/* Used to indicate that the device has its own data path and configured */ +#define VIRTIO_DEV_VDPA_CONFIGURED 8 + +/* Backend value set by guest. */ +#define VIRTIO_DEV_STOPPED -1 + +#define BUF_VECTOR_MAX 256 + +#define VHOST_LOG_CACHE_NR 32 + +/** + * Structure contains buffer address, length and descriptor index + * from vring to do scatter RX. + */ +struct buf_vector { + uint64_t buf_iova; + uint64_t buf_addr; + uint32_t buf_len; + uint32_t desc_idx; +}; + +/* + * A structure to hold some fields needed in zero copy code path, + * mainly for associating an mbuf with the right desc_idx. + */ +struct zcopy_mbuf { + struct rte_mbuf *mbuf; + uint32_t desc_idx; + uint16_t desc_count; + uint16_t in_use; + + TAILQ_ENTRY(zcopy_mbuf) next; +}; +TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); + +/* + * Structure contains the info for each batched memory copy. + */ +struct batch_copy_elem { + void *dst; + void *src; + uint32_t len; + uint64_t log_addr; +}; + +/* + * Structure that contains the info for batched dirty logging. + */ +struct log_cache_entry { + uint32_t offset; + unsigned long val; +}; + +struct vring_used_elem_packed { + uint16_t id; + uint32_t len; + uint32_t count; +}; + +/** + * Structure contains variables relevant to RX/TX virtqueues. 
+ */ +struct vhost_virtqueue { + union { + struct vring_desc *desc; + struct vring_packed_desc *desc_packed; + }; + union { + struct vring_avail *avail; + struct vring_packed_desc_event *driver_event; + }; + union { + struct vring_used *used; + struct vring_packed_desc_event *device_event; + }; + uint32_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; + /* Last used index we notify to front end. */ + uint16_t signalled_used; + bool signalled_used_valid; +#define VIRTIO_INVALID_EVENTFD (-1) +#define VIRTIO_UNINITIALIZED_EVENTFD (-2) + + /* Backend value to determine if device should started/stopped */ + int backend; + int enabled; + int access_ok; + rte_spinlock_t access_lock; + + /* Used to notify the guest (trigger interrupt) */ + int callfd; + /* Currently unused as polling mode is enabled */ + int kickfd; + + /* Physical address of used ring, for logging */ + uint64_t log_guest_addr; + + uint16_t nr_zmbuf; + uint16_t zmbuf_size; + uint16_t last_zmbuf_idx; + struct zcopy_mbuf *zmbufs; + struct zcopy_mbuf_list zmbuf_list; + + union { + struct vring_used_elem *shadow_used_split; + struct vring_used_elem_packed *shadow_used_packed; + }; + uint16_t shadow_used_idx; + struct vhost_vring_addr ring_addrs; + + struct batch_copy_elem *batch_copy_elems; + uint16_t batch_copy_nb_elems; + bool used_wrap_counter; + bool avail_wrap_counter; + + struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR]; + uint16_t log_cache_nb_elem; + + rte_rwlock_t iotlb_lock; + rte_rwlock_t iotlb_pending_lock; + struct rte_mempool *iotlb_pool; + TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list; + int iotlb_cache_nr; + TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list; +} __rte_cache_aligned; + +/* Old kernels have no such macros defined */ +#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE + #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 +#endif + +#ifndef VIRTIO_NET_F_MQ + #define VIRTIO_NET_F_MQ 22 +#endif + +#define VHOST_MAX_VRING 0x100 +#define VHOST_MAX_QUEUE_PAIRS 0x80 + +#ifndef VIRTIO_NET_F_MTU + #define VIRTIO_NET_F_MTU 3 +#endif + +#ifndef VIRTIO_F_ANY_LAYOUT + #define VIRTIO_F_ANY_LAYOUT 27 +#endif + +/* Declare IOMMU related bits for older kernels */ +#ifndef VIRTIO_F_IOMMU_PLATFORM + +#define VIRTIO_F_IOMMU_PLATFORM 33 + +struct vhost_iotlb_msg { + __u64 iova; + __u64 size; + __u64 uaddr; +#define VHOST_ACCESS_RO 0x1 +#define VHOST_ACCESS_WO 0x2 +#define VHOST_ACCESS_RW 0x3 + __u8 perm; +#define VHOST_IOTLB_MISS 1 +#define VHOST_IOTLB_UPDATE 2 +#define VHOST_IOTLB_INVALIDATE 3 +#define VHOST_IOTLB_ACCESS_FAIL 4 + __u8 type; +}; + +#define VHOST_IOTLB_MSG 0x1 + +struct vhost_msg { + int type; + union { + struct vhost_iotlb_msg iotlb; + __u8 padding[64]; + }; +}; +#endif + +/* + * Define virtio 1.0 for older kernels + */ +#ifndef VIRTIO_F_VERSION_1 + #define VIRTIO_F_VERSION_1 32 +#endif + +/* Declare packed ring related bits for older kernels */ +#ifndef VIRTIO_F_RING_PACKED + +#define VIRTIO_F_RING_PACKED 34 + +#define VRING_DESC_F_NEXT 1 +#define VRING_DESC_F_WRITE 2 +#define VRING_DESC_F_INDIRECT 4 + +#define VRING_DESC_F_AVAIL (1ULL << 7) +#define VRING_DESC_F_USED (1ULL << 15) + +struct vring_packed_desc { + uint64_t addr; + uint32_t len; + uint16_t id; + uint16_t flags; +}; + +#define VRING_EVENT_F_ENABLE 0x0 +#define VRING_EVENT_F_DISABLE 0x1 +#define VRING_EVENT_F_DESC 0x2 + +struct vring_packed_desc_event { + uint16_t off_wrap; + uint16_t flags; +}; +#endif + +/* + * Available and used descs are in same order + */ +#ifndef VIRTIO_F_IN_ORDER +#define VIRTIO_F_IN_ORDER 35 +#endif + +/* Features supported by this 
builtin vhost-user net driver. */ +#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_F_ANY_LAYOUT) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_GSO) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_HOST_UFO) | \ + (1ULL << VIRTIO_NET_F_HOST_ECN) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_NET_F_GUEST_UFO) | \ + (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_NET_F_MTU) | \ + (1ULL << VIRTIO_F_IN_ORDER) | \ + (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + + +struct guest_page { + uint64_t guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; +}; + +/** + * function prototype for the vhost backend to handler specific vhost user + * messages prior to the master message handling + * + * @param vid + * vhost device id + * @param msg + * Message pointer. + * @param require_reply + * If the handler requires sending a reply, this varaible shall be written 1, + * otherwise 0. + * @param skip_master + * If the handler requires skipping the master message handling, this variable + * shall be written 1, otherwise 0. + * @return + * 0 on success, -1 on failure + */ +typedef int (*vhost_msg_pre_handle)(int vid, void *msg, + uint32_t *require_reply, uint32_t *skip_master); + +/** + * function prototype for the vhost backend to handler specific vhost user + * messages after the master message handling is done + * + * @param vid + * vhost device id + * @param msg + * Message pointer. + * @param require_reply + * If the handler requires sending a reply, this varaible shall be written 1, + * otherwise 0. + * @return + * 0 on success, -1 on failure + */ +typedef int (*vhost_msg_post_handle)(int vid, void *msg, + uint32_t *require_reply); + +/** + * pre and post vhost user message handlers + */ +struct vhost_user_extern_ops { + vhost_msg_pre_handle pre_msg_handle; + vhost_msg_post_handle post_msg_handle; +}; + +/** + * Device structure contains all configuration information relating + * to the device. + */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct rte_vhost_memory *mem; + uint64_t features; + uint64_t protocol_features; + int vid; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t nr_vring; + int dequeue_zero_copy; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + uint16_t mtu; + + struct vhost_device_ops const *notify_ops; + + uint32_t nr_guest_pages; + uint32_t max_guest_pages; + struct guest_page *guest_pages; + + int slave_req_fd; + rte_spinlock_t slave_req_lock; + + /* + * Device id to identify a specific backend device. + * It's set to -1 for the default software implementation. 
+ */ + int vdpa_dev_id; + + /* private data for virtio device */ + void *extern_data; + /* pre and post vhost user message handlers for the device */ + struct vhost_user_extern_ops extern_ops; +} __rte_cache_aligned; + +static __rte_always_inline bool +vq_is_packed(struct virtio_net *dev) +{ + return dev->features & (1ull << VIRTIO_F_RING_PACKED); +} + +static inline bool +desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) +{ + return wrap_counter == !!(desc->flags & VRING_DESC_F_AVAIL) && + wrap_counter != !!(desc->flags & VRING_DESC_F_USED); +} + +#define VHOST_LOG_PAGE 4096 + +/* + * Atomically set a bit in memory. + */ +static __rte_always_inline void +vhost_set_bit(unsigned int nr, volatile uint8_t *addr) +{ +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100) + /* + * __sync_ built-ins are deprecated, but __atomic_ ones + * are sub-optimized in older GCC versions. + */ + __sync_fetch_and_or_1(addr, (1U << nr)); +#else + __atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED); +#endif +} + +static __rte_always_inline void +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + vhost_set_bit(page % 8, &log_base[page / 8]); +} + +static __rte_always_inline void +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +static __rte_always_inline void +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + unsigned long *log_base; + int i; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base)) + return; + + log_base = (unsigned long *)(uintptr_t)dev->log_base; + + /* + * It is expected a write memory barrier has been issued + * before this function is called. + */ + + for (i = 0; i < vq->log_cache_nb_elem; i++) { + struct log_cache_entry *elem = vq->log_cache + i; + +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100) + /* + * '__sync' builtins are deprecated, but '__atomic' ones + * are sub-optimized in older GCC versions. + */ + __sync_fetch_and_or(log_base + elem->offset, elem->val); +#else + __atomic_fetch_or(log_base + elem->offset, elem->val, + __ATOMIC_RELAXED); +#endif + } + + rte_smp_wmb(); + + vq->log_cache_nb_elem = 0; +} + +static __rte_always_inline void +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t page) +{ + uint32_t bit_nr = page % (sizeof(unsigned long) << 3); + uint32_t offset = page / (sizeof(unsigned long) << 3); + int i; + + for (i = 0; i < vq->log_cache_nb_elem; i++) { + struct log_cache_entry *elem = vq->log_cache + i; + + if (elem->offset == offset) { + elem->val |= (1UL << bit_nr); + return; + } + } + + if (unlikely(i >= VHOST_LOG_CACHE_NR)) { + /* + * No more room for a new log cache entry, + * so write the dirty log map directly. 
+ */ + rte_smp_wmb(); + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + + return; + } + + vq->log_cache[i].offset = offset; + vq->log_cache[i].val = (1UL << bit_nr); + vq->log_cache_nb_elem++; +} + +static __rte_always_inline void +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_cache_page(dev, vq, page); + page += 1; + } +} + +static __rte_always_inline void +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len); +} + +static __rte_always_inline void +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define VHOST_LOG_DEBUG(log_type, fmt, args...) \ + RTE_LOG(DEBUG, log_type, fmt, ##args) +#define PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define VHOST_LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) +#define PRINT_PACKET(device, addr, size, header) do {} while (0) +#endif + +extern uint64_t VHOST_FEATURES; +#define MAX_VHOST_DEVICE 1024 +extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* Convert guest physical address to host physical address */ +static __rte_always_inline rte_iova_t +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + if (gpa >= page->guest_phys_addr && + gpa + size < page->guest_phys_addr + page->size) { + return gpa - page->guest_phys_addr + + page->host_phys_addr; + } + } + + return 0; +} + +static __rte_always_inline struct virtio_net * +get_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + + if (unlikely(!dev)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) device not found.\n", vid); + } + + return dev; +} + +int vhost_new_device(void); +void cleanup_device(struct virtio_net *dev, int destroy); +void reset_device(struct virtio_net *dev); +void vhost_destroy_device(int); +void vhost_destroy_device_notify(struct virtio_net *dev); + +void cleanup_vq(struct vhost_virtqueue *vq, int destroy); +void free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq); + +int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); + +void vhost_attach_vdpa_device(int vid, int did); +void vhost_detach_vdpa_device(int vid); + +void vhost_set_ifname(int, const char *if_name, unsigned int if_len); +void vhost_enable_dequeue_zero_copy(int vid); +void vhost_set_builtin_virtio_net(int vid, bool enable); + +struct vhost_device_ops const *vhost_driver_callback_get(const char *path); + +/* + * Backend-specific cleanup. + * + * TODO: fix it; we have one backend now + */ +void vhost_backend_cleanup(struct virtio_net *dev); + +uint64_t __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t *len, uint8_t perm); +int vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq); +void vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq); + +static __rte_always_inline uint64_t +vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t *len, uint8_t perm) +{ + if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + return rte_vhost_va_from_guest_pa(dev->mem, iova, len); + + return __vhost_iova_to_vva(dev, vq, iova, len, perm); +} + +#define vhost_used_event(vr) \ + (*(volatile uint16_t*)&(vr)->avail->ring[(vr)->size]) + +/* + * The following is used with VIRTIO_RING_F_EVENT_IDX. + * Assuming a given event_idx value from the other size, if we have + * just incremented index from old to new_idx, should we trigger an + * event? + */ +static __rte_always_inline int +vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) +{ + return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old); +} + +static __rte_always_inline void +vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + /* Flush used->idx update before we read avail->flags. */ + rte_smp_mb(); + + /* Don't kick guest if we don't reach index specified by guest. 
*/ + if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + uint16_t old = vq->signalled_used; + uint16_t new = vq->last_used_idx; + + VHOST_LOG_DEBUG(VHOST_DATA, "%s: used_event_idx=%d, old=%d, new=%d\n", + __func__, + vhost_used_event(vq), + old, new); + if (vhost_need_event(vhost_used_event(vq), new, old) + && (vq->callfd >= 0)) { + vq->signalled_used = vq->last_used_idx; + eventfd_write(vq->callfd, (eventfd_t) 1); + } + } else { + /* Kick the guest if necessary. */ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && (vq->callfd >= 0)) + eventfd_write(vq->callfd, (eventfd_t)1); + } +} + +static __rte_always_inline void +vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint16_t old, new, off, off_wrap; + bool signalled_used_valid, kick = false; + + /* Flush used desc update. */ + rte_smp_mb(); + + if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) { + if (vq->driver_event->flags != + VRING_EVENT_F_DISABLE) + kick = true; + goto kick; + } + + old = vq->signalled_used; + new = vq->last_used_idx; + vq->signalled_used = new; + signalled_used_valid = vq->signalled_used_valid; + vq->signalled_used_valid = true; + + if (vq->driver_event->flags != VRING_EVENT_F_DESC) { + if (vq->driver_event->flags != VRING_EVENT_F_DISABLE) + kick = true; + goto kick; + } + + if (unlikely(!signalled_used_valid)) { + kick = true; + goto kick; + } + + rte_smp_rmb(); + + off_wrap = vq->driver_event->off_wrap; + off = off_wrap & ~(1 << 15); + + if (new <= old) + old -= vq->size; + + if (vq->used_wrap_counter != off_wrap >> 15) + off -= vq->size; + + if (vhost_need_event(off, new, old)) + kick = true; +kick: + if (kick) + eventfd_write(vq->callfd, (eventfd_t)1); +} + +#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/src/spdk/dpdk/lib/librte_vhost/vhost_crypto.c b/src/spdk/dpdk/lib/librte_vhost/vhost_crypto.c new file mode 100644 index 00000000..57341ef8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/vhost_crypto.c @@ -0,0 +1,1372 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ +#include +#include +#include +#include +#include + +#include "rte_vhost_crypto.h" +#include "vhost.h" +#include "vhost_user.h" +#include "virtio_crypto.h" + +#define INHDR_LEN (sizeof(struct virtio_crypto_inhdr)) +#define IV_OFFSET (sizeof(struct rte_crypto_op) + \ + sizeof(struct rte_crypto_sym_op)) + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VC_LOG_ERR(fmt, args...) \ + RTE_LOG(ERR, USER1, "[%s] %s() line %u: " fmt "\n", \ + "Vhost-Crypto", __func__, __LINE__, ## args) +#define VC_LOG_INFO(fmt, args...) \ + RTE_LOG(INFO, USER1, "[%s] %s() line %u: " fmt "\n", \ + "Vhost-Crypto", __func__, __LINE__, ## args) + +#define VC_LOG_DBG(fmt, args...) \ + RTE_LOG(DEBUG, USER1, "[%s] %s() line %u: " fmt "\n", \ + "Vhost-Crypto", __func__, __LINE__, ## args) +#else +#define VC_LOG_ERR(fmt, args...) \ + RTE_LOG(ERR, USER1, "[VHOST-Crypto]: " fmt "\n", ## args) +#define VC_LOG_INFO(fmt, args...) \ + RTE_LOG(INFO, USER1, "[VHOST-Crypto]: " fmt "\n", ## args) +#define VC_LOG_DBG(fmt, args...) 
+#endif + +#define VIRTIO_CRYPTO_FEATURES ((1 << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1 << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1 << VIRTIO_RING_F_EVENT_IDX) | \ + (1 << VIRTIO_CRYPTO_SERVICE_CIPHER) | \ + (1 << VIRTIO_CRYPTO_SERVICE_MAC) | \ + (1 << VIRTIO_NET_F_CTRL_VQ)) + +#define IOVA_TO_VVA(t, r, a, l, p) \ + ((t)(uintptr_t)vhost_iova_to_vva(r->dev, r->vq, a, l, p)) + +static int +cipher_algo_transform(uint32_t virtio_cipher_algo) +{ + int ret; + + switch (virtio_cipher_algo) { + case VIRTIO_CRYPTO_CIPHER_AES_CBC: + ret = RTE_CRYPTO_CIPHER_AES_CBC; + break; + case VIRTIO_CRYPTO_CIPHER_AES_CTR: + ret = RTE_CRYPTO_CIPHER_AES_CTR; + break; + case VIRTIO_CRYPTO_CIPHER_DES_ECB: + ret = -VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_CIPHER_DES_CBC: + ret = RTE_CRYPTO_CIPHER_DES_CBC; + break; + case VIRTIO_CRYPTO_CIPHER_3DES_ECB: + ret = RTE_CRYPTO_CIPHER_3DES_ECB; + break; + case VIRTIO_CRYPTO_CIPHER_3DES_CBC: + ret = RTE_CRYPTO_CIPHER_3DES_CBC; + break; + case VIRTIO_CRYPTO_CIPHER_3DES_CTR: + ret = RTE_CRYPTO_CIPHER_3DES_CTR; + break; + case VIRTIO_CRYPTO_CIPHER_KASUMI_F8: + ret = RTE_CRYPTO_CIPHER_KASUMI_F8; + break; + case VIRTIO_CRYPTO_CIPHER_SNOW3G_UEA2: + ret = RTE_CRYPTO_CIPHER_SNOW3G_UEA2; + break; + case VIRTIO_CRYPTO_CIPHER_AES_F8: + ret = RTE_CRYPTO_CIPHER_AES_F8; + break; + case VIRTIO_CRYPTO_CIPHER_AES_XTS: + ret = RTE_CRYPTO_CIPHER_AES_XTS; + break; + case VIRTIO_CRYPTO_CIPHER_ZUC_EEA3: + ret = RTE_CRYPTO_CIPHER_ZUC_EEA3; + break; + default: + ret = -VIRTIO_CRYPTO_BADMSG; + break; + } + + return ret; +} + +static int +auth_algo_transform(uint32_t virtio_auth_algo) +{ + int ret; + + switch (virtio_auth_algo) { + + case VIRTIO_CRYPTO_NO_MAC: + ret = RTE_CRYPTO_AUTH_NULL; + break; + case VIRTIO_CRYPTO_MAC_HMAC_MD5: + ret = RTE_CRYPTO_AUTH_MD5_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA1: + ret = RTE_CRYPTO_AUTH_SHA1_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA_224: + ret = RTE_CRYPTO_AUTH_SHA224_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA_256: + ret = RTE_CRYPTO_AUTH_SHA256_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA_384: + ret = RTE_CRYPTO_AUTH_SHA384_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA_512: + ret = RTE_CRYPTO_AUTH_SHA512_HMAC; + break; + case VIRTIO_CRYPTO_MAC_CMAC_3DES: + ret = -VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_MAC_CMAC_AES: + ret = RTE_CRYPTO_AUTH_AES_CMAC; + break; + case VIRTIO_CRYPTO_MAC_KASUMI_F9: + ret = RTE_CRYPTO_AUTH_KASUMI_F9; + break; + case VIRTIO_CRYPTO_MAC_SNOW3G_UIA2: + ret = RTE_CRYPTO_AUTH_SNOW3G_UIA2; + break; + case VIRTIO_CRYPTO_MAC_GMAC_AES: + ret = RTE_CRYPTO_AUTH_AES_GMAC; + break; + case VIRTIO_CRYPTO_MAC_GMAC_TWOFISH: + ret = -VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_MAC_CBCMAC_AES: + ret = RTE_CRYPTO_AUTH_AES_CBC_MAC; + break; + case VIRTIO_CRYPTO_MAC_CBCMAC_KASUMI_F9: + ret = -VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_MAC_XCBC_AES: + ret = RTE_CRYPTO_AUTH_AES_XCBC_MAC; + break; + default: + ret = -VIRTIO_CRYPTO_BADMSG; + break; + } + + return ret; +} + +static int get_iv_len(enum rte_crypto_cipher_algorithm algo) +{ + int len; + + switch (algo) { + case RTE_CRYPTO_CIPHER_3DES_CBC: + len = 8; + break; + case RTE_CRYPTO_CIPHER_3DES_CTR: + len = 8; + break; + case RTE_CRYPTO_CIPHER_3DES_ECB: + len = 8; + break; + case RTE_CRYPTO_CIPHER_AES_CBC: + len = 16; + break; + + /* TODO: add common algos */ + + default: + len = -1; + break; + } + + return len; +} + +/** + * vhost_crypto struct is used to maintain a number of virtio_cryptos and + * one DPDK crypto device that deals with 
all crypto workloads. It is declared + * here and defined in vhost_crypto.c + */ +struct vhost_crypto { + /** Used to lookup DPDK Cryptodev Session based on VIRTIO crypto + * session ID. + */ + struct rte_hash *session_map; + struct rte_mempool *mbuf_pool; + struct rte_mempool *sess_pool; + + /** DPDK cryptodev ID */ + uint8_t cid; + uint16_t nb_qps; + + uint64_t last_session_id; + + uint64_t cache_session_id; + struct rte_cryptodev_sym_session *cache_session; + /** socket id for the device */ + int socket_id; + + struct virtio_net *dev; + + uint8_t option; +} __rte_cache_aligned; + +struct vhost_crypto_data_req { + struct vring_desc *head; + struct virtio_net *dev; + struct virtio_crypto_inhdr *inhdr; + struct vhost_virtqueue *vq; + struct vring_desc *wb_desc; + uint16_t wb_len; + uint16_t desc_idx; + uint16_t len; + uint16_t zero_copy; +}; + +static int +transform_cipher_param(struct rte_crypto_sym_xform *xform, + VhostUserCryptoSessionParam *param) +{ + int ret; + + ret = cipher_algo_transform(param->cipher_algo); + if (unlikely(ret < 0)) + return ret; + + xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER; + xform->cipher.algo = (uint32_t)ret; + xform->cipher.key.length = param->cipher_key_len; + if (xform->cipher.key.length > 0) + xform->cipher.key.data = param->cipher_key_buf; + if (param->dir == VIRTIO_CRYPTO_OP_ENCRYPT) + xform->cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT; + else if (param->dir == VIRTIO_CRYPTO_OP_DECRYPT) + xform->cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT; + else { + VC_LOG_DBG("Bad operation type"); + return -VIRTIO_CRYPTO_BADMSG; + } + + ret = get_iv_len(xform->cipher.algo); + if (unlikely(ret < 0)) + return ret; + xform->cipher.iv.length = (uint16_t)ret; + xform->cipher.iv.offset = IV_OFFSET; + return 0; +} + +static int +transform_chain_param(struct rte_crypto_sym_xform *xforms, + VhostUserCryptoSessionParam *param) +{ + struct rte_crypto_sym_xform *xform_cipher, *xform_auth; + int ret; + + switch (param->chaining_dir) { + case VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER: + xform_auth = xforms; + xform_cipher = xforms->next; + xform_cipher->cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT; + xform_auth->auth.op = RTE_CRYPTO_AUTH_OP_VERIFY; + break; + case VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH: + xform_cipher = xforms; + xform_auth = xforms->next; + xform_cipher->cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT; + xform_auth->auth.op = RTE_CRYPTO_AUTH_OP_GENERATE; + break; + default: + return -VIRTIO_CRYPTO_BADMSG; + } + + /* cipher */ + ret = cipher_algo_transform(param->cipher_algo); + if (unlikely(ret < 0)) + return ret; + xform_cipher->type = RTE_CRYPTO_SYM_XFORM_CIPHER; + xform_cipher->cipher.algo = (uint32_t)ret; + xform_cipher->cipher.key.length = param->cipher_key_len; + xform_cipher->cipher.key.data = param->cipher_key_buf; + ret = get_iv_len(xform_cipher->cipher.algo); + if (unlikely(ret < 0)) + return ret; + xform_cipher->cipher.iv.length = (uint16_t)ret; + xform_cipher->cipher.iv.offset = IV_OFFSET; + + /* auth */ + xform_auth->type = RTE_CRYPTO_SYM_XFORM_AUTH; + ret = auth_algo_transform(param->hash_algo); + if (unlikely(ret < 0)) + return ret; + xform_auth->auth.algo = (uint32_t)ret; + xform_auth->auth.digest_length = param->digest_len; + xform_auth->auth.key.length = param->auth_key_len; + xform_auth->auth.key.data = param->auth_key_buf; + + return 0; +} + +static void +vhost_crypto_create_sess(struct vhost_crypto *vcrypto, + VhostUserCryptoSessionParam *sess_param) +{ + struct rte_crypto_sym_xform xform1 = {0}, xform2 = {0}; + struct 
rte_cryptodev_sym_session *session; + int ret; + + switch (sess_param->op_type) { + case VIRTIO_CRYPTO_SYM_OP_NONE: + case VIRTIO_CRYPTO_SYM_OP_CIPHER: + ret = transform_cipher_param(&xform1, sess_param); + if (unlikely(ret)) { + VC_LOG_ERR("Error transform session msg (%i)", ret); + sess_param->session_id = ret; + return; + } + break; + case VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING: + if (unlikely(sess_param->hash_mode != + VIRTIO_CRYPTO_SYM_HASH_MODE_AUTH)) { + sess_param->session_id = -VIRTIO_CRYPTO_NOTSUPP; + VC_LOG_ERR("Error transform session message (%i)", + -VIRTIO_CRYPTO_NOTSUPP); + return; + } + + xform1.next = &xform2; + + ret = transform_chain_param(&xform1, sess_param); + if (unlikely(ret)) { + VC_LOG_ERR("Error transform session message (%i)", ret); + sess_param->session_id = ret; + return; + } + + break; + default: + VC_LOG_ERR("Algorithm not yet supported"); + sess_param->session_id = -VIRTIO_CRYPTO_NOTSUPP; + return; + } + + session = rte_cryptodev_sym_session_create(vcrypto->sess_pool); + if (!session) { + VC_LOG_ERR("Failed to create session"); + sess_param->session_id = -VIRTIO_CRYPTO_ERR; + return; + } + + if (rte_cryptodev_sym_session_init(vcrypto->cid, session, &xform1, + vcrypto->sess_pool) < 0) { + VC_LOG_ERR("Failed to initialize session"); + sess_param->session_id = -VIRTIO_CRYPTO_ERR; + return; + } + + /* insert hash to map */ + if (rte_hash_add_key_data(vcrypto->session_map, + &vcrypto->last_session_id, session) < 0) { + VC_LOG_ERR("Failed to insert session to hash table"); + + if (rte_cryptodev_sym_session_clear(vcrypto->cid, session) < 0) + VC_LOG_ERR("Failed to clear session"); + else { + if (rte_cryptodev_sym_session_free(session) < 0) + VC_LOG_ERR("Failed to free session"); + } + sess_param->session_id = -VIRTIO_CRYPTO_ERR; + return; + } + + VC_LOG_INFO("Session %"PRIu64" created for vdev %i.", + vcrypto->last_session_id, vcrypto->dev->vid); + + sess_param->session_id = vcrypto->last_session_id; + vcrypto->last_session_id++; +} + +static int +vhost_crypto_close_sess(struct vhost_crypto *vcrypto, uint64_t session_id) +{ + struct rte_cryptodev_sym_session *session; + uint64_t sess_id = session_id; + int ret; + + ret = rte_hash_lookup_data(vcrypto->session_map, &sess_id, + (void **)&session); + + if (unlikely(ret < 0)) { + VC_LOG_ERR("Failed to delete session %"PRIu64".", session_id); + return -VIRTIO_CRYPTO_INVSESS; + } + + if (rte_cryptodev_sym_session_clear(vcrypto->cid, session) < 0) { + VC_LOG_DBG("Failed to clear session"); + return -VIRTIO_CRYPTO_ERR; + } + + if (rte_cryptodev_sym_session_free(session) < 0) { + VC_LOG_DBG("Failed to free session"); + return -VIRTIO_CRYPTO_ERR; + } + + if (rte_hash_del_key(vcrypto->session_map, &sess_id) < 0) { + VC_LOG_DBG("Failed to delete session from hash table."); + return -VIRTIO_CRYPTO_ERR; + } + + VC_LOG_INFO("Session %"PRIu64" deleted for vdev %i.", sess_id, + vcrypto->dev->vid); + + return 0; +} + +static int +vhost_crypto_msg_post_handler(int vid, void *msg, uint32_t *require_reply) +{ + struct virtio_net *dev = get_device(vid); + struct vhost_crypto *vcrypto; + VhostUserMsg *vmsg = msg; + int ret = 0; + + if (dev == NULL || require_reply == NULL) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + vcrypto = dev->extern_data; + if (vcrypto == NULL) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); + return -ENOENT; + } + + *require_reply = 0; + + if (vmsg->request.master == VHOST_USER_CRYPTO_CREATE_SESS) { + vhost_crypto_create_sess(vcrypto, + &vmsg->payload.crypto_session); + 
*require_reply = 1; + } else if (vmsg->request.master == VHOST_USER_CRYPTO_CLOSE_SESS) + ret = vhost_crypto_close_sess(vcrypto, vmsg->payload.u64); + else + ret = -EINVAL; + + return ret; +} + +static __rte_always_inline struct vring_desc * +find_write_desc(struct vring_desc *head, struct vring_desc *desc) +{ + if (desc->flags & VRING_DESC_F_WRITE) + return desc; + + while (desc->flags & VRING_DESC_F_NEXT) { + desc = &head[desc->next]; + if (desc->flags & VRING_DESC_F_WRITE) + return desc; + } + + return NULL; +} + +static struct virtio_crypto_inhdr * +reach_inhdr(struct vhost_crypto_data_req *vc_req, struct vring_desc *desc) +{ + uint64_t dlen; + struct virtio_crypto_inhdr *inhdr; + + while (desc->flags & VRING_DESC_F_NEXT) + desc = &vc_req->head[desc->next]; + + dlen = desc->len; + inhdr = IOVA_TO_VVA(struct virtio_crypto_inhdr *, vc_req, desc->addr, + &dlen, VHOST_ACCESS_WO); + if (unlikely(!inhdr || dlen != desc->len)) + return NULL; + + return inhdr; +} + +static __rte_always_inline int +move_desc(struct vring_desc *head, struct vring_desc **cur_desc, + uint32_t size) +{ + struct vring_desc *desc = *cur_desc; + int left = size; + + rte_prefetch0(&head[desc->next]); + left -= desc->len; + + while ((desc->flags & VRING_DESC_F_NEXT) && left > 0) { + desc = &head[desc->next]; + rte_prefetch0(&head[desc->next]); + left -= desc->len; + } + + if (unlikely(left > 0)) { + VC_LOG_ERR("Incorrect virtio descriptor"); + return -1; + } + + *cur_desc = &head[desc->next]; + return 0; +} + +static int +copy_data(void *dst_data, struct vhost_crypto_data_req *vc_req, + struct vring_desc **cur_desc, uint32_t size) +{ + struct vring_desc *desc = *cur_desc; + uint64_t remain, addr, dlen, len; + uint32_t to_copy; + uint8_t *data = dst_data; + uint8_t *src; + int left = size; + + rte_prefetch0(&vc_req->head[desc->next]); + to_copy = RTE_MIN(desc->len, (uint32_t)left); + dlen = to_copy; + src = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen, + VHOST_ACCESS_RO); + if (unlikely(!src || !dlen)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy((uint8_t *)data, src, dlen); + data += dlen; + + if (unlikely(dlen < to_copy)) { + remain = to_copy - dlen; + addr = desc->addr + dlen; + + while (remain) { + len = remain; + src = IOVA_TO_VVA(uint8_t *, vc_req, addr, &len, + VHOST_ACCESS_RO); + if (unlikely(!src || !len)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(data, src, len); + addr += len; + remain -= len; + data += len; + } + } + + left -= to_copy; + + while ((desc->flags & VRING_DESC_F_NEXT) && left > 0) { + desc = &vc_req->head[desc->next]; + rte_prefetch0(&vc_req->head[desc->next]); + to_copy = RTE_MIN(desc->len, (uint32_t)left); + dlen = desc->len; + src = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen, + VHOST_ACCESS_RO); + if (unlikely(!src || !dlen)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(data, src, dlen); + data += dlen; + + if (unlikely(dlen < to_copy)) { + remain = to_copy - dlen; + addr = desc->addr + dlen; + + while (remain) { + len = remain; + src = IOVA_TO_VVA(uint8_t *, vc_req, addr, &len, + VHOST_ACCESS_RO); + if (unlikely(!src || !len)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(data, src, len); + addr += len; + remain -= len; + data += len; + } + } + + left -= to_copy; + } + + if (unlikely(left > 0)) { + VC_LOG_ERR("Incorrect virtio descriptor"); + return -1; + } + + *cur_desc = &vc_req->head[desc->next]; + + return 0; +} + +static __rte_always_inline void * 
+get_data_ptr(struct vhost_crypto_data_req *vc_req, struct vring_desc **cur_desc, + uint32_t size, uint8_t perm) +{ + void *data; + uint64_t dlen = (*cur_desc)->len; + + data = IOVA_TO_VVA(void *, vc_req, (*cur_desc)->addr, &dlen, perm); + if (unlikely(!data || dlen != (*cur_desc)->len)) { + VC_LOG_ERR("Failed to map object"); + return NULL; + } + + if (unlikely(move_desc(vc_req->head, cur_desc, size) < 0)) + return NULL; + + return data; +} + +static int +write_back_data(struct rte_crypto_op *op, struct vhost_crypto_data_req *vc_req) +{ + struct rte_mbuf *mbuf = op->sym->m_dst; + struct vring_desc *head = vc_req->head; + struct vring_desc *desc = vc_req->wb_desc; + int left = vc_req->wb_len; + uint32_t to_write; + uint8_t *src_data = mbuf->buf_addr, *dst; + uint64_t dlen; + + rte_prefetch0(&head[desc->next]); + to_write = RTE_MIN(desc->len, (uint32_t)left); + dlen = desc->len; + dst = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen, + VHOST_ACCESS_RW); + if (unlikely(!dst || dlen != desc->len)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(dst, src_data, to_write); + left -= to_write; + src_data += to_write; + + while ((desc->flags & VRING_DESC_F_NEXT) && left > 0) { + desc = &head[desc->next]; + rte_prefetch0(&head[desc->next]); + to_write = RTE_MIN(desc->len, (uint32_t)left); + dlen = desc->len; + dst = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen, + VHOST_ACCESS_RW); + if (unlikely(!dst || dlen != desc->len)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(dst, src_data, to_write); + left -= to_write; + src_data += to_write; + } + + if (unlikely(left < 0)) { + VC_LOG_ERR("Incorrect virtio descriptor"); + return -1; + } + + return 0; +} + +static uint8_t +prepare_sym_cipher_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op, + struct vhost_crypto_data_req *vc_req, + struct virtio_crypto_cipher_data_req *cipher, + struct vring_desc *cur_desc) +{ + struct vring_desc *desc = cur_desc; + struct rte_mbuf *m_src = op->sym->m_src, *m_dst = op->sym->m_dst; + uint8_t *iv_data = rte_crypto_op_ctod_offset(op, uint8_t *, IV_OFFSET); + uint8_t ret = 0; + + /* prepare */ + /* iv */ + if (unlikely(copy_data(iv_data, vc_req, &desc, + cipher->para.iv_len) < 0)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + m_src->data_len = cipher->para.src_data_len; + + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr, + cipher->para.src_data_len); + m_src->buf_addr = get_data_ptr(vc_req, &desc, + cipher->para.src_data_len, VHOST_ACCESS_RO); + if (unlikely(m_src->buf_iova == 0 || + m_src->buf_addr == NULL)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + break; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + if (unlikely(cipher->para.src_data_len > + RTE_MBUF_DEFAULT_BUF_SIZE)) { + VC_LOG_ERR("Not enough space to do data copy"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + if (unlikely(copy_data(rte_pktmbuf_mtod(m_src, uint8_t *), + vc_req, &desc, cipher->para.src_data_len) + < 0)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + break; + default: + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + /* dst */ + desc = find_write_desc(vc_req->head, desc); + if (unlikely(!desc)) { + VC_LOG_ERR("Cannot find write location"); + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + m_dst->buf_iova = 
gpa_to_hpa(vcrypto->dev, + desc->addr, cipher->para.dst_data_len); + m_dst->buf_addr = get_data_ptr(vc_req, &desc, + cipher->para.dst_data_len, VHOST_ACCESS_RW); + if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + + m_dst->data_len = cipher->para.dst_data_len; + break; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + vc_req->wb_desc = desc; + vc_req->wb_len = cipher->para.dst_data_len; + if (unlikely(move_desc(vc_req->head, &desc, + vc_req->wb_len) < 0)) { + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + break; + default: + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + /* src data */ + op->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC; + op->sess_type = RTE_CRYPTO_OP_WITH_SESSION; + + op->sym->cipher.data.offset = 0; + op->sym->cipher.data.length = cipher->para.src_data_len; + + vc_req->inhdr = get_data_ptr(vc_req, &desc, INHDR_LEN, VHOST_ACCESS_WO); + if (unlikely(vc_req->inhdr == NULL)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + vc_req->inhdr->status = VIRTIO_CRYPTO_OK; + vc_req->len = cipher->para.dst_data_len + INHDR_LEN; + + return 0; + +error_exit: + vc_req->len = INHDR_LEN; + return ret; +} + +static uint8_t +prepare_sym_chain_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op, + struct vhost_crypto_data_req *vc_req, + struct virtio_crypto_alg_chain_data_req *chain, + struct vring_desc *cur_desc) +{ + struct vring_desc *desc = cur_desc; + struct rte_mbuf *m_src = op->sym->m_src, *m_dst = op->sym->m_dst; + uint8_t *iv_data = rte_crypto_op_ctod_offset(op, uint8_t *, IV_OFFSET); + uint32_t digest_offset; + void *digest_addr; + uint8_t ret = 0; + + /* prepare */ + /* iv */ + if (unlikely(copy_data(iv_data, vc_req, &desc, + chain->para.iv_len) < 0)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + m_src->data_len = chain->para.src_data_len; + m_dst->data_len = chain->para.dst_data_len; + + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr, + chain->para.src_data_len); + m_src->buf_addr = get_data_ptr(vc_req, &desc, + chain->para.src_data_len, VHOST_ACCESS_RO); + if (unlikely(m_src->buf_iova == 0 || m_src->buf_addr == NULL)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + break; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + if (unlikely(chain->para.src_data_len > + RTE_MBUF_DEFAULT_BUF_SIZE)) { + VC_LOG_ERR("Not enough space to do data copy"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + if (unlikely(copy_data(rte_pktmbuf_mtod(m_src, uint8_t *), + vc_req, &desc, chain->para.src_data_len)) < 0) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + break; + default: + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + /* dst */ + desc = find_write_desc(vc_req->head, desc); + if (unlikely(!desc)) { + VC_LOG_ERR("Cannot find write location"); + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + m_dst->buf_iova = gpa_to_hpa(vcrypto->dev, + desc->addr, chain->para.dst_data_len); + m_dst->buf_addr = get_data_ptr(vc_req, &desc, + chain->para.dst_data_len, VHOST_ACCESS_RW); + if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + + op->sym->auth.digest.phys_addr = gpa_to_hpa(vcrypto->dev, + 
desc->addr, chain->para.hash_result_len); + op->sym->auth.digest.data = get_data_ptr(vc_req, &desc, + chain->para.hash_result_len, VHOST_ACCESS_RW); + if (unlikely(op->sym->auth.digest.phys_addr == 0)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + break; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + digest_offset = m_dst->data_len; + digest_addr = rte_pktmbuf_mtod_offset(m_dst, void *, + digest_offset); + + vc_req->wb_desc = desc; + vc_req->wb_len = m_dst->data_len + chain->para.hash_result_len; + + if (unlikely(move_desc(vc_req->head, &desc, + chain->para.dst_data_len) < 0)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + if (unlikely(copy_data(digest_addr, vc_req, &desc, + chain->para.hash_result_len)) < 0) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + op->sym->auth.digest.data = digest_addr; + op->sym->auth.digest.phys_addr = rte_pktmbuf_iova_offset(m_dst, + digest_offset); + break; + default: + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + /* record inhdr */ + vc_req->inhdr = get_data_ptr(vc_req, &desc, INHDR_LEN, VHOST_ACCESS_WO); + if (unlikely(vc_req->inhdr == NULL)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + vc_req->inhdr->status = VIRTIO_CRYPTO_OK; + + op->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC; + op->sess_type = RTE_CRYPTO_OP_WITH_SESSION; + + op->sym->cipher.data.offset = chain->para.cipher_start_src_offset; + op->sym->cipher.data.length = chain->para.src_data_len - + chain->para.cipher_start_src_offset; + + op->sym->auth.data.offset = chain->para.hash_start_src_offset; + op->sym->auth.data.length = chain->para.len_to_hash; + + vc_req->len = chain->para.dst_data_len + chain->para.hash_result_len + + INHDR_LEN; + return 0; + +error_exit: + vc_req->len = INHDR_LEN; + return ret; +} + +/** + * Process on descriptor + */ +static __rte_always_inline int +vhost_crypto_process_one_req(struct vhost_crypto *vcrypto, + struct vhost_virtqueue *vq, struct rte_crypto_op *op, + struct vring_desc *head, uint16_t desc_idx) +{ + struct vhost_crypto_data_req *vc_req = rte_mbuf_to_priv(op->sym->m_src); + struct rte_cryptodev_sym_session *session; + struct virtio_crypto_op_data_req *req, tmp_req; + struct virtio_crypto_inhdr *inhdr; + struct vring_desc *desc = NULL; + uint64_t session_id; + uint64_t dlen; + int err = 0; + + vc_req->desc_idx = desc_idx; + vc_req->dev = vcrypto->dev; + vc_req->vq = vq; + + if (likely(head->flags & VRING_DESC_F_INDIRECT)) { + dlen = head->len; + desc = IOVA_TO_VVA(struct vring_desc *, vc_req, head->addr, + &dlen, VHOST_ACCESS_RO); + if (unlikely(!desc || dlen != head->len)) + return -1; + desc_idx = 0; + head = desc; + } else { + desc = head; + } + + vc_req->head = head; + vc_req->zero_copy = vcrypto->option; + + req = get_data_ptr(vc_req, &desc, sizeof(*req), VHOST_ACCESS_RO); + if (unlikely(req == NULL)) { + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + err = VIRTIO_CRYPTO_BADMSG; + VC_LOG_ERR("Invalid descriptor"); + goto error_exit; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + req = &tmp_req; + if (unlikely(copy_data(req, vc_req, &desc, sizeof(*req)) + < 0)) { + err = VIRTIO_CRYPTO_BADMSG; + VC_LOG_ERR("Invalid descriptor"); + goto error_exit; + } + break; + default: + err = VIRTIO_CRYPTO_ERR; + VC_LOG_ERR("Invalid option"); + goto error_exit; + } + } + + switch (req->header.opcode) { + case VIRTIO_CRYPTO_CIPHER_ENCRYPT: + case VIRTIO_CRYPTO_CIPHER_DECRYPT: + session_id = req->header.session_id; + + /* one branch to avoid 
unnecessary table lookup */ + if (vcrypto->cache_session_id != session_id) { + err = rte_hash_lookup_data(vcrypto->session_map, + &session_id, (void **)&session); + if (unlikely(err < 0)) { + err = VIRTIO_CRYPTO_ERR; + VC_LOG_ERR("Failed to find session %"PRIu64, + session_id); + goto error_exit; + } + + vcrypto->cache_session = session; + vcrypto->cache_session_id = session_id; + } + + session = vcrypto->cache_session; + + err = rte_crypto_op_attach_sym_session(op, session); + if (unlikely(err < 0)) { + err = VIRTIO_CRYPTO_ERR; + VC_LOG_ERR("Failed to attach session to op"); + goto error_exit; + } + + switch (req->u.sym_req.op_type) { + case VIRTIO_CRYPTO_SYM_OP_NONE: + err = VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_SYM_OP_CIPHER: + err = prepare_sym_cipher_op(vcrypto, op, vc_req, + &req->u.sym_req.u.cipher, desc); + break; + case VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING: + err = prepare_sym_chain_op(vcrypto, op, vc_req, + &req->u.sym_req.u.chain, desc); + break; + } + if (unlikely(err != 0)) { + VC_LOG_ERR("Failed to process sym request"); + goto error_exit; + } + break; + default: + VC_LOG_ERR("Unsupported symmetric crypto request type %u", + req->header.opcode); + goto error_exit; + } + + return 0; + +error_exit: + + inhdr = reach_inhdr(vc_req, desc); + if (likely(inhdr != NULL)) + inhdr->status = (uint8_t)err; + + return -1; +} + +static __rte_always_inline struct vhost_virtqueue * +vhost_crypto_finalize_one_request(struct rte_crypto_op *op, + struct vhost_virtqueue *old_vq) +{ + struct rte_mbuf *m_src = op->sym->m_src; + struct rte_mbuf *m_dst = op->sym->m_dst; + struct vhost_crypto_data_req *vc_req = rte_mbuf_to_priv(m_src); + uint16_t desc_idx; + int ret = 0; + + if (unlikely(!vc_req)) { + VC_LOG_ERR("Failed to retrieve vc_req"); + return NULL; + } + + if (old_vq && (vc_req->vq != old_vq)) + return vc_req->vq; + + desc_idx = vc_req->desc_idx; + + if (unlikely(op->status != RTE_CRYPTO_OP_STATUS_SUCCESS)) + vc_req->inhdr->status = VIRTIO_CRYPTO_ERR; + else { + if (vc_req->zero_copy == 0) { + ret = write_back_data(op, vc_req); + if (unlikely(ret != 0)) + vc_req->inhdr->status = VIRTIO_CRYPTO_ERR; + } + } + + vc_req->vq->used->ring[desc_idx].id = desc_idx; + vc_req->vq->used->ring[desc_idx].len = vc_req->len; + + rte_mempool_put(m_dst->pool, (void *)m_dst); + rte_mempool_put(m_src->pool, (void *)m_src); + + return vc_req->vq; +} + +static __rte_always_inline uint16_t +vhost_crypto_complete_one_vm_requests(struct rte_crypto_op **ops, + uint16_t nb_ops, int *callfd) +{ + uint16_t processed = 1; + struct vhost_virtqueue *vq, *tmp_vq; + + if (unlikely(nb_ops == 0)) + return 0; + + vq = vhost_crypto_finalize_one_request(ops[0], NULL); + if (unlikely(vq == NULL)) + return 0; + tmp_vq = vq; + + while ((processed < nb_ops)) { + tmp_vq = vhost_crypto_finalize_one_request(ops[processed], + tmp_vq); + + if (unlikely(vq != tmp_vq)) + break; + + processed++; + } + + *callfd = vq->callfd; + + *(volatile uint16_t *)&vq->used->idx += processed; + + return processed; +} + +int __rte_experimental +rte_vhost_crypto_create(int vid, uint8_t cryptodev_id, + struct rte_mempool *sess_pool, int socket_id) +{ + struct virtio_net *dev = get_device(vid); + struct rte_hash_parameters params = {0}; + struct vhost_crypto *vcrypto; + char name[128]; + int ret; + + if (!dev) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + ret = rte_vhost_driver_set_features(dev->ifname, + VIRTIO_CRYPTO_FEATURES); + if (ret < 0) { + VC_LOG_ERR("Error setting features"); + return -1; + } + + vcrypto = 
rte_zmalloc_socket(NULL, sizeof(*vcrypto), + RTE_CACHE_LINE_SIZE, socket_id); + if (!vcrypto) { + VC_LOG_ERR("Insufficient memory"); + return -ENOMEM; + } + + vcrypto->sess_pool = sess_pool; + vcrypto->cid = cryptodev_id; + vcrypto->cache_session_id = UINT64_MAX; + vcrypto->last_session_id = 1; + vcrypto->dev = dev; + vcrypto->option = RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE; + + snprintf(name, 127, "HASH_VHOST_CRYPT_%u", (uint32_t)vid); + params.name = name; + params.entries = VHOST_CRYPTO_SESSION_MAP_ENTRIES; + params.hash_func = rte_jhash; + params.key_len = sizeof(uint64_t); + params.socket_id = socket_id; + vcrypto->session_map = rte_hash_create(¶ms); + if (!vcrypto->session_map) { + VC_LOG_ERR("Failed to creath session map"); + ret = -ENOMEM; + goto error_exit; + } + + snprintf(name, 127, "MBUF_POOL_VM_%u", (uint32_t)vid); + vcrypto->mbuf_pool = rte_pktmbuf_pool_create(name, + VHOST_CRYPTO_MBUF_POOL_SIZE, 512, + sizeof(struct vhost_crypto_data_req), + RTE_MBUF_DEFAULT_DATAROOM * 2 + RTE_PKTMBUF_HEADROOM, + rte_socket_id()); + if (!vcrypto->mbuf_pool) { + VC_LOG_ERR("Failed to creath mbuf pool"); + ret = -ENOMEM; + goto error_exit; + } + + dev->extern_data = vcrypto; + dev->extern_ops.pre_msg_handle = NULL; + dev->extern_ops.post_msg_handle = vhost_crypto_msg_post_handler; + + return 0; + +error_exit: + if (vcrypto->session_map) + rte_hash_free(vcrypto->session_map); + if (vcrypto->mbuf_pool) + rte_mempool_free(vcrypto->mbuf_pool); + + rte_free(vcrypto); + + return ret; +} + +int __rte_experimental +rte_vhost_crypto_free(int vid) +{ + struct virtio_net *dev = get_device(vid); + struct vhost_crypto *vcrypto; + + if (unlikely(dev == NULL)) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + vcrypto = dev->extern_data; + if (unlikely(vcrypto == NULL)) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); + return -ENOENT; + } + + rte_hash_free(vcrypto->session_map); + rte_mempool_free(vcrypto->mbuf_pool); + rte_free(vcrypto); + + dev->extern_data = NULL; + dev->extern_ops.pre_msg_handle = NULL; + dev->extern_ops.post_msg_handle = NULL; + + return 0; +} + +int __rte_experimental +rte_vhost_crypto_set_zero_copy(int vid, enum rte_vhost_crypto_zero_copy option) +{ + struct virtio_net *dev = get_device(vid); + struct vhost_crypto *vcrypto; + + if (unlikely(dev == NULL)) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + if (unlikely((uint32_t)option >= + RTE_VHOST_CRYPTO_MAX_ZERO_COPY_OPTIONS)) { + VC_LOG_ERR("Invalid option %i", option); + return -EINVAL; + } + + vcrypto = (struct vhost_crypto *)dev->extern_data; + if (unlikely(vcrypto == NULL)) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); + return -ENOENT; + } + + if (vcrypto->option == (uint8_t)option) + return 0; + + if (!(rte_mempool_full(vcrypto->mbuf_pool))) { + VC_LOG_ERR("Cannot update zero copy as mempool is not full"); + return -EINVAL; + } + + vcrypto->option = (uint8_t)option; + + return 0; +} + +uint16_t __rte_experimental +rte_vhost_crypto_fetch_requests(int vid, uint32_t qid, + struct rte_crypto_op **ops, uint16_t nb_ops) +{ + struct rte_mbuf *mbufs[VHOST_CRYPTO_MAX_BURST_SIZE * 2]; + struct virtio_net *dev = get_device(vid); + struct vhost_crypto *vcrypto; + struct vhost_virtqueue *vq; + uint16_t avail_idx; + uint16_t start_idx; + uint16_t required; + uint16_t count; + uint16_t i; + + if (unlikely(dev == NULL)) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + if (unlikely(qid >= VHOST_MAX_QUEUE_PAIRS)) { + VC_LOG_ERR("Invalid qid %u", qid); + return -EINVAL; 
+ } + + vcrypto = (struct vhost_crypto *)dev->extern_data; + if (unlikely(vcrypto == NULL)) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); + return -ENOENT; + } + + vq = dev->virtqueue[qid]; + + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + start_idx = vq->last_used_idx; + count = avail_idx - start_idx; + count = RTE_MIN(count, VHOST_CRYPTO_MAX_BURST_SIZE); + count = RTE_MIN(count, nb_ops); + + if (unlikely(count == 0)) + return 0; + + /* for zero copy, we need 2 empty mbufs for src and dst, otherwise + * we need only 1 mbuf as src and dst + */ + required = count * 2; + if (unlikely(rte_mempool_get_bulk(vcrypto->mbuf_pool, (void **)mbufs, + required) < 0)) { + VC_LOG_ERR("Insufficient memory"); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + uint16_t used_idx = (start_idx + i) & (vq->size - 1); + uint16_t desc_idx = vq->avail->ring[used_idx]; + struct vring_desc *head = &vq->desc[desc_idx]; + struct rte_crypto_op *op = ops[i]; + + op->sym->m_src = mbufs[i * 2]; + op->sym->m_dst = mbufs[i * 2 + 1]; + op->sym->m_src->data_off = 0; + op->sym->m_dst->data_off = 0; + + if (unlikely(vhost_crypto_process_one_req(vcrypto, vq, op, head, + desc_idx)) < 0) + break; + } + + vq->last_used_idx += i; + + return i; +} + +uint16_t __rte_experimental +rte_vhost_crypto_finalize_requests(struct rte_crypto_op **ops, + uint16_t nb_ops, int *callfds, uint16_t *nb_callfds) +{ + struct rte_crypto_op **tmp_ops = ops; + uint16_t count = 0, left = nb_ops; + int callfd; + uint16_t idx = 0; + + while (left) { + count = vhost_crypto_complete_one_vm_requests(tmp_ops, left, + &callfd); + if (unlikely(count == 0)) + break; + + tmp_ops = &tmp_ops[count]; + left -= count; + + callfds[idx++] = callfd; + + if (unlikely(idx >= VIRTIO_CRYPTO_MAX_NUM_BURST_VQS)) { + VC_LOG_ERR("Too many vqs"); + break; + } + } + + *nb_callfds = idx; + + return nb_ops - left; +} diff --git a/src/spdk/dpdk/lib/librte_vhost/vhost_user.c b/src/spdk/dpdk/lib/librte_vhost/vhost_user.c new file mode 100644 index 00000000..a2d4c9ff --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/vhost_user.c @@ -0,0 +1,1958 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +/* Security model + * -------------- + * The vhost-user protocol connection is an external interface, so it must be + * robust against invalid inputs. + * + * This is important because the vhost-user master is only one step removed + * from the guest. Malicious guests that have escaped will then launch further + * attacks from the vhost-user master. + * + * Even in deployments where guests are trusted, a bug in the vhost-user master + * can still cause invalid messages to be sent. Such messages must not + * compromise the stability of the DPDK application by causing crashes, memory + * corruption, or other problematic behavior. + * + * Do not assume received VhostUserMsg fields contain sensible values! 
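+ * Handlers below therefore validate queue indices, message sizes and memory region counts before acting on them.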
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include + +#include "iotlb.h" +#include "vhost.h" +#include "vhost_user.h" + +#define VIRTIO_MIN_MTU 68 +#define VIRTIO_MAX_MTU 65535 + +static const char *vhost_message_str[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", + [VHOST_USER_SET_SLAVE_REQ_FD] = "VHOST_USER_SET_SLAVE_REQ_FD", + [VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG", + [VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS", + [VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS", +}; + +static uint64_t +get_blk_size(int fd) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; +} + +static void +free_mem_region(struct virtio_net *dev) +{ + uint32_t i; + struct rte_vhost_mem_region *reg; + + if (!dev || !dev->mem) + return; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (reg->host_user_addr) { + munmap(reg->mmap_addr, reg->mmap_size); + close(reg->fd); + } + } +} + +void +vhost_backend_cleanup(struct virtio_net *dev) +{ + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + free(dev->guest_pages); + dev->guest_pages = NULL; + + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } + + if (dev->slave_req_fd >= 0) { + close(dev->slave_req_fd); + dev->slave_req_fd = -1; + } +} + +/* + * This function just returns success at the moment unless + * the device hasn't been initialised. + */ +static int +vhost_user_set_owner(void) +{ + return 0; +} + +static int +vhost_user_reset_owner(struct virtio_net *dev) +{ + vhost_destroy_device_notify(dev); + + cleanup_device(dev, 0); + reset_device(dev); + return 0; +} + +/* + * The features that we support are requested. + */ +static uint64_t +vhost_user_get_features(struct virtio_net *dev) +{ + uint64_t features = 0; + + rte_vhost_driver_get_features(dev->ifname, &features); + return features; +} + +/* + * The queue number that we support are requested. 
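+ * The value is retrieved for dev->ifname via rte_vhost_driver_get_queue_num().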
+ */ +static uint32_t +vhost_user_get_queue_num(struct virtio_net *dev) +{ + uint32_t queue_num = 0; + + rte_vhost_driver_get_queue_num(dev->ifname, &queue_num); + return queue_num; +} + +/* + * We receive the negotiated features supported by us and the virtio device. + */ +static int +vhost_user_set_features(struct virtio_net *dev, uint64_t features) +{ + uint64_t vhost_features = 0; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + + rte_vhost_driver_get_features(dev->ifname, &vhost_features); + if (features & ~vhost_features) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) received invalid negotiated features.\n", + dev->vid); + return -1; + } + + if (dev->flags & VIRTIO_DEV_RUNNING) { + if (dev->features == features) + return 0; + + /* + * Error out if master tries to change features while device is + * in running state. The exception being VHOST_F_LOG_ALL, which + * is enabled when the live-migration starts. + */ + if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) features changed while device is running.\n", + dev->vid); + return -1; + } + + if (dev->notify_ops->features_changed) + dev->notify_ops->features_changed(dev->vid, features); + } + + dev->features = features; + if (dev->features & + ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + dev->vhost_hlen = sizeof(struct virtio_net_hdr); + } + VHOST_LOG_DEBUG(VHOST_CONFIG, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, + (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", + (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); + + if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) && + !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) { + /* + * Remove all but first queue pair if MQ hasn't been + * negotiated. This is safe because the device is not + * running at this stage. + */ + while (dev->nr_vring > 2) { + struct vhost_virtqueue *vq; + + vq = dev->virtqueue[--dev->nr_vring]; + if (!vq) + continue; + + dev->virtqueue[dev->nr_vring] = NULL; + cleanup_vq(vq, 1); + free_vq(dev, vq); + } + } + + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->set_features) + vdpa_dev->ops->set_features(dev->vid); + + return 0; +} + +/* + * The virtio device sends us the size of the descriptor ring. + */ +static int +vhost_user_set_vring_num(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + vq->size = msg->payload.state.num; + + /* VIRTIO 1.0, 2.4 Virtqueues says: + * + * Queue Size value is always a power of 2. The maximum Queue Size + * value is 32768. 
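+ * The check below uses (size & (size - 1)) to reject sizes that are not a power of two.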
+ */ + if ((vq->size & (vq->size - 1)) || vq->size > 32768) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid virtqueue size %u\n", vq->size); + return -1; + } + + if (dev->dequeue_zero_copy) { + vq->nr_zmbuf = 0; + vq->last_zmbuf_idx = 0; + vq->zmbuf_size = vq->size; + vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0); + if (vq->zmbufs == NULL) { + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to allocate mem for zero copy; " + "zero copy is force disabled\n"); + dev->dequeue_zero_copy = 0; + } + TAILQ_INIT(&vq->zmbuf_list); + } + + if (vq_is_packed(dev)) { + vq->shadow_used_packed = rte_malloc(NULL, + vq->size * + sizeof(struct vring_used_elem_packed), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_packed) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + } else { + vq->shadow_used_split = rte_malloc(NULL, + vq->size * sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_split) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + } + + vq->batch_copy_elems = rte_malloc(NULL, + vq->size * sizeof(struct batch_copy_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->batch_copy_elems) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for batching copy.\n"); + return -1; + } + + return 0; +} + +/* + * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the + * same numa node as the memory of vring descriptor. + */ +#ifdef RTE_LIBRTE_VHOST_NUMA +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index) +{ + int oldnode, newnode; + struct virtio_net *old_dev; + struct vhost_virtqueue *old_vq, *vq; + struct zcopy_mbuf *new_zmbuf; + struct vring_used_elem *new_shadow_used_split; + struct vring_used_elem_packed *new_shadow_used_packed; + struct batch_copy_elem *new_batch_copy_elems; + int ret; + + old_dev = dev; + vq = old_vq = dev->virtqueue[index]; + + ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, + MPOL_F_NODE | MPOL_F_ADDR); + + /* check if we need to reallocate vq */ + ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get vq numa information.\n"); + return dev; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate vq from %d to %d node\n", oldnode, newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); + if (!vq) + return dev; + + memcpy(vq, old_vq, sizeof(*vq)); + TAILQ_INIT(&vq->zmbuf_list); + + new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0, newnode); + if (new_zmbuf) { + rte_free(vq->zmbufs); + vq->zmbufs = new_zmbuf; + } + + if (vq_is_packed(dev)) { + new_shadow_used_packed = rte_malloc_socket(NULL, + vq->size * + sizeof(struct vring_used_elem_packed), + RTE_CACHE_LINE_SIZE, + newnode); + if (new_shadow_used_packed) { + rte_free(vq->shadow_used_packed); + vq->shadow_used_packed = new_shadow_used_packed; + } + } else { + new_shadow_used_split = rte_malloc_socket(NULL, + vq->size * + sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE, + newnode); + if (new_shadow_used_split) { + rte_free(vq->shadow_used_split); + vq->shadow_used_split = new_shadow_used_split; + } + } + + new_batch_copy_elems = rte_malloc_socket(NULL, + vq->size * sizeof(struct batch_copy_elem), + RTE_CACHE_LINE_SIZE, + newnode); + if (new_batch_copy_elems) { + rte_free(vq->batch_copy_elems); + vq->batch_copy_elems = new_batch_copy_elems; + } + + rte_free(old_vq); + } + + /* 
check if we need to reallocate dev */ + ret = get_mempolicy(&oldnode, NULL, 0, old_dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get dev numa information.\n"); + goto out; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate dev from %d to %d node\n", + oldnode, newnode); + dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); + if (!dev) { + dev = old_dev; + goto out; + } + + memcpy(dev, old_dev, sizeof(*dev)); + rte_free(old_dev); + } + +out: + dev->virtqueue[index] = vq; + vhost_devices[dev->vid] = dev; + + if (old_vq != vq) + vhost_user_iotlb_init(dev, index); + + return dev; +} +#else +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index __rte_unused) +{ + return dev; +} +#endif + +/* Converts QEMU virtual address to Vhost virtual address. */ +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + + /* Find the region where the address lives. */ + for (i = 0; i < dev->mem->nregions; i++) { + r = &dev->mem->regions[i]; + + if (qva >= r->guest_user_addr && + qva < r->guest_user_addr + r->size) { + + if (unlikely(*len > r->guest_user_addr + r->size - qva)) + *len = r->guest_user_addr + r->size - qva; + + return qva - r->guest_user_addr + + r->host_user_addr; + } + } + *len = 0; + + return 0; +} + + +/* + * Converts ring address to Vhost virtual address. + * If IOMMU is enabled, the ring address is a guest IO virtual address, + * else it is a QEMU virtual address. + */ +static uint64_t +ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t ra, uint64_t *size) +{ + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { + uint64_t vva; + + vva = vhost_user_iotlb_cache_find(vq, ra, + size, VHOST_ACCESS_RW); + if (!vva) + vhost_user_iotlb_miss(dev, ra, VHOST_ACCESS_RW); + + return vva; + } + + return qva_to_vva(dev, ra, size); +} + +static struct virtio_net * +translate_ring_addresses(struct virtio_net *dev, int vq_index) +{ + struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; + struct vhost_vring_addr *addr = &vq->ring_addrs; + uint64_t len; + + if (vq_is_packed(dev)) { + len = sizeof(struct vring_packed_desc) * vq->size; + vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) + ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len); + vq->log_guest_addr = 0; + if (vq->desc_packed == NULL || + len != sizeof(struct vring_packed_desc) * + vq->size) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to map desc_packed ring.\n", + dev->vid); + return dev; + } + + dev = numa_realloc(dev, vq_index); + vq = dev->virtqueue[vq_index]; + addr = &vq->ring_addrs; + + len = sizeof(struct vring_packed_desc_event); + vq->driver_event = (struct vring_packed_desc_event *) + (uintptr_t)ring_addr_to_vva(dev, + vq, addr->avail_user_addr, &len); + if (vq->driver_event == NULL || + len != sizeof(struct vring_packed_desc_event)) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to find driver area address.\n", + dev->vid); + return dev; + } + + len = sizeof(struct vring_packed_desc_event); + vq->device_event = (struct vring_packed_desc_event *) + (uintptr_t)ring_addr_to_vva(dev, + vq, addr->used_user_addr, &len); + if (vq->device_event == NULL || + len != sizeof(struct vring_packed_desc_event)) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to find device area address.\n", + dev->vid); + return dev; + } + + return dev; + } + + /* The addresses are converted from QEMU virtual to Vhost virtual. 
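+ * If desc, avail and used are already mapped, there is nothing left to do.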
*/ + if (vq->desc && vq->avail && vq->used) + return dev; + + len = sizeof(struct vring_desc) * vq->size; + vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev, + vq, addr->desc_user_addr, &len); + if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to map desc ring.\n", + dev->vid); + return dev; + } + + dev = numa_realloc(dev, vq_index); + vq = dev->virtqueue[vq_index]; + addr = &vq->ring_addrs; + + len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; + vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev, + vq, addr->avail_user_addr, &len); + if (vq->avail == 0 || + len != sizeof(struct vring_avail) + + sizeof(uint16_t) * vq->size) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to map avail ring.\n", + dev->vid); + return dev; + } + + len = sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size; + vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev, + vq, addr->used_user_addr, &len); + if (vq->used == 0 || len != sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to map used ring.\n", + dev->vid); + return dev; + } + + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + vq->last_avail_idx = vq->used->idx; + } + + vq->log_guest_addr = addr->log_guest_addr; + + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); + + return dev; +} + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. + */ +static int +vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq; + struct vhost_vring_addr *addr = &msg->payload.addr; + struct virtio_net *dev = *pdev; + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. */ + vq = dev->virtqueue[msg->payload.addr.index]; + + /* + * Rings addresses should not be interpreted as long as the ring is not + * started and enabled + */ + memcpy(&vq->ring_addrs, addr, sizeof(*addr)); + + vring_invalidate(dev, vq); + + if (vq->enabled && (dev->features & + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { + dev = translate_ring_addresses(dev, msg->payload.addr.index); + if (!dev) + return -1; + + *pdev = dev; + } + + return 0; +} + +/* + * The virtio device sends us the available ring last used index. 
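+ * Both last_used_idx and last_avail_idx are initialised from this value.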
+ */ +static int +vhost_user_set_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + dev->virtqueue[msg->payload.state.index]->last_used_idx = + msg->payload.state.num; + dev->virtqueue[msg->payload.state.index]->last_avail_idx = + msg->payload.state.num; + + return 0; +} + +static int +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, + uint64_t host_phys_addr, uint64_t size) +{ + struct guest_page *page, *last_page; + + if (dev->nr_guest_pages == dev->max_guest_pages) { + dev->max_guest_pages *= 2; + dev->guest_pages = realloc(dev->guest_pages, + dev->max_guest_pages * sizeof(*page)); + if (!dev->guest_pages) { + RTE_LOG(ERR, VHOST_CONFIG, "cannot realloc guest_pages\n"); + return -1; + } + } + + if (dev->nr_guest_pages > 0) { + last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; + /* merge if the two pages are continuous */ + if (host_phys_addr == last_page->host_phys_addr + + last_page->size) { + last_page->size += size; + return 0; + } + } + + page = &dev->guest_pages[dev->nr_guest_pages++]; + page->guest_phys_addr = guest_phys_addr; + page->host_phys_addr = host_phys_addr; + page->size = size; + + return 0; +} + +static int +add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, + uint64_t page_size) +{ + uint64_t reg_size = reg->size; + uint64_t host_user_addr = reg->host_user_addr; + uint64_t guest_phys_addr = reg->guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; + + host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr); + size = page_size - (guest_phys_addr & (page_size - 1)); + size = RTE_MIN(size, reg_size); + + if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0) + return -1; + + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + + while (reg_size > 0) { + size = RTE_MIN(reg_size, page_size); + host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t) + host_user_addr); + if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, + size) < 0) + return -1; + + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + } + + return 0; +} + +#ifdef RTE_LIBRTE_VHOST_DEBUG +/* TODO: enable it only in debug mode? 
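+ * (when RTE_LIBRTE_VHOST_DEBUG is not defined, dump_guest_pages() is a no-op macro; see the #else branch below.)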
*/ +static void +dump_guest_pages(struct virtio_net *dev) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + RTE_LOG(INFO, VHOST_CONFIG, + "guest physical page region %u\n" + "\t guest_phys_addr: %" PRIx64 "\n" + "\t host_phys_addr : %" PRIx64 "\n" + "\t size : %" PRIx64 "\n", + i, + page->guest_phys_addr, + page->host_phys_addr, + page->size); + } +} +#else +#define dump_guest_pages(dev) +#endif + +static bool +vhost_memory_changed(struct VhostUserMemory *new, + struct rte_vhost_memory *old) +{ + uint32_t i; + + if (new->nregions != old->nregions) + return true; + + for (i = 0; i < new->nregions; ++i) { + VhostUserMemoryRegion *new_r = &new->regions[i]; + struct rte_vhost_mem_region *old_r = &old->regions[i]; + + if (new_r->guest_phys_addr != old_r->guest_phys_addr) + return true; + if (new_r->memory_size != old_r->size) + return true; + if (new_r->userspace_addr != old_r->guest_user_addr) + return true; + } + + return false; +} + +static int +vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) +{ + struct virtio_net *dev = *pdev; + struct VhostUserMemory memory = pmsg->payload.memory; + struct rte_vhost_mem_region *reg; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + uint32_t i; + int populate; + int fd; + + if (memory.nregions > VHOST_MEMORY_MAX_NREGIONS) { + RTE_LOG(ERR, VHOST_CONFIG, + "too many memory regions (%u)\n", memory.nregions); + return -1; + } + + if (dev->mem && !vhost_memory_changed(&memory, dev->mem)) { + RTE_LOG(INFO, VHOST_CONFIG, + "(%d) memory regions not changed\n", dev->vid); + + for (i = 0; i < memory.nregions; i++) + close(pmsg->fds[i]); + + return 0; + } + + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + /* Flush IOTLB cache as previous HVAs are now invalid */ + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + for (i = 0; i < dev->nr_vring; i++) + vhost_user_iotlb_flush_all(dev->virtqueue[i]); + + dev->nr_guest_pages = 0; + if (!dev->guest_pages) { + dev->max_guest_pages = 8; + dev->guest_pages = malloc(dev->max_guest_pages * + sizeof(struct guest_page)); + if (dev->guest_pages == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory " + "for dev->guest_pages\n", + dev->vid); + return -1; + } + } + + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + + sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); + if (dev->mem == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); + return -1; + } + dev->mem->nregions = memory.nregions; + + for (i = 0; i < memory.nregions; i++) { + fd = pmsg->fds[i]; + reg = &dev->mem->regions[i]; + + reg->guest_phys_addr = memory.regions[i].guest_phys_addr; + reg->guest_user_addr = memory.regions[i].userspace_addr; + reg->size = memory.regions[i].memory_size; + reg->fd = fd; + + mmap_offset = memory.regions[i].mmap_offset; + + /* Check for memory_size + mmap_offset overflow */ + if (mmap_offset >= -reg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap_offset (%#"PRIx64") and memory_size " + "(%#"PRIx64") overflow\n", + mmap_offset, reg->size); + goto err_mmap; + } + + mmap_size = reg->size + mmap_offset; + + /* mmap() without flag of MAP_ANONYMOUS, should be called + * with length argument aligned with hugepagesz at older + * longterm version Linux, like 2.6.32 and 3.2.72, or + * mmap() will fail with EINVAL. 
+ * + * to avoid failure, make sure in caller to keep length + * aligned. + */ + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0; + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | populate, fd, 0); + + if (mmap_addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap region %u failed.\n", i); + goto err_mmap; + } + + reg->mmap_addr = mmap_addr; + reg->mmap_size = mmap_size; + reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + if (dev->dequeue_zero_copy) + if (add_guest_pages(dev, reg, alignment) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "adding guest pages to region %u failed.\n", + i); + goto err_mmap; + } + + RTE_LOG(INFO, VHOST_CONFIG, + "guest memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + i, reg->size, + reg->guest_phys_addr, + reg->guest_user_addr, + reg->host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + } + + for (i = 0; i < dev->nr_vring; i++) { + struct vhost_virtqueue *vq = dev->virtqueue[i]; + + if (vq->desc || vq->avail || vq->used) { + /* + * If the memory table got updated, the ring addresses + * need to be translated again as virtual addresses have + * changed. + */ + vring_invalidate(dev, vq); + + dev = translate_ring_addresses(dev, i); + if (!dev) + return -1; + + *pdev = dev; + } + } + + dump_guest_pages(dev); + + return 0; + +err_mmap: + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + return -1; +} + +static bool +vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + bool rings_ok; + + if (!vq) + return false; + + if (vq_is_packed(dev)) + rings_ok = !!vq->desc_packed; + else + rings_ok = vq->desc && vq->avail && vq->used; + + return rings_ok && + vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; +} + +static int +virtio_is_ready(struct virtio_net *dev) +{ + struct vhost_virtqueue *vq; + uint32_t i; + + if (dev->nr_vring == 0) + return 0; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + if (!vq_is_ready(dev, vq)) + return 0; + } + + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is now ready for processing.\n"); + return 1; +} + +static void +vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = file.fd; +} + +static void +vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + struct virtio_net *dev = *pdev; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd 
= pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + + /* Interpret ring addresses only when ring is started. */ + dev = translate_ring_addresses(dev, file.index); + if (!dev) + return; + + *pdev = dev; + + vq = dev->virtqueue[file.index]; + + /* + * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated, + * the ring starts already enabled. Otherwise, it is enabled via + * the SET_VRING_ENABLE message. + */ + if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) + vq->enabled = 1; + + if (vq->kickfd >= 0) + close(vq->kickfd); + vq->kickfd = file.fd; +} + +static void +free_zmbufs(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + rte_pktmbuf_free(zmbuf->mbuf); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + } + + rte_free(vq->zmbufs); +} + +/* + * when virtio is stopped, qemu will send us the GET_VRING_BASE message. + */ +static int +vhost_user_get_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + /* We have to stop the queue (virtio) if it is running. */ + vhost_destroy_device_notify(dev); + + dev->flags &= ~VIRTIO_DEV_READY; + dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; + + /* Here we are safe to get the last avail index */ + msg->payload.state.num = vq->last_avail_idx; + + RTE_LOG(INFO, VHOST_CONFIG, + "vring base idx:%d file:%d\n", msg->payload.state.index, + msg->payload.state.num); + /* + * Based on current qemu vhost-user implementation, this message is + * sent and only sent in vhost_vring_stop. + * TODO: cleanup the vring, it isn't usable since here. + */ + if (vq->kickfd >= 0) + close(vq->kickfd); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (dev->dequeue_zero_copy) + free_zmbufs(vq); + if (vq_is_packed(dev)) { + rte_free(vq->shadow_used_packed); + vq->shadow_used_packed = NULL; + } else { + rte_free(vq->shadow_used_split); + vq->shadow_used_split = NULL; + } + + rte_free(vq->batch_copy_elems); + vq->batch_copy_elems = NULL; + + return 0; +} + +/* + * when virtio queues are ready to work, qemu will send us to + * enable the virtio queue pair. + */ +static int +vhost_user_set_vring_enable(struct virtio_net *dev, + VhostUserMsg *msg) +{ + int enable = (int)msg->payload.state.num; + int index = (int)msg->payload.state.index; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + + RTE_LOG(INFO, VHOST_CONFIG, + "set queue enable: %d to qp idx: %d\n", + enable, index); + + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->set_vring_state) + vdpa_dev->ops->set_vring_state(dev->vid, index, enable); + + if (dev->notify_ops->vring_state_changed) + dev->notify_ops->vring_state_changed(dev->vid, + index, enable); + + dev->virtqueue[index]->enabled = enable; + + return 0; +} + +static void +vhost_user_get_protocol_features(struct virtio_net *dev, + struct VhostUserMsg *msg) +{ + uint64_t features, protocol_features; + + rte_vhost_driver_get_features(dev->ifname, &features); + rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features); + + /* + * REPLY_ACK protocol feature is only mandatory for now + * for IOMMU feature. 
If IOMMU is explicitly disabled by the + * application, disable also REPLY_ACK feature for older buggy + * Qemu versions (from v2.7.0 to v2.9.0). + */ + if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK); + + msg->payload.u64 = protocol_features; + msg->size = sizeof(msg->payload.u64); +} + +static void +vhost_user_set_protocol_features(struct virtio_net *dev, + uint64_t protocol_features) +{ + if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) + return; + + dev->protocol_features = protocol_features; +} + +static int +vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + + /* Don't allow mmap_offset to point outside the mmap region */ + if (off > size) { + RTE_LOG(ERR, VHOST_CONFIG, + "log offset %#"PRIx64" exceeds log size %#"PRIx64"\n", + off, size); + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* + * mmap from 0 to workaround a hugepage mmap bug: mmap will + * fail when offset is not page size aligned. + */ + addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); + return -1; + } + + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. + */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; + dev->log_size = size; + + return 0; +} + +/* + * An rarp packet is constructed and broadcasted to notify switches about + * the new location of the migrated VM, so that packets from outside will + * not be lost after migration. + * + * However, we don't actually "send" a rarp packet here, instead, we set + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. + */ +static int +vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + uint8_t *mac = (uint8_t *)&msg->payload.u64; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + + RTE_LOG(DEBUG, VHOST_CONFIG, + ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + memcpy(dev->mac.addr_bytes, mac, 6); + + /* + * Set the flag to inject a RARP broadcast packet at + * rte_vhost_dequeue_burst(). + * + * rte_smp_wmb() is for making sure the mac is copied + * before the flag is set. 
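+ * (rte_atomic16_set() below publishes the flag only after the barrier.)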
+ */ + rte_smp_wmb(); + rte_atomic16_set(&dev->broadcast_rarp, 1); + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->migration_done) + vdpa_dev->ops->migration_done(dev->vid); + + return 0; +} + +static int +vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + if (msg->payload.u64 < VIRTIO_MIN_MTU || + msg->payload.u64 > VIRTIO_MAX_MTU) { + RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", + msg->payload.u64); + + return -1; + } + + dev->mtu = msg->payload.u64; + + return 0; +} + +static int +vhost_user_set_req_fd(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Invalid file descriptor for slave channel (%d)\n", + fd); + return -1; + } + + dev->slave_req_fd = fd; + + return 0; +} + +static int +is_vring_iotlb_update(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) +{ + struct vhost_vring_addr *ra; + uint64_t start, end; + + start = imsg->iova; + end = start + imsg->size; + + ra = &vq->ring_addrs; + if (ra->desc_user_addr >= start && ra->desc_user_addr < end) + return 1; + if (ra->avail_user_addr >= start && ra->avail_user_addr < end) + return 1; + if (ra->used_user_addr >= start && ra->used_user_addr < end) + return 1; + + return 0; +} + +static int +is_vring_iotlb_invalidate(struct vhost_virtqueue *vq, + struct vhost_iotlb_msg *imsg) +{ + uint64_t istart, iend, vstart, vend; + + istart = imsg->iova; + iend = istart + imsg->size - 1; + + vstart = (uintptr_t)vq->desc; + vend = vstart + sizeof(struct vring_desc) * vq->size - 1; + if (vstart <= iend && istart <= vend) + return 1; + + vstart = (uintptr_t)vq->avail; + vend = vstart + sizeof(struct vring_avail); + vend += sizeof(uint16_t) * vq->size - 1; + if (vstart <= iend && istart <= vend) + return 1; + + vstart = (uintptr_t)vq->used; + vend = vstart + sizeof(struct vring_used); + vend += sizeof(struct vring_used_elem) * vq->size - 1; + if (vstart <= iend && istart <= vend) + return 1; + + return 0; +} + +static int +vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg) +{ + struct virtio_net *dev = *pdev; + struct vhost_iotlb_msg *imsg = &msg->payload.iotlb; + uint16_t i; + uint64_t vva, len; + + switch (imsg->type) { + case VHOST_IOTLB_UPDATE: + len = imsg->size; + vva = qva_to_vva(dev, imsg->uaddr, &len); + if (!vva) + return -1; + + for (i = 0; i < dev->nr_vring; i++) { + struct vhost_virtqueue *vq = dev->virtqueue[i]; + + vhost_user_iotlb_cache_insert(vq, imsg->iova, vva, + len, imsg->perm); + + if (is_vring_iotlb_update(vq, imsg)) + *pdev = dev = translate_ring_addresses(dev, i); + } + break; + case VHOST_IOTLB_INVALIDATE: + for (i = 0; i < dev->nr_vring; i++) { + struct vhost_virtqueue *vq = dev->virtqueue[i]; + + vhost_user_iotlb_cache_remove(vq, imsg->iova, + imsg->size); + + if (is_vring_iotlb_invalidate(vq, imsg)) + vring_invalidate(dev, vq); + } + break; + default: + RTE_LOG(ERR, VHOST_CONFIG, "Invalid IOTLB message type (%d)\n", + imsg->type); + return -1; + } + + return 0; +} + +/* return bytes# of read on success or negative val on failure. 
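+ * A return value of 0 is treated by the caller as the peer having closed the connection.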
*/ +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg && msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid msg size: %d\n", msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret <= 0) + return ret; + if (ret != (int)msg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg, int *fds, int fd_num) +{ + if (!msg) + return 0; + + return send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, fds, fd_num); +} + +static int +send_vhost_reply(int sockfd, struct VhostUserMsg *msg) +{ + if (!msg) + return 0; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags &= ~VHOST_USER_NEED_REPLY; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + return send_vhost_message(sockfd, msg, NULL, 0); +} + +static int +send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg, + int *fds, int fd_num) +{ + int ret; + + if (msg->flags & VHOST_USER_NEED_REPLY) + rte_spinlock_lock(&dev->slave_req_lock); + + ret = send_vhost_message(dev->slave_req_fd, msg, fds, fd_num); + if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY)) + rte_spinlock_unlock(&dev->slave_req_lock); + + return ret; +} + +/* + * Allocate a queue pair if it hasn't been allocated yet + */ +static int +vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) +{ + uint16_t vring_idx; + + switch (msg->request.master) { + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + break; + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + vring_idx = msg->payload.state.index; + break; + case VHOST_USER_SET_VRING_ADDR: + vring_idx = msg->payload.addr.index; + break; + default: + return 0; + } + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid vring index: %u\n", vring_idx); + return -1; + } + + if (dev->virtqueue[vring_idx]) + return 0; + + return alloc_vring_queue(dev, vring_idx); +} + +static void +vhost_user_lock_all_queue_pairs(struct virtio_net *dev) +{ + unsigned int i = 0; + unsigned int vq_num = 0; + + while (vq_num < dev->nr_vring) { + struct vhost_virtqueue *vq = dev->virtqueue[i]; + + if (vq) { + rte_spinlock_lock(&vq->access_lock); + vq_num++; + } + i++; + } +} + +static void +vhost_user_unlock_all_queue_pairs(struct virtio_net *dev) +{ + unsigned int i = 0; + unsigned int vq_num = 0; + + while (vq_num < dev->nr_vring) { + struct vhost_virtqueue *vq = dev->virtqueue[i]; + + if (vq) { + rte_spinlock_unlock(&vq->access_lock); + vq_num++; + } + i++; + } +} + +int +vhost_user_msg_handler(int vid, int fd) +{ + struct virtio_net *dev; + struct VhostUserMsg msg; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + int ret; + int unlock_required = 0; + uint32_t skip_master = 0; + + dev = get_device(vid); + if (dev == NULL) + return -1; + + if (!dev->notify_ops) { + dev->notify_ops = vhost_driver_callback_get(dev->ifname); + if (!dev->notify_ops) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get callback ops for driver %s\n", + dev->ifname); + return -1; + } + } + + ret = read_vhost_message(fd, &msg); + if 
(ret <= 0 || msg.request.master >= VHOST_USER_MAX) { + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read message failed\n"); + else if (ret == 0) + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + else + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read incorrect message\n"); + + return -1; + } + + ret = 0; + if (msg.request.master != VHOST_USER_IOTLB_MSG) + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", + vhost_message_str[msg.request.master]); + else + RTE_LOG(DEBUG, VHOST_CONFIG, "read message %s\n", + vhost_message_str[msg.request.master]); + + ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to alloc queue\n"); + return -1; + } + + /* + * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE + * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops + * and device is destroyed. destroy_device waits for queues to be + * inactive, so it is safe. Otherwise taking the access_lock + * would cause a dead lock. + */ + switch (msg.request.master) { + case VHOST_USER_SET_FEATURES: + case VHOST_USER_SET_PROTOCOL_FEATURES: + case VHOST_USER_SET_OWNER: + case VHOST_USER_SET_MEM_TABLE: + case VHOST_USER_SET_LOG_BASE: + case VHOST_USER_SET_LOG_FD: + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_ADDR: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + case VHOST_USER_SET_VRING_ENABLE: + case VHOST_USER_SEND_RARP: + case VHOST_USER_NET_SET_MTU: + case VHOST_USER_SET_SLAVE_REQ_FD: + vhost_user_lock_all_queue_pairs(dev); + unlock_required = 1; + break; + default: + break; + + } + + if (dev->extern_ops.pre_msg_handle) { + uint32_t need_reply; + + ret = (*dev->extern_ops.pre_msg_handle)(dev->vid, + (void *)&msg, &need_reply, &skip_master); + if (ret < 0) + goto skip_to_reply; + + if (need_reply) + send_vhost_reply(fd, &msg); + + if (skip_master) + goto skip_to_post_handle; + } + + switch (msg.request.master) { + case VHOST_USER_GET_FEATURES: + msg.payload.u64 = vhost_user_get_features(dev); + msg.size = sizeof(msg.payload.u64); + send_vhost_reply(fd, &msg); + break; + case VHOST_USER_SET_FEATURES: + ret = vhost_user_set_features(dev, msg.payload.u64); + if (ret) + return -1; + break; + + case VHOST_USER_GET_PROTOCOL_FEATURES: + vhost_user_get_protocol_features(dev, &msg); + send_vhost_reply(fd, &msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + vhost_user_set_protocol_features(dev, msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + vhost_user_set_owner(); + break; + case VHOST_USER_RESET_OWNER: + vhost_user_reset_owner(dev); + break; + + case VHOST_USER_SET_MEM_TABLE: + ret = vhost_user_set_mem_table(&dev, &msg); + break; + + case VHOST_USER_SET_LOG_BASE: + vhost_user_set_log_base(dev, &msg); + + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_reply(fd, &msg); + break; + case VHOST_USER_SET_LOG_FD: + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + vhost_user_set_vring_num(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + vhost_user_set_vring_addr(&dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + vhost_user_set_vring_base(dev, &msg); + break; + + case VHOST_USER_GET_VRING_BASE: + vhost_user_get_vring_base(dev, &msg); + msg.size = sizeof(msg.payload.state); + send_vhost_reply(fd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + vhost_user_set_vring_kick(&dev, &msg); + break; + case 
VHOST_USER_SET_VRING_CALL: + vhost_user_set_vring_call(dev, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + case VHOST_USER_GET_QUEUE_NUM: + msg.payload.u64 = (uint64_t)vhost_user_get_queue_num(dev); + msg.size = sizeof(msg.payload.u64); + send_vhost_reply(fd, &msg); + break; + + case VHOST_USER_SET_VRING_ENABLE: + vhost_user_set_vring_enable(dev, &msg); + break; + case VHOST_USER_SEND_RARP: + vhost_user_send_rarp(dev, &msg); + break; + + case VHOST_USER_NET_SET_MTU: + ret = vhost_user_net_set_mtu(dev, &msg); + break; + + case VHOST_USER_SET_SLAVE_REQ_FD: + ret = vhost_user_set_req_fd(dev, &msg); + break; + + case VHOST_USER_IOTLB_MSG: + ret = vhost_user_iotlb_msg(&dev, &msg); + break; + + default: + ret = -1; + break; + } + +skip_to_post_handle: + if (dev->extern_ops.post_msg_handle) { + uint32_t need_reply; + + ret = (*dev->extern_ops.post_msg_handle)( + dev->vid, (void *)&msg, &need_reply); + if (ret < 0) + goto skip_to_reply; + + if (need_reply) + send_vhost_reply(fd, &msg); + } + +skip_to_reply: + if (unlock_required) + vhost_user_unlock_all_queue_pairs(dev); + + if (msg.flags & VHOST_USER_NEED_REPLY) { + msg.payload.u64 = !!ret; + msg.size = sizeof(msg.payload.u64); + send_vhost_reply(fd, &msg); + } + + if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { + dev->flags |= VIRTIO_DEV_READY; + + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->dequeue_zero_copy) { + RTE_LOG(INFO, VHOST_CONFIG, + "dequeue zero copy is enabled\n"); + } + + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } + + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && virtio_is_ready(dev) && + !(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) && + msg.request.master == VHOST_USER_SET_VRING_ENABLE) { + if (vdpa_dev->ops->dev_conf) + vdpa_dev->ops->dev_conf(dev->vid); + dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; + if (vhost_user_host_notifier_ctrl(dev->vid, true) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "(%d) software relay is used for vDPA, performance may be low.\n", + dev->vid); + } + } + + return 0; +} + +static int process_slave_message_reply(struct virtio_net *dev, + const VhostUserMsg *msg) +{ + VhostUserMsg msg_reply; + int ret; + + if ((msg->flags & VHOST_USER_NEED_REPLY) == 0) + return 0; + + if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0) { + ret = -1; + goto out; + } + + if (msg_reply.request.slave != msg->request.slave) { + RTE_LOG(ERR, VHOST_CONFIG, + "Received unexpected msg type (%u), expected %u\n", + msg_reply.request.slave, msg->request.slave); + ret = -1; + goto out; + } + + ret = msg_reply.payload.u64 ? 
-1 : 0; + +out: + rte_spinlock_unlock(&dev->slave_req_lock); + return ret; +} + +int +vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) +{ + int ret; + struct VhostUserMsg msg = { + .request.slave = VHOST_USER_SLAVE_IOTLB_MSG, + .flags = VHOST_USER_VERSION, + .size = sizeof(msg.payload.iotlb), + .payload.iotlb = { + .iova = iova, + .perm = perm, + .type = VHOST_IOTLB_MISS, + }, + }; + + ret = send_vhost_message(dev->slave_req_fd, &msg, NULL, 0); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to send IOTLB miss message (%d)\n", + ret); + return ret; + } + + return 0; +} + +static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev, + int index, int fd, + uint64_t offset, + uint64_t size) +{ + int *fdp = NULL; + size_t fd_num = 0; + int ret; + struct VhostUserMsg msg = { + .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, + .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, + .size = sizeof(msg.payload.area), + .payload.area = { + .u64 = index & VHOST_USER_VRING_IDX_MASK, + .size = size, + .offset = offset, + }, + }; + + if (fd < 0) + msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; + else { + fdp = &fd; + fd_num = 1; + } + + ret = send_vhost_slave_message(dev, &msg, fdp, fd_num); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to set host notifier (%d)\n", ret); + return ret; + } + + return process_slave_message_reply(dev, &msg); +} + +int vhost_user_host_notifier_ctrl(int vid, bool enable) +{ + struct virtio_net *dev; + struct rte_vdpa_device *vdpa_dev; + int vfio_device_fd, did, ret = 0; + uint64_t offset, size; + unsigned int i; + + dev = get_device(vid); + if (!dev) + return -ENODEV; + + did = dev->vdpa_dev_id; + if (did < 0) + return -EINVAL; + + if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) || + !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))) + return -ENOTSUP; + + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev) + return -ENODEV; + + RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP); + RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP); + + vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid); + if (vfio_device_fd < 0) + return -ENOTSUP; + + if (enable) { + for (i = 0; i < dev->nr_vring; i++) { + if (vdpa_dev->ops->get_notify_area(vid, i, &offset, + &size) < 0) { + ret = -ENOTSUP; + goto disable; + } + + if (vhost_user_slave_set_vring_host_notifier(dev, i, + vfio_device_fd, offset, size) < 0) { + ret = -EFAULT; + goto disable; + } + } + } else { +disable: + for (i = 0; i < dev->nr_vring; i++) { + vhost_user_slave_set_vring_host_notifier(dev, i, -1, + 0, 0); + } + } + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_vhost/vhost_user.h b/src/spdk/dpdk/lib/librte_vhost/vhost_user.h new file mode 100644 index 00000000..42166adf --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/vhost_user.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include +#include + +#include "rte_vhost.h" + +/* refer to hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ + (1ULL << 
VHOST_USER_PROTOCOL_F_RARP) | \ + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ + (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \ + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \ + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \ + (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER)) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_SLAVE_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_CRYPTO_CREATE_SESS = 26, + VHOST_USER_CRYPTO_CLOSE_SESS = 27, + VHOST_USER_MAX = 28 +} VhostUserRequest; + +typedef enum VhostUserSlaveRequest { + VHOST_USER_SLAVE_NONE = 0, + VHOST_USER_SLAVE_IOTLB_MSG = 1, + VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, + VHOST_USER_SLAVE_MAX +} VhostUserSlaveRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +/* Comply with Cryptodev-Linux */ +#define VHOST_USER_CRYPTO_MAX_HMAC_KEY_LENGTH 512 +#define VHOST_USER_CRYPTO_MAX_CIPHER_KEY_LENGTH 64 + +/* Same structure as vhost-user backend session info */ +typedef struct VhostUserCryptoSessionParam { + int64_t session_id; + uint32_t op_code; + uint32_t cipher_algo; + uint32_t cipher_key_len; + uint32_t hash_algo; + uint32_t digest_len; + uint32_t auth_key_len; + uint32_t aad_len; + uint8_t op_type; + uint8_t dir; + uint8_t hash_mode; + uint8_t chaining_dir; + uint8_t *ciphe_key; + uint8_t *auth_key; + uint8_t cipher_key_buf[VHOST_USER_CRYPTO_MAX_CIPHER_KEY_LENGTH]; + uint8_t auth_key_buf[VHOST_USER_CRYPTO_MAX_HMAC_KEY_LENGTH]; +} VhostUserCryptoSessionParam; + +typedef struct VhostUserVringArea { + uint64_t u64; + uint64_t size; + uint64_t offset; +} VhostUserVringArea; + +typedef struct VhostUserMsg { + union { + uint32_t master; /* a VhostUserRequest value */ + uint32_t slave; /* a VhostUserSlaveRequest value*/ + } request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + struct vhost_iotlb_msg iotlb; + VhostUserCryptoSessionParam crypto_session; + VhostUserVringArea area; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we 
support */ +#define VHOST_USER_VERSION 0x1 + + +/* vhost_user.c */ +int vhost_user_msg_handler(int vid, int fd); +int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm); +int vhost_user_host_notifier_ctrl(int vid, bool enable); + +/* socket.c */ +int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); +int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); + +#endif diff --git a/src/spdk/dpdk/lib/librte_vhost/virtio_crypto.h b/src/spdk/dpdk/lib/librte_vhost/virtio_crypto.h new file mode 100644 index 00000000..e3b93573 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/virtio_crypto.h @@ -0,0 +1,422 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD. + */ + +#ifndef _VIRTIO_CRYPTO_H +#define _VIRTIO_CRYPTO_H + +#define VIRTIO_CRYPTO_SERVICE_CIPHER 0 +#define VIRTIO_CRYPTO_SERVICE_HASH 1 +#define VIRTIO_CRYPTO_SERVICE_MAC 2 +#define VIRTIO_CRYPTO_SERVICE_AEAD 3 + +#define VIRTIO_CRYPTO_OPCODE(service, op) (((service) << 8) | (op)) + +struct virtio_crypto_ctrl_header { +#define VIRTIO_CRYPTO_CIPHER_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x02) +#define VIRTIO_CRYPTO_CIPHER_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x03) +#define VIRTIO_CRYPTO_HASH_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x02) +#define VIRTIO_CRYPTO_HASH_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x03) +#define VIRTIO_CRYPTO_MAC_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x02) +#define VIRTIO_CRYPTO_MAC_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x03) +#define VIRTIO_CRYPTO_AEAD_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x02) +#define VIRTIO_CRYPTO_AEAD_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x03) + uint32_t opcode; + uint32_t algo; + uint32_t flag; + /* data virtqueue id */ + uint32_t queue_id; +}; + +struct virtio_crypto_cipher_session_para { +#define VIRTIO_CRYPTO_NO_CIPHER 0 +#define VIRTIO_CRYPTO_CIPHER_ARC4 1 +#define VIRTIO_CRYPTO_CIPHER_AES_ECB 2 +#define VIRTIO_CRYPTO_CIPHER_AES_CBC 3 +#define VIRTIO_CRYPTO_CIPHER_AES_CTR 4 +#define VIRTIO_CRYPTO_CIPHER_DES_ECB 5 +#define VIRTIO_CRYPTO_CIPHER_DES_CBC 6 +#define VIRTIO_CRYPTO_CIPHER_3DES_ECB 7 +#define VIRTIO_CRYPTO_CIPHER_3DES_CBC 8 +#define VIRTIO_CRYPTO_CIPHER_3DES_CTR 9 +#define VIRTIO_CRYPTO_CIPHER_KASUMI_F8 10 +#define VIRTIO_CRYPTO_CIPHER_SNOW3G_UEA2 11 +#define VIRTIO_CRYPTO_CIPHER_AES_F8 12 +#define VIRTIO_CRYPTO_CIPHER_AES_XTS 13 +#define VIRTIO_CRYPTO_CIPHER_ZUC_EEA3 14 + uint32_t algo; + /* length of key */ + uint32_t keylen; + +#define VIRTIO_CRYPTO_OP_ENCRYPT 1 +#define VIRTIO_CRYPTO_OP_DECRYPT 2 + /* encrypt or decrypt */ + uint32_t op; + uint32_t padding; +}; + +struct virtio_crypto_session_input { + /* Device-writable part */ + uint64_t session_id; + uint32_t status; + uint32_t padding; +}; + +struct virtio_crypto_cipher_session_req { + struct virtio_crypto_cipher_session_para para; + uint8_t padding[32]; +}; + +struct virtio_crypto_hash_session_para { +#define VIRTIO_CRYPTO_NO_HASH 0 +#define VIRTIO_CRYPTO_HASH_MD5 1 +#define VIRTIO_CRYPTO_HASH_SHA1 2 +#define VIRTIO_CRYPTO_HASH_SHA_224 3 +#define VIRTIO_CRYPTO_HASH_SHA_256 4 +#define VIRTIO_CRYPTO_HASH_SHA_384 5 +#define VIRTIO_CRYPTO_HASH_SHA_512 6 +#define VIRTIO_CRYPTO_HASH_SHA3_224 7 +#define VIRTIO_CRYPTO_HASH_SHA3_256 8 +#define VIRTIO_CRYPTO_HASH_SHA3_384 9 +#define 
VIRTIO_CRYPTO_HASH_SHA3_512 10 +#define VIRTIO_CRYPTO_HASH_SHA3_SHAKE128 11 +#define VIRTIO_CRYPTO_HASH_SHA3_SHAKE256 12 + uint32_t algo; + /* hash result length */ + uint32_t hash_result_len; + uint8_t padding[8]; +}; + +struct virtio_crypto_hash_create_session_req { + struct virtio_crypto_hash_session_para para; + uint8_t padding[40]; +}; + +struct virtio_crypto_mac_session_para { +#define VIRTIO_CRYPTO_NO_MAC 0 +#define VIRTIO_CRYPTO_MAC_HMAC_MD5 1 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA1 2 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA_224 3 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA_256 4 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA_384 5 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA_512 6 +#define VIRTIO_CRYPTO_MAC_CMAC_3DES 25 +#define VIRTIO_CRYPTO_MAC_CMAC_AES 26 +#define VIRTIO_CRYPTO_MAC_KASUMI_F9 27 +#define VIRTIO_CRYPTO_MAC_SNOW3G_UIA2 28 +#define VIRTIO_CRYPTO_MAC_GMAC_AES 41 +#define VIRTIO_CRYPTO_MAC_GMAC_TWOFISH 42 +#define VIRTIO_CRYPTO_MAC_CBCMAC_AES 49 +#define VIRTIO_CRYPTO_MAC_CBCMAC_KASUMI_F9 50 +#define VIRTIO_CRYPTO_MAC_XCBC_AES 53 + uint32_t algo; + /* hash result length */ + uint32_t hash_result_len; + /* length of authenticated key */ + uint32_t auth_key_len; + uint32_t padding; +}; + +struct virtio_crypto_mac_create_session_req { + struct virtio_crypto_mac_session_para para; + uint8_t padding[40]; +}; + +struct virtio_crypto_aead_session_para { +#define VIRTIO_CRYPTO_NO_AEAD 0 +#define VIRTIO_CRYPTO_AEAD_GCM 1 +#define VIRTIO_CRYPTO_AEAD_CCM 2 +#define VIRTIO_CRYPTO_AEAD_CHACHA20_POLY1305 3 + uint32_t algo; + /* length of key */ + uint32_t key_len; + /* hash result length */ + uint32_t hash_result_len; + /* length of the additional authenticated data (AAD) in bytes */ + uint32_t aad_len; + /* encrypt or decrypt, See above VIRTIO_CRYPTO_OP_* */ + uint32_t op; + uint32_t padding; +}; + +struct virtio_crypto_aead_create_session_req { + struct virtio_crypto_aead_session_para para; + uint8_t padding[32]; +}; + +struct virtio_crypto_alg_chain_session_para { +#define VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER 1 +#define VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH 2 + uint32_t alg_chain_order; +/* Plain hash */ +#define VIRTIO_CRYPTO_SYM_HASH_MODE_PLAIN 1 +/* Authenticated hash (mac) */ +#define VIRTIO_CRYPTO_SYM_HASH_MODE_AUTH 2 +/* Nested hash */ +#define VIRTIO_CRYPTO_SYM_HASH_MODE_NESTED 3 + uint32_t hash_mode; + struct virtio_crypto_cipher_session_para cipher_param; + union { + struct virtio_crypto_hash_session_para hash_param; + struct virtio_crypto_mac_session_para mac_param; + uint8_t padding[16]; + } u; + /* length of the additional authenticated data (AAD) in bytes */ + uint32_t aad_len; + uint32_t padding; +}; + +struct virtio_crypto_alg_chain_session_req { + struct virtio_crypto_alg_chain_session_para para; +}; + +struct virtio_crypto_sym_create_session_req { + union { + struct virtio_crypto_cipher_session_req cipher; + struct virtio_crypto_alg_chain_session_req chain; + uint8_t padding[48]; + } u; + + /* Device-readable part */ + +/* No operation */ +#define VIRTIO_CRYPTO_SYM_OP_NONE 0 +/* Cipher only operation on the data */ +#define VIRTIO_CRYPTO_SYM_OP_CIPHER 1 +/* + * Chain any cipher with any hash or mac operation. 
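For orientation, the control opcodes in this header are composed from a service id and an operation number by the VIRTIO_CRYPTO_OPCODE() macro shown earlier. The static asserts below are only an illustration of the resulting values and are not part of the upstream file:

#include <assert.h>
#include "virtio_crypto.h"

/* (VIRTIO_CRYPTO_SERVICE_CIPHER << 8) | 0x02  ->  0x0002 */
static_assert(VIRTIO_CRYPTO_CIPHER_CREATE_SESSION == 0x0002, "cipher create-session opcode");
/* (VIRTIO_CRYPTO_SERVICE_AEAD << 8) | 0x03    ->  0x0303 */
static_assert(VIRTIO_CRYPTO_AEAD_DESTROY_SESSION == 0x0303, "aead destroy-session opcode");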
The order + * depends on the value of alg_chain_order param + */ +#define VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING 2 + uint32_t op_type; + uint32_t padding; +}; + +struct virtio_crypto_destroy_session_req { + /* Device-readable part */ + uint64_t session_id; + uint8_t padding[48]; +}; + +/* The request of the control virtqueue's packet */ +struct virtio_crypto_op_ctrl_req { + struct virtio_crypto_ctrl_header header; + + union { + struct virtio_crypto_sym_create_session_req + sym_create_session; + struct virtio_crypto_hash_create_session_req + hash_create_session; + struct virtio_crypto_mac_create_session_req + mac_create_session; + struct virtio_crypto_aead_create_session_req + aead_create_session; + struct virtio_crypto_destroy_session_req + destroy_session; + uint8_t padding[56]; + } u; +}; + +struct virtio_crypto_op_header { +#define VIRTIO_CRYPTO_CIPHER_ENCRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x00) +#define VIRTIO_CRYPTO_CIPHER_DECRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x01) +#define VIRTIO_CRYPTO_HASH \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x00) +#define VIRTIO_CRYPTO_MAC \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x00) +#define VIRTIO_CRYPTO_AEAD_ENCRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x00) +#define VIRTIO_CRYPTO_AEAD_DECRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x01) + uint32_t opcode; + /* algo should be service-specific algorithms */ + uint32_t algo; + /* session_id should be service-specific algorithms */ + uint64_t session_id; + /* control flag to control the request */ + uint32_t flag; + uint32_t padding; +}; + +struct virtio_crypto_cipher_para { + /* + * Byte Length of valid IV/Counter + * + * For block ciphers in CBC or F8 mode, or for Kasumi in F8 mode, or for + * SNOW3G in UEA2 mode, this is the length of the IV (which + * must be the same as the block length of the cipher). + * For block ciphers in CTR mode, this is the length of the counter + * (which must be the same as the block length of the cipher). + * For AES-XTS, this is the 128bit tweak, i, from IEEE Std 1619-2007. + * + * The IV/Counter will be updated after every partial cryptographic + * operation. + */ + uint32_t iv_len; + /* length of source data */ + uint32_t src_data_len; + /* length of dst data */ + uint32_t dst_data_len; + uint32_t padding; +}; + +struct virtio_crypto_hash_para { + /* length of source data */ + uint32_t src_data_len; + /* hash result length */ + uint32_t hash_result_len; +}; + +struct virtio_crypto_mac_para { + struct virtio_crypto_hash_para hash; +}; + +struct virtio_crypto_aead_para { + /* + * Byte Length of valid IV data pointed to by the below iv_addr + * parameter. + * + * For GCM mode, this is either 12 (for 96-bit IVs) or 16, in which + * case iv_addr points to J0. + * For CCM mode, this is the length of the nonce, which can be in the + * range 7 to 13 inclusive. 
+ */ + uint32_t iv_len; + /* length of additional auth data */ + uint32_t aad_len; + /* length of source data */ + uint32_t src_data_len; + /* length of dst data */ + uint32_t dst_data_len; +}; + +struct virtio_crypto_cipher_data_req { + /* Device-readable part */ + struct virtio_crypto_cipher_para para; + uint8_t padding[24]; +}; + +struct virtio_crypto_hash_data_req { + /* Device-readable part */ + struct virtio_crypto_hash_para para; + uint8_t padding[40]; +}; + +struct virtio_crypto_mac_data_req { + /* Device-readable part */ + struct virtio_crypto_mac_para para; + uint8_t padding[40]; +}; + +struct virtio_crypto_alg_chain_data_para { + uint32_t iv_len; + /* Length of source data */ + uint32_t src_data_len; + /* Length of destination data */ + uint32_t dst_data_len; + /* Starting point for cipher processing in source data */ + uint32_t cipher_start_src_offset; + /* Length of the source data that the cipher will be computed on */ + uint32_t len_to_cipher; + /* Starting point for hash processing in source data */ + uint32_t hash_start_src_offset; + /* Length of the source data that the hash will be computed on */ + uint32_t len_to_hash; + /* Length of the additional auth data */ + uint32_t aad_len; + /* Length of the hash result */ + uint32_t hash_result_len; + uint32_t reserved; +}; + +struct virtio_crypto_alg_chain_data_req { + /* Device-readable part */ + struct virtio_crypto_alg_chain_data_para para; +}; + +struct virtio_crypto_sym_data_req { + union { + struct virtio_crypto_cipher_data_req cipher; + struct virtio_crypto_alg_chain_data_req chain; + uint8_t padding[40]; + } u; + + /* See above VIRTIO_CRYPTO_SYM_OP_* */ + uint32_t op_type; + uint32_t padding; +}; + +struct virtio_crypto_aead_data_req { + /* Device-readable part */ + struct virtio_crypto_aead_para para; + uint8_t padding[32]; +}; + +/* The request of the data virtqueue's packet */ +struct virtio_crypto_op_data_req { + struct virtio_crypto_op_header header; + + union { + struct virtio_crypto_sym_data_req sym_req; + struct virtio_crypto_hash_data_req hash_req; + struct virtio_crypto_mac_data_req mac_req; + struct virtio_crypto_aead_data_req aead_req; + uint8_t padding[48]; + } u; +}; + +#define VIRTIO_CRYPTO_OK 0 +#define VIRTIO_CRYPTO_ERR 1 +#define VIRTIO_CRYPTO_BADMSG 2 +#define VIRTIO_CRYPTO_NOTSUPP 3 +#define VIRTIO_CRYPTO_INVSESS 4 /* Invalid session id */ + +/* The accelerator hardware is ready */ +#define VIRTIO_CRYPTO_S_HW_READY (1 << 0) + +struct virtio_crypto_config { + /* See VIRTIO_CRYPTO_OP_* above */ + uint32_t status; + + /* + * Maximum number of data queue + */ + uint32_t max_dataqueues; + + /* + * Specifies the services mask which the device support, + * see VIRTIO_CRYPTO_SERVICE_* above + */ + uint32_t crypto_services; + + /* Detailed algorithms mask */ + uint32_t cipher_algo_l; + uint32_t cipher_algo_h; + uint32_t hash_algo; + uint32_t mac_algo_l; + uint32_t mac_algo_h; + uint32_t aead_algo; + /* Maximum length of cipher key */ + uint32_t max_cipher_key_len; + /* Maximum length of authenticated key */ + uint32_t max_auth_key_len; + uint32_t reserve; + /* Maximum size of each crypto request's content */ + uint64_t max_size; +}; + +struct virtio_crypto_inhdr { + /* See VIRTIO_CRYPTO_* above */ + uint8_t status; +}; +#endif /* _VIRTIO_CRYPTO_H */ diff --git a/src/spdk/dpdk/lib/librte_vhost/virtio_net.c b/src/spdk/dpdk/lib/librte_vhost/virtio_net.c new file mode 100644 index 00000000..99c7afc8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_vhost/virtio_net.c @@ -0,0 +1,1657 @@ +/* SPDX-License-Identifier: 
BSD-3-Clause + * Copyright(c) 2010-2016 Intel Corporation + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iotlb.h" +#include "vhost.h" + +#define MAX_PKT_BURST 32 + +#define MAX_BATCH_LEN 256 + +static __rte_always_inline bool +rxvq_is_mergeable(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); +} + +static bool +is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) +{ + return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring; +} + +static __rte_always_inline void * +alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t desc_addr, uint64_t desc_len) +{ + void *idesc; + uint64_t src, dst; + uint64_t len, remain = desc_len; + + idesc = rte_malloc(__func__, desc_len, 0); + if (unlikely(!idesc)) + return 0; + + dst = (uint64_t)(uintptr_t)idesc; + + while (remain) { + len = remain; + src = vhost_iova_to_vva(dev, vq, desc_addr, &len, + VHOST_ACCESS_RO); + if (unlikely(!src || !len)) { + rte_free(idesc); + return 0; + } + + rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len); + + remain -= len; + dst += len; + desc_addr += len; + } + + return idesc; +} + +static __rte_always_inline void +free_ind_table(void *idesc) +{ + rte_free(idesc); +} + +static __rte_always_inline void +do_flush_shadow_used_ring_split(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint16_t to, uint16_t from, uint16_t size) +{ + rte_memcpy(&vq->used->ring[to], + &vq->shadow_used_split[from], + size * sizeof(struct vring_used_elem)); + vhost_log_cache_used_vring(dev, vq, + offsetof(struct vring_used, ring[to]), + size * sizeof(struct vring_used_elem)); +} + +static __rte_always_inline void +flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint16_t used_idx = vq->last_used_idx & (vq->size - 1); + + if (used_idx + vq->shadow_used_idx <= vq->size) { + do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, + vq->shadow_used_idx); + } else { + uint16_t size; + + /* update used ring interval [used_idx, vq->size] */ + size = vq->size - used_idx; + do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size); + + /* update the left half used ring interval [0, left_size] */ + do_flush_shadow_used_ring_split(dev, vq, 0, size, + vq->shadow_used_idx - size); + } + vq->last_used_idx += vq->shadow_used_idx; + + rte_smp_wmb(); + + vhost_log_cache_sync(dev, vq); + + *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + vq->shadow_used_idx = 0; + vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), + sizeof(vq->used->idx)); +} + +static __rte_always_inline void +update_shadow_used_ring_split(struct vhost_virtqueue *vq, + uint16_t desc_idx, uint16_t len) +{ + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_split[i].id = desc_idx; + vq->shadow_used_split[i].len = len; +} + +static __rte_always_inline void +flush_shadow_used_ring_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int i; + uint16_t used_idx = vq->last_used_idx; + + /* Split loop in two to save memory barriers */ + for (i = 0; i < vq->shadow_used_idx; i++) { + vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; + vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; + + used_idx += vq->shadow_used_packed[i].count; + if (used_idx >= vq->size) + used_idx -= vq->size; + } + + rte_smp_wmb(); + + for (i = 0; i < vq->shadow_used_idx; i++) { + uint16_t flags; + + if 
(vq->shadow_used_packed[i].len) + flags = VRING_DESC_F_WRITE; + else + flags = 0; + + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + vq->desc_packed[vq->last_used_idx].flags = flags; + + vhost_log_cache_used_vring(dev, vq, + vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->last_used_idx += vq->shadow_used_packed[i].count; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + } + + rte_smp_wmb(); + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + +static __rte_always_inline void +update_shadow_used_ring_packed(struct vhost_virtqueue *vq, + uint16_t desc_idx, uint16_t len, uint16_t count) +{ + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_packed[i].id = desc_idx; + vq->shadow_used_packed[i].len = len; + vq->shadow_used_packed[i].count = count; +} + +static inline void +do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + struct batch_copy_elem *elem = vq->batch_copy_elems; + uint16_t count = vq->batch_copy_nb_elems; + int i; + + for (i = 0; i < count; i++) { + rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); + vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len); + PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0); + } + + vq->batch_copy_nb_elems = 0; +} + +static inline void +do_data_copy_dequeue(struct vhost_virtqueue *vq) +{ + struct batch_copy_elem *elem = vq->batch_copy_elems; + uint16_t count = vq->batch_copy_nb_elems; + int i; + + for (i = 0; i < count; i++) + rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); + + vq->batch_copy_nb_elems = 0; +} + +/* avoid write operation when necessary, to lessen cache issues */ +#define ASSIGN_UNLESS_EQUAL(var, val) do { \ + if ((var) != (val)) \ + (var) = (val); \ +} while (0) + +static __rte_always_inline void +virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) +{ + uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK; + + if (m_buf->ol_flags & PKT_TX_TCP_SEG) + csum_l4 |= PKT_TX_TCP_CKSUM; + + if (csum_l4) { + net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len; + + switch (csum_l4) { + case PKT_TX_TCP_CKSUM: + net_hdr->csum_offset = (offsetof(struct tcp_hdr, + cksum)); + break; + case PKT_TX_UDP_CKSUM: + net_hdr->csum_offset = (offsetof(struct udp_hdr, + dgram_cksum)); + break; + case PKT_TX_SCTP_CKSUM: + net_hdr->csum_offset = (offsetof(struct sctp_hdr, + cksum)); + break; + } + } else { + ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0); + ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0); + ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0); + } + + /* IP cksum verification cannot be bypassed, then calculate here */ + if (m_buf->ol_flags & PKT_TX_IP_CKSUM) { + struct ipv4_hdr *ipv4_hdr; + + ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *, + m_buf->l2_len); + ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr); + } + + if (m_buf->ol_flags & PKT_TX_TCP_SEG) { + if (m_buf->ol_flags & PKT_TX_IPV4) + net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else + net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + net_hdr->gso_size = m_buf->tso_segsz; + net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len + + m_buf->l4_len; + } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) { + net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; + net_hdr->gso_size = m_buf->tso_segsz; + net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len + 
+ m_buf->l4_len; + } else { + ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0); + ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0); + ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0); + } +} + +static __rte_always_inline int +map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct buf_vector *buf_vec, uint16_t *vec_idx, + uint64_t desc_iova, uint64_t desc_len, uint8_t perm) +{ + uint16_t vec_id = *vec_idx; + + while (desc_len) { + uint64_t desc_addr; + uint64_t desc_chunck_len = desc_len; + + if (unlikely(vec_id >= BUF_VECTOR_MAX)) + return -1; + + desc_addr = vhost_iova_to_vva(dev, vq, + desc_iova, + &desc_chunck_len, + perm); + if (unlikely(!desc_addr)) + return -1; + + buf_vec[vec_id].buf_iova = desc_iova; + buf_vec[vec_id].buf_addr = desc_addr; + buf_vec[vec_id].buf_len = desc_chunck_len; + + desc_len -= desc_chunck_len; + desc_iova += desc_chunck_len; + vec_id++; + } + *vec_idx = vec_id; + + return 0; +} + +static __rte_always_inline int +fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t avail_idx, uint16_t *vec_idx, + struct buf_vector *buf_vec, uint16_t *desc_chain_head, + uint16_t *desc_chain_len, uint8_t perm) +{ + uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; + uint16_t vec_id = *vec_idx; + uint32_t len = 0; + uint64_t dlen; + struct vring_desc *descs = vq->desc; + struct vring_desc *idesc = NULL; + + *desc_chain_head = idx; + + if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) { + dlen = vq->desc[idx].len; + descs = (struct vring_desc *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->desc[idx].addr, + &dlen, + VHOST_ACCESS_RO); + if (unlikely(!descs)) + return -1; + + if (unlikely(dlen < vq->desc[idx].len)) { + /* + * The indirect desc table is not contiguous + * in process VA space, we have to copy it. + */ + idesc = alloc_copy_ind_table(dev, vq, + vq->desc[idx].addr, vq->desc[idx].len); + if (unlikely(!idesc)) + return -1; + + descs = idesc; + } + + idx = 0; + } + + while (1) { + if (unlikely(idx >= vq->size)) { + free_ind_table(idesc); + return -1; + } + + len += descs[idx].len; + + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[idx].addr, descs[idx].len, + perm))) { + free_ind_table(idesc); + return -1; + } + + if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0) + break; + + idx = descs[idx].next; + } + + *desc_chain_len = len; + *vec_idx = vec_id; + + if (unlikely(!!idesc)) + free_ind_table(idesc); + + return 0; +} + +/* + * Returns -1 on fail, 0 on success + */ +static inline int +reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t size, struct buf_vector *buf_vec, + uint16_t *num_buffers, uint16_t avail_head, + uint16_t *nr_vec) +{ + uint16_t cur_idx; + uint16_t vec_idx = 0; + uint16_t max_tries, tries = 0; + + uint16_t head_idx = 0; + uint16_t len = 0; + + *num_buffers = 0; + cur_idx = vq->last_avail_idx; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + while (size > 0) { + if (unlikely(cur_idx == avail_head)) + return -1; + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. 
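The offload translation in virtio_enqueue_offload() comes down to the virtio-net partial-checksum convention: the receiver must insert the checksum at csum_start + csum_offset. A minimal sketch of the two values stored for a PKT_TX_TCP_CKSUM mbuf (the 14- and 20-byte figures assume an untagged IPv4 frame):

#include <stddef.h>
#include <rte_mbuf.h>
#include <rte_tcp.h>

/* mirrors what virtio_enqueue_offload() writes into the virtio-net header */
static void
tcp_csum_hint(const struct rte_mbuf *m, uint16_t *csum_start, uint16_t *csum_offset)
{
    *csum_start  = m->l2_len + m->l3_len;             /* e.g. 14 + 20 = 34 */
    *csum_offset = offsetof(struct tcp_hdr, cksum);   /* 16 */
}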
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_split(dev, vq, cur_idx, + &vec_idx, buf_vec, + &head_idx, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + len = RTE_MIN(len, size); + update_shadow_used_ring_split(vq, head_idx, len); + size -= len; + + cur_idx++; + *num_buffers += 1; + } + + *nr_vec = vec_idx; + + return 0; +} + +static __rte_always_inline int +fill_vec_buf_packed_indirect(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct vring_packed_desc *desc, uint16_t *vec_idx, + struct buf_vector *buf_vec, uint16_t *len, uint8_t perm) +{ + uint16_t i; + uint32_t nr_descs; + uint16_t vec_id = *vec_idx; + uint64_t dlen; + struct vring_packed_desc *descs, *idescs = NULL; + + dlen = desc->len; + descs = (struct vring_packed_desc *)(uintptr_t) + vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO); + if (unlikely(!descs)) + return -1; + + if (unlikely(dlen < desc->len)) { + /* + * The indirect desc table is not contiguous + * in process VA space, we have to copy it. + */ + idescs = alloc_copy_ind_table(dev, vq, desc->addr, desc->len); + if (unlikely(!idescs)) + return -1; + + descs = idescs; + } + + nr_descs = desc->len / sizeof(struct vring_packed_desc); + if (unlikely(nr_descs >= vq->size)) { + free_ind_table(idescs); + return -1; + } + + for (i = 0; i < nr_descs; i++) { + if (unlikely(vec_id >= BUF_VECTOR_MAX)) { + free_ind_table(idescs); + return -1; + } + + *len += descs[i].len; + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[i].addr, descs[i].len, + perm))) + return -1; + } + *vec_idx = vec_id; + + if (unlikely(!!idescs)) + free_ind_table(idescs); + + return 0; +} + +static __rte_always_inline int +fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint16_t avail_idx, uint16_t *desc_count, + struct buf_vector *buf_vec, uint16_t *vec_idx, + uint16_t *buf_id, uint16_t *len, uint8_t perm) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t vec_id = *vec_idx; + + if (avail_idx < vq->last_avail_idx) + wrap_counter ^= 1; + + if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter))) + return -1; + + *desc_count = 0; + + while (1) { + if (unlikely(vec_id >= BUF_VECTOR_MAX)) + return -1; + + *desc_count += 1; + *buf_id = descs[avail_idx].id; + + if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) { + if (unlikely(fill_vec_buf_packed_indirect(dev, vq, + &descs[avail_idx], + &vec_id, buf_vec, + len, perm) < 0)) + return -1; + } else { + *len += descs[avail_idx].len; + + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[avail_idx].addr, + descs[avail_idx].len, + perm))) + return -1; + } + + if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0) + break; + + if (++avail_idx >= vq->size) { + avail_idx -= vq->size; + wrap_counter ^= 1; + } + } + + *vec_idx = vec_id; + + return 0; +} + +/* + * Returns -1 on fail, 0 on success + */ +static inline int +reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t size, struct buf_vector *buf_vec, + uint16_t *nr_vec, uint16_t *num_buffers, + uint16_t *nr_descs) +{ + uint16_t avail_idx; + uint16_t vec_idx = 0; + uint16_t max_tries, tries = 0; + + uint16_t buf_id = 0; + uint16_t len = 0; + uint16_t desc_count; + + *num_buffers = 0; + avail_idx = vq->last_avail_idx; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + while (size > 0) { + /* + * if we tried all available ring items, and still + * can't get enough buf, it means 
something abnormal + * happened. + */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &vec_idx, + &buf_id, &len, + VHOST_ACCESS_RO) < 0)) + return -1; + + len = RTE_MIN(len, size); + update_shadow_used_ring_packed(vq, buf_id, len, desc_count); + size -= len; + + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + + *nr_descs += desc_count; + *num_buffers += 1; + } + + *nr_vec = vec_idx; + + return 0; +} + +static __rte_always_inline int +copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *m, struct buf_vector *buf_vec, + uint16_t nr_vec, uint16_t num_buffers) +{ + uint32_t vec_idx = 0; + uint32_t mbuf_offset, mbuf_avail; + uint32_t buf_offset, buf_avail; + uint64_t buf_addr, buf_iova, buf_len; + uint32_t cpy_len; + uint64_t hdr_addr; + struct rte_mbuf *hdr_mbuf; + struct batch_copy_elem *batch_copy = vq->batch_copy_elems; + struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL; + int error = 0; + + if (unlikely(m == NULL)) { + error = -1; + goto out; + } + + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + if (nr_vec > 1) + rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr); + + if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) { + error = -1; + goto out; + } + + hdr_mbuf = m; + hdr_addr = buf_addr; + if (unlikely(buf_len < dev->vhost_hlen)) + hdr = &tmp_hdr; + else + hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr; + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", + dev->vid, num_buffers); + + if (unlikely(buf_len < dev->vhost_hlen)) { + buf_offset = dev->vhost_hlen - buf_len; + vec_idx++; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + buf_avail = buf_len - buf_offset; + } else { + buf_offset = dev->vhost_hlen; + buf_avail = buf_len - dev->vhost_hlen; + } + + mbuf_avail = rte_pktmbuf_data_len(m); + mbuf_offset = 0; + while (mbuf_avail != 0 || m->next != NULL) { + /* done with current buf, get the next one */ + if (buf_avail == 0) { + vec_idx++; + if (unlikely(vec_idx >= nr_vec)) { + error = -1; + goto out; + } + + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + /* Prefetch next buffer address. 
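Both fill_vec_buf_packed() and flush_shadow_used_ring_packed() hinge on the packed-ring wrap counter. A sketch of the test that desc_is_avail() (defined in vhost.h, not shown in this hunk) performs, assuming the VRING_DESC_F_AVAIL/VRING_DESC_F_USED flag definitions used above: a descriptor is available when its AVAIL bit matches the driver's wrap counter and its USED bit does not.

#include <stdbool.h>
#include <stdint.h>

/* assumes VRING_DESC_F_AVAIL / VRING_DESC_F_USED as used in the code above */
static inline bool
packed_desc_is_avail_sketch(uint16_t flags, bool wrap_counter)
{
    bool avail = !!(flags & VRING_DESC_F_AVAIL);
    bool used  = !!(flags & VRING_DESC_F_USED);

    return avail == wrap_counter && used != wrap_counter;
}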
*/ + if (vec_idx + 1 < nr_vec) + rte_prefetch0((void *)(uintptr_t) + buf_vec[vec_idx + 1].buf_addr); + buf_offset = 0; + buf_avail = buf_len; + } + + /* done with current mbuf, get the next one */ + if (mbuf_avail == 0) { + m = m->next; + + mbuf_offset = 0; + mbuf_avail = rte_pktmbuf_data_len(m); + } + + if (hdr_addr) { + virtio_enqueue_offload(hdr_mbuf, &hdr->hdr); + if (rxvq_is_mergeable(dev)) + ASSIGN_UNLESS_EQUAL(hdr->num_buffers, + num_buffers); + + if (unlikely(hdr == &tmp_hdr)) { + uint64_t len; + uint64_t remain = dev->vhost_hlen; + uint64_t src = (uint64_t)(uintptr_t)hdr, dst; + uint64_t iova = buf_vec[0].buf_iova; + uint16_t hdr_vec_idx = 0; + + while (remain) { + len = RTE_MIN(remain, + buf_vec[hdr_vec_idx].buf_len); + dst = buf_vec[hdr_vec_idx].buf_addr; + rte_memcpy((void *)(uintptr_t)dst, + (void *)(uintptr_t)src, + len); + + PRINT_PACKET(dev, (uintptr_t)dst, + (uint32_t)len, 0); + vhost_log_cache_write(dev, vq, + iova, len); + + remain -= len; + iova += len; + src += len; + hdr_vec_idx++; + } + } else { + PRINT_PACKET(dev, (uintptr_t)hdr_addr, + dev->vhost_hlen, 0); + vhost_log_cache_write(dev, vq, + buf_vec[0].buf_iova, + dev->vhost_hlen); + } + + hdr_addr = 0; + } + + cpy_len = RTE_MIN(buf_avail, mbuf_avail); + + if (likely(cpy_len > MAX_BATCH_LEN || + vq->batch_copy_nb_elems >= vq->size)) { + rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)), + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), + cpy_len); + vhost_log_cache_write(dev, vq, buf_iova + buf_offset, + cpy_len); + PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset), + cpy_len, 0); + } else { + batch_copy[vq->batch_copy_nb_elems].dst = + (void *)((uintptr_t)(buf_addr + buf_offset)); + batch_copy[vq->batch_copy_nb_elems].src = + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset); + batch_copy[vq->batch_copy_nb_elems].log_addr = + buf_iova + buf_offset; + batch_copy[vq->batch_copy_nb_elems].len = cpy_len; + vq->batch_copy_nb_elems++; + } + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + buf_avail -= cpy_len; + buf_offset += cpy_len; + } + +out: + + return error; +} + +static __rte_always_inline uint32_t +virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint16_t num_buffers; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t avail_head; + + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + avail_head = *((volatile uint16_t *)&vq->avail->idx); + + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + uint16_t nr_vec = 0; + + if (unlikely(reserve_avail_buf_split(dev, vq, + pkt_len, buf_vec, &num_buffers, + avail_head, &nr_vec) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + vq->shadow_used_idx -= num_buffers; + break; + } + + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + num_buffers); + + if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], + buf_vec, nr_vec, + num_buffers) < 0) { + vq->shadow_used_idx -= num_buffers; + break; + } + + vq->last_avail_idx += num_buffers; + } + + do_data_copy_enqueue(dev, vq); + + if (likely(vq->shadow_used_idx)) { + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); + } + + return pkt_idx; +} + +static __rte_always_inline uint32_t +virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue 
*vq, + struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint16_t num_buffers; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + uint16_t nr_vec = 0; + uint16_t nr_descs = 0; + + if (unlikely(reserve_avail_buf_packed(dev, vq, + pkt_len, buf_vec, &nr_vec, + &num_buffers, &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + vq->shadow_used_idx -= num_buffers; + break; + } + + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + num_buffers); + + if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], + buf_vec, nr_vec, + num_buffers) < 0) { + vq->shadow_used_idx -= num_buffers; + break; + } + + vq->last_avail_idx += nr_descs; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + } + + do_data_copy_enqueue(dev, vq); + + if (likely(vq->shadow_used_idx)) { + flush_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } + + return pkt_idx; +} + +static __rte_always_inline uint32_t +virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + + rte_spinlock_lock(&vq->access_lock); + + if (unlikely(vq->enabled == 0)) + goto out_access_unlock; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) + if (unlikely(vring_translate(dev, vq) < 0)) + goto out; + + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); + if (count == 0) + goto out; + + if (vq_is_packed(dev)) + count = virtio_dev_rx_packed(dev, vq, pkts, count); + else + count = virtio_dev_rx_split(dev, vq, pkts, count); + +out: + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_unlock(vq); + +out_access_unlock: + rte_spinlock_unlock(&vq->access_lock); + + return count; +} + +uint16_t +rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return 0; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return 0; + } + + return virtio_dev_rx(dev, queue_id, pkts, count); +} + +static inline bool +virtio_net_with_host_offload(struct virtio_net *dev) +{ + if (dev->features & + ((1ULL << VIRTIO_NET_F_CSUM) | + (1ULL << VIRTIO_NET_F_HOST_ECN) | + (1ULL << VIRTIO_NET_F_HOST_TSO4) | + (1ULL << VIRTIO_NET_F_HOST_TSO6) | + (1ULL << VIRTIO_NET_F_HOST_UFO))) + return true; + + return false; +} + +static void +parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr) +{ + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + void *l3_hdr = NULL; + struct ether_hdr *eth_hdr; + uint16_t ethertype; + + eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + + m->l2_len = sizeof(struct ether_hdr); + ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); + + if (ethertype == ETHER_TYPE_VLAN) { + 
struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + + m->l2_len += sizeof(struct vlan_hdr); + ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); + } + + l3_hdr = (char *)eth_hdr + m->l2_len; + + switch (ethertype) { + case ETHER_TYPE_IPv4: + ipv4_hdr = l3_hdr; + *l4_proto = ipv4_hdr->next_proto_id; + m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4; + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV4; + break; + case ETHER_TYPE_IPv6: + ipv6_hdr = l3_hdr; + *l4_proto = ipv6_hdr->proto; + m->l3_len = sizeof(struct ipv6_hdr); + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV6; + break; + default: + m->l3_len = 0; + *l4_proto = 0; + *l4_hdr = NULL; + break; + } +} + +static __rte_always_inline void +vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m) +{ + uint16_t l4_proto = 0; + void *l4_hdr = NULL; + struct tcp_hdr *tcp_hdr = NULL; + + if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE) + return; + + parse_ethernet(m, &l4_proto, &l4_hdr); + if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (hdr->csum_start == (m->l2_len + m->l3_len)) { + switch (hdr->csum_offset) { + case (offsetof(struct tcp_hdr, cksum)): + if (l4_proto == IPPROTO_TCP) + m->ol_flags |= PKT_TX_TCP_CKSUM; + break; + case (offsetof(struct udp_hdr, dgram_cksum)): + if (l4_proto == IPPROTO_UDP) + m->ol_flags |= PKT_TX_UDP_CKSUM; + break; + case (offsetof(struct sctp_hdr, cksum)): + if (l4_proto == IPPROTO_SCTP) + m->ol_flags |= PKT_TX_SCTP_CKSUM; + break; + default: + break; + } + } + } + + if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_TCPV6: + tcp_hdr = l4_hdr; + m->ol_flags |= PKT_TX_TCP_SEG; + m->tso_segsz = hdr->gso_size; + m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2; + break; + case VIRTIO_NET_HDR_GSO_UDP: + m->ol_flags |= PKT_TX_UDP_SEG; + m->tso_segsz = hdr->gso_size; + m->l4_len = sizeof(struct udp_hdr); + break; + default: + RTE_LOG(WARNING, VHOST_DATA, + "unsupported gso type %u.\n", hdr->gso_type); + break; + } + } +} + +static __rte_always_inline void +put_zmbuf(struct zcopy_mbuf *zmbuf) +{ + zmbuf->in_use = 0; +} + +static __rte_always_inline int +copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct buf_vector *buf_vec, uint16_t nr_vec, + struct rte_mbuf *m, struct rte_mempool *mbuf_pool) +{ + uint32_t buf_avail, buf_offset; + uint64_t buf_addr, buf_iova, buf_len; + uint32_t mbuf_avail, mbuf_offset; + uint32_t cpy_len; + struct rte_mbuf *cur = m, *prev = m; + struct virtio_net_hdr tmp_hdr; + struct virtio_net_hdr *hdr = NULL; + /* A counter to avoid desc dead loop chain */ + uint16_t vec_idx = 0; + struct batch_copy_elem *batch_copy = vq->batch_copy_elems; + int error = 0; + + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) { + error = -1; + goto out; + } + + if (likely(nr_vec > 1)) + rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr); + + if (virtio_net_with_host_offload(dev)) { + if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) { + uint64_t len; + uint64_t remain = sizeof(struct virtio_net_hdr); + uint64_t src; + uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr; + uint16_t hdr_vec_idx = 0; + + /* + * No luck, the virtio-net header doesn't fit + * in a contiguous virtual area. 
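On the application side, the enqueue path above is reached through rte_vhost_enqueue_burst(). A hedged usage sketch follows (vid, queue id and the relay helper are assumptions, not part of the library): since enqueue copies packet data into the guest's buffers, the caller keeps ownership of the mbufs and frees them afterwards.

#include <rte_log.h>
#include <rte_mbuf.h>
#include <rte_vhost.h>

static void
relay_to_guest(int vid, uint16_t vhost_qid, struct rte_mbuf **pkts, uint16_t nb)
{
    uint16_t i;
    uint16_t sent = rte_vhost_enqueue_burst(vid, vhost_qid, pkts, nb);

    if (sent < nb)
        RTE_LOG(DEBUG, USER1, "guest ring full, %u packets not enqueued\n",
                nb - sent);

    /* data was copied into the vrings; the mbufs are still ours to release */
    for (i = 0; i < nb; i++)
        rte_pktmbuf_free(pkts[i]);
}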
+ */ + while (remain) { + len = RTE_MIN(remain, + buf_vec[hdr_vec_idx].buf_len); + src = buf_vec[hdr_vec_idx].buf_addr; + rte_memcpy((void *)(uintptr_t)dst, + (void *)(uintptr_t)src, len); + + remain -= len; + dst += len; + hdr_vec_idx++; + } + + hdr = &tmp_hdr; + } else { + hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr); + rte_prefetch0(hdr); + } + } + + /* + * A virtio driver normally uses at least 2 desc buffers + * for Tx: the first for storing the header, and others + * for storing the data. + */ + if (unlikely(buf_len < dev->vhost_hlen)) { + buf_offset = dev->vhost_hlen - buf_len; + vec_idx++; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + buf_avail = buf_len - buf_offset; + } else if (buf_len == dev->vhost_hlen) { + if (unlikely(++vec_idx >= nr_vec)) + goto out; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + buf_offset = 0; + buf_avail = buf_len; + } else { + buf_offset = dev->vhost_hlen; + buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; + } + + rte_prefetch0((void *)(uintptr_t) + (buf_addr + buf_offset)); + + PRINT_PACKET(dev, + (uintptr_t)(buf_addr + buf_offset), + (uint32_t)buf_avail, 0); + + mbuf_offset = 0; + mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; + while (1) { + uint64_t hpa; + + cpy_len = RTE_MIN(buf_avail, mbuf_avail); + + /* + * A desc buf might across two host physical pages that are + * not continuous. In such case (gpa_to_hpa returns 0), data + * will be copied even though zero copy is enabled. + */ + if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev, + buf_iova + buf_offset, cpy_len)))) { + cur->data_len = cpy_len; + cur->data_off = 0; + cur->buf_addr = + (void *)(uintptr_t)(buf_addr + buf_offset); + cur->buf_iova = hpa; + + /* + * In zero copy mode, one mbuf can only reference data + * for one or partial of one desc buff. + */ + mbuf_avail = cpy_len; + } else { + if (likely(cpy_len > MAX_BATCH_LEN || + vq->batch_copy_nb_elems >= vq->size || + (hdr && cur == m))) { + rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, + mbuf_offset), + (void *)((uintptr_t)(buf_addr + + buf_offset)), + cpy_len); + } else { + batch_copy[vq->batch_copy_nb_elems].dst = + rte_pktmbuf_mtod_offset(cur, void *, + mbuf_offset); + batch_copy[vq->batch_copy_nb_elems].src = + (void *)((uintptr_t)(buf_addr + + buf_offset)); + batch_copy[vq->batch_copy_nb_elems].len = + cpy_len; + vq->batch_copy_nb_elems++; + } + } + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + buf_avail -= cpy_len; + buf_offset += cpy_len; + + /* This buf reaches to its end, get the next one */ + if (buf_avail == 0) { + if (++vec_idx >= nr_vec) + break; + + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + /* + * Prefecth desc n + 1 buffer while + * desc n buffer is processed. + */ + if (vec_idx + 1 < nr_vec) + rte_prefetch0((void *)(uintptr_t) + buf_vec[vec_idx + 1].buf_addr); + + buf_offset = 0; + buf_avail = buf_len; + + PRINT_PACKET(dev, (uintptr_t)buf_addr, + (uint32_t)buf_avail, 0); + } + + /* + * This mbuf reaches to its end, get a new one + * to hold more data. 
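The dequeue zero-copy branch above (the gpa_to_hpa() case) is only taken when the application opted in when registering the socket. A sketch of that opt-in, assuming the RTE_VHOST_USER_DEQUEUE_ZERO_COPY flag from rte_vhost.h and a placeholder socket path:

#include <rte_vhost.h>

static int
register_zero_copy_socket(const char *path)
{
    /* path is a placeholder, e.g. "/tmp/vhost-user.sock" */
    return rte_vhost_driver_register(path, RTE_VHOST_USER_DEQUEUE_ZERO_COPY);
}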
+ */ + if (mbuf_avail == 0) { + cur = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(cur == NULL)) { + RTE_LOG(ERR, VHOST_DATA, "Failed to " + "allocate memory for mbuf.\n"); + error = -1; + goto out; + } + if (unlikely(dev->dequeue_zero_copy)) + rte_mbuf_refcnt_update(cur, 1); + + prev->next = cur; + prev->data_len = mbuf_offset; + m->nb_segs += 1; + m->pkt_len += mbuf_offset; + prev = cur; + + mbuf_offset = 0; + mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; + } + } + + prev->data_len = mbuf_offset; + m->pkt_len += mbuf_offset; + + if (hdr) + vhost_dequeue_offload(hdr, m); + +out: + + return error; +} + +static __rte_always_inline struct zcopy_mbuf * +get_zmbuf(struct vhost_virtqueue *vq) +{ + uint16_t i; + uint16_t last; + int tries = 0; + + /* search [last_zmbuf_idx, zmbuf_size) */ + i = vq->last_zmbuf_idx; + last = vq->zmbuf_size; + +again: + for (; i < last; i++) { + if (vq->zmbufs[i].in_use == 0) { + vq->last_zmbuf_idx = i + 1; + vq->zmbufs[i].in_use = 1; + return &vq->zmbufs[i]; + } + } + + tries++; + if (tries == 1) { + /* search [0, last_zmbuf_idx) */ + i = 0; + last = vq->last_zmbuf_idx; + goto again; + } + + return NULL; +} + +static __rte_always_inline bool +mbuf_is_consumed(struct rte_mbuf *m) +{ + while (m) { + if (rte_mbuf_refcnt_read(m) > 1) + return false; + m = m->next; + } + + return true; +} + +static __rte_always_inline void +restore_mbuf(struct rte_mbuf *m) +{ + uint32_t mbuf_size, priv_size; + + while (m) { + priv_size = rte_pktmbuf_priv_size(m->pool); + mbuf_size = sizeof(struct rte_mbuf) + priv_size; + /* start of buffer is after mbuf structure and priv data */ + + m->buf_addr = (char *)m + mbuf_size; + m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size; + m = m->next; + } +} + +static __rte_always_inline uint16_t +virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +{ + uint16_t i; + uint16_t free_entries; + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + if (mbuf_is_consumed(zmbuf->mbuf)) { + update_shadow_used_ring_split(vq, + zmbuf->desc_idx, 0); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } + + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); + } + + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + + free_entries = *((volatile uint16_t *)&vq->avail->idx) - + vq->last_avail_idx; + if (free_entries == 0) + return 0; + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + + count = RTE_MIN(count, MAX_PKT_BURST); + count = RTE_MIN(count, free_entries); + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", + dev->vid, count); + + for (i = 0; i < count; i++) { + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t head_idx, dummy_len; + uint16_t nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_split(dev, vq, + vq->last_avail_idx + i, + &nr_vec, buf_vec, + &head_idx, &dummy_len, + VHOST_ACCESS_RO) < 0)) + break; + + if (likely(dev->dequeue_zero_copy == 0)) + update_shadow_used_ring_split(vq, head_idx, 0); + + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); + + pkts[i] = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(pkts[i] == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + break; + } + + err = 
copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(pkts[i]); + break; + } + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(pkts[i]); + break; + } + zmbuf->mbuf = pkts[i]; + zmbuf->desc_idx = head_idx; + + /* + * Pin lock the mbuf; we will check later to see + * whether the mbuf is freed (when we are the last + * user) or not. If that's the case, we then could + * update the used ring safely. + */ + rte_mbuf_refcnt_update(pkts[i], 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + } + } + vq->last_avail_idx += i; + + if (likely(dev->dequeue_zero_copy == 0)) { + do_data_copy_dequeue(vq); + if (unlikely(i < count)) + vq->shadow_used_idx = i; + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); + } + + return i; +} + +static __rte_always_inline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +{ + uint16_t i; + + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + if (mbuf_is_consumed(zmbuf->mbuf)) { + update_shadow_used_ring_packed(vq, + zmbuf->desc_idx, + 0, + zmbuf->desc_count); + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } + + flush_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + + count = RTE_MIN(count, MAX_PKT_BURST); + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", + dev->vid, count); + + for (i = 0; i < count; i++) { + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t buf_id, dummy_len; + uint16_t desc_count, nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &dummy_len, + VHOST_ACCESS_RW) < 0)) + break; + + if (likely(dev->dequeue_zero_copy == 0)) + update_shadow_used_ring_packed(vq, buf_id, 0, + desc_count); + + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); + + pkts[i] = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(pkts[i] == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + break; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(pkts[i]); + break; + } + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(pkts[i]); + break; + } + zmbuf->mbuf = pkts[i]; + zmbuf->desc_idx = buf_id; + zmbuf->desc_count = desc_count; + + /* + * Pin lock the mbuf; we will check later to see + * whether the mbuf is freed (when we are the last + * user) or not. If that's the case, we then could + * update the used ring safely. 
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	uint16_t i;
+
+	rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
+
+	if (unlikely(dev->dequeue_zero_copy)) {
+		struct zcopy_mbuf *zmbuf, *next;
+
+		for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+		     zmbuf != NULL; zmbuf = next) {
+			next = TAILQ_NEXT(zmbuf, next);
+
+			if (mbuf_is_consumed(zmbuf->mbuf)) {
+				update_shadow_used_ring_packed(vq,
+						zmbuf->desc_idx,
+						0,
+						zmbuf->desc_count);
+
+				TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+				restore_mbuf(zmbuf->mbuf);
+				rte_pktmbuf_free(zmbuf->mbuf);
+				put_zmbuf(zmbuf);
+				vq->nr_zmbuf -= 1;
+			}
+		}
+
+		flush_shadow_used_ring_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+
+	count = RTE_MIN(count, MAX_PKT_BURST);
+	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	for (i = 0; i < count; i++) {
+		struct buf_vector buf_vec[BUF_VECTOR_MAX];
+		uint16_t buf_id, dummy_len;
+		uint16_t desc_count, nr_vec = 0;
+		int err;
+
+		if (unlikely(fill_vec_buf_packed(dev, vq,
+						vq->last_avail_idx, &desc_count,
+						buf_vec, &nr_vec,
+						&buf_id, &dummy_len,
+						VHOST_ACCESS_RW) < 0))
+			break;
+
+		if (likely(dev->dequeue_zero_copy == 0))
+			update_shadow_used_ring_packed(vq, buf_id, 0,
+					desc_count);
+
+		rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
+
+		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+		if (unlikely(pkts[i] == NULL)) {
+			RTE_LOG(ERR, VHOST_DATA,
+				"Failed to allocate memory for mbuf.\n");
+			break;
+		}
+
+		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
+				mbuf_pool);
+		if (unlikely(err)) {
+			rte_pktmbuf_free(pkts[i]);
+			break;
+		}
+
+		if (unlikely(dev->dequeue_zero_copy)) {
+			struct zcopy_mbuf *zmbuf;
+
+			zmbuf = get_zmbuf(vq);
+			if (!zmbuf) {
+				rte_pktmbuf_free(pkts[i]);
+				break;
+			}
+			zmbuf->mbuf = pkts[i];
+			zmbuf->desc_idx = buf_id;
+			zmbuf->desc_count = desc_count;
+
+			/*
+			 * Pin lock the mbuf; we will check later to see
+			 * whether the mbuf is freed (when we are the last
+			 * user) or not. If that's the case, we then could
+			 * update the used ring safely.
+			 */
+			rte_mbuf_refcnt_update(pkts[i], 1);
+
+			vq->nr_zmbuf += 1;
+			TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
+		}
+
+		vq->last_avail_idx += desc_count;
+		if (vq->last_avail_idx >= vq->size) {
+			vq->last_avail_idx -= vq->size;
+			vq->avail_wrap_counter ^= 1;
+		}
+	}
+
+	if (likely(dev->dequeue_zero_copy == 0)) {
+		do_data_copy_dequeue(vq);
+		if (unlikely(i < count))
+			vq->shadow_used_idx = i;
+		flush_shadow_used_ring_packed(dev, vq);
+		vhost_vring_call_packed(dev, vq);
+	}
+
+	return i;
+}
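
The packed-ring variant above keeps last_avail_idx inside [0, size) and flips avail_wrap_counter every time the index wraps, as shown at the bottom of its descriptor loop. Below is a minimal standalone sketch of that bookkeeping; ring_pos and advance() are simplified stand-ins for the fields the real code updates on vhost_virtqueue (last_avail_idx, avail_wrap_counter), not DPDK definitions.

/* Packed-ring index/wrap-counter bookkeeping, modelled in isolation. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ring_pos {
	uint16_t last_avail_idx;	/* position within the ring */
	bool avail_wrap_counter;	/* flips on every wrap */
	uint16_t size;			/* ring size, e.g. 256 */
};

static void
advance(struct ring_pos *r, uint16_t desc_count)
{
	r->last_avail_idx += desc_count;
	if (r->last_avail_idx >= r->size) {
		r->last_avail_idx -= r->size;
		r->avail_wrap_counter = !r->avail_wrap_counter;
	}
}

int
main(void)
{
	struct ring_pos r = {
		.last_avail_idx = 254,
		.avail_wrap_counter = true,
		.size = 256,
	};

	/* a 4-descriptor chain crosses the end of the ring */
	advance(&r, 4);
	printf("idx=%u wrap=%d\n", r.last_avail_idx,
			r.avail_wrap_counter);	/* prints idx=2 wrap=0 */
	return 0;
}
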
+
+uint16_t
+rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+		RTE_LOG(ERR, VHOST_DATA,
+			"(%d) %s: built-in vhost net backend is disabled.\n",
+			dev->vid, __func__);
+		return 0;
+	}
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+		return 0;
+
+	if (unlikely(vq->enabled == 0)) {
+		count = 0;
+		goto out_access_unlock;
+	}
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_lock(vq);
+
+	if (unlikely(vq->access_ok == 0))
+		if (unlikely(vring_translate(dev, vq) < 0)) {
+			count = 0;
+			goto out;
+		}
+
+	/*
+	 * Construct a RARP broadcast packet, and inject it into the "pkts"
+	 * array, so it looks like the guest actually sent such a packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue and
+	 * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
+	 * result in false sharing between enqueue and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing cmpset if the read indicates it is likely to
+	 * be set.
+	 */
+	if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
+			rte_atomic16_cmpset((volatile uint16_t *)
+				&dev->broadcast_rarp.cnt, 1, 0))) {
+
+		rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+		if (rarp_mbuf == NULL) {
+			RTE_LOG(ERR, VHOST_DATA,
+				"Failed to make RARP packet.\n");
+			count = 0;
+			goto out;
+		}
+		count -= 1;
+	}
+
+	if (vq_is_packed(dev))
+		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
+	else
+		count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
+
+out:
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+		vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it at the head of the "pkts" array, so that the
+		 * switch's MAC learning table gets updated first.
+		 */
+		memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		count += 1;
+	}
+
+	return count;
+}
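
For context, a hedged usage sketch of the rte_vhost_dequeue_burst() API defined above, as a forwarding application might call it from its polling loop. The helper and parameter names (drain_guest_tx, vhost_qid, tx_qid, pool, port) are illustrative assumptions, not part of the patch; an initialized EAL, a started ethdev port, and a registered vhost device identified by vid are assumed to exist already.

/* Drain one burst from a guest TX queue and forward it to a NIC port. */
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_vhost.h>

#define BURST_SZ 32

static void
drain_guest_tx(int vid, uint16_t vhost_qid, struct rte_mempool *pool,
		uint16_t port, uint16_t tx_qid)
{
	struct rte_mbuf *pkts[BURST_SZ];
	uint16_t nb_rx, nb_tx, i;

	/* pull up to BURST_SZ packets the guest has queued for transmit */
	nb_rx = rte_vhost_dequeue_burst(vid, vhost_qid, pool, pkts, BURST_SZ);
	if (nb_rx == 0)
		return;

	/* forward them to the wire; free whatever the NIC did not accept */
	nb_tx = rte_eth_tx_burst(port, tx_qid, pkts, nb_rx);
	for (i = nb_tx; i < nb_rx; i++)
		rte_pktmbuf_free(pkts[i]);
}
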
diff --git a/src/spdk/dpdk/lib/meson.build b/src/spdk/dpdk/lib/meson.build
new file mode 100644
index 00000000..eb91f100
--- /dev/null
+++ b/src/spdk/dpdk/lib/meson.build
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 Intel Corporation
+
+
+# process all libraries equally, as far as possible
+# "core" libs first, then others alphabetically as far as possible
+# NOTE: for speed of meson runs, the dependencies in the subdirectories
+# sometimes skip deps that would be implied by others, e.g. if mempool is
+# given as a dep, no need to mention ring. This is especially true for the
+# core libs which are widely reused, so their deps are kept to a minimum.
+libraries = [ 'compat', # just a header, used for versioning
+	'kvargs',
+	'eal', 'ring', 'mempool', 'mbuf', 'net', 'ethdev', 'pci', # core
+	'metrics', # bitrate/latency stats depends on this
+	'hash', # efd depends on this
+	'timer', # eventdev depends on this
+	'acl', 'bbdev', 'bitratestats', 'cfgfile',
+	'cmdline', 'compressdev', 'cryptodev',
+	'distributor', 'efd', 'eventdev',
+	'gro', 'gso', 'ip_frag', 'jobstats',
+	'kni', 'latencystats', 'lpm', 'member',
+	'meter', 'power', 'pdump', 'rawdev',
+	'reorder', 'sched', 'security', 'vhost',
+	# add pkt framework libs which use other libs from above
+	'port', 'table', 'pipeline',
+	# flow_classify lib depends on pkt framework table lib
+	'flow_classify', 'bpf']
+
+default_cflags = machine_args
+if cc.has_argument('-Wno-format-truncation')
+	default_cflags += '-Wno-format-truncation'
+endif
+foreach l:libraries
+	build = true
+	name = l
+	version = 1
+	allow_experimental_apis = false
+	sources = []
+	headers = []
+	includes = []
+	cflags = default_cflags
+	objs = [] # other object files to link against, used e.g. for
+		# instruction-set optimized versions of code
+
+	# use "deps" for internal DPDK dependencies, and "ext_deps" for
+	# external package/library requirements
+	ext_deps = []
+	deps = ['eal'] # eal is standard dependency except for itself
+	if l == 'kvargs'
+		deps = []
+	endif
+	if l == 'eal'
+		deps = ['kvargs']
+	endif
+
+	dir_name = 'librte_' + l
+	subdir(dir_name)
+
+	if build
+		dpdk_conf.set('RTE_LIBRTE_' + name.to_upper(), 1)
+		install_headers(headers)
+
+		libname = 'rte_' + name
+		includes += include_directories(dir_name)
+
+		if sources.length() == 0
+			# if no C files, just set a dependency on header path
+			shared_dep = declare_dependency(include_directories: includes)
+			static_dep = shared_dep
+		else
+			shared_deps = ext_deps
+			static_deps = ext_deps
+			foreach d:deps
+				if not is_variable('shared_rte_' + d)
+					error('Missing dependency ' + d +
+						' for library ' + lib_name)
+				endif
+				shared_deps += [get_variable('shared_rte_' + d)]
+				static_deps += [get_variable('static_rte_' + d)]
+			endforeach
+
+			if allow_experimental_apis
+				cflags += '-DALLOW_EXPERIMENTAL_API'
+			endif
+
+			if get_option('per_library_versions')
+				lib_version = '@0@.1'.format(version)
+				so_version = '@0@'.format(version)
+			else
+				prj_ver = meson.project_version().split('.')
+				lib_version = '@0@.@1@'.format(
+						prj_ver.get(0), prj_ver.get(1))
+				so_version = lib_version
+			endif
+
+			# first build static lib
+			static_lib = static_library(libname,
+					sources,
+					objects: objs,
+					c_args: cflags,
+					dependencies: static_deps,
+					include_directories: includes,
+					install: true)
+			static_dep = declare_dependency(link_with: static_lib,
+					include_directories: includes,
+					dependencies: static_deps)
+
+			# then use pre-built objects to build shared lib
+			sources = []
+			objs += static_lib.extract_all_objects(recursive: false)
+			version_map = '@0@/@1@/rte_@2@_version.map'.format(
+					meson.current_source_dir(), dir_name, name)
+			shared_lib = shared_library(libname,
+					sources,
+					objects: objs,
+					c_args: cflags,
+					dependencies: shared_deps,
+					include_directories: includes,
+					link_args: '-Wl,--version-script=' + version_map,
+					link_depends: version_map,
+					version: lib_version,
+					soversion: so_version,
+					install: true)
+			shared_dep = declare_dependency(link_with: shared_lib,
+					include_directories: includes,
+					dependencies: shared_deps)
+
+			dpdk_libraries = [shared_lib] + dpdk_libraries
+		endif # sources.length() > 0
+
+		set_variable('shared_' + libname, shared_dep)
+		set_variable('static_' + libname, static_dep)
+	endif # if build
+endforeach
-- 
cgit v1.2.3